Python threads are not being garbage collected

Here is my threading setup. On my machine the maximum number of threads is 2047.
class Worker(Thread):
    """Thread executing tasks from a given tasks queue"""
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        while True:
            func, args, kargs = self.tasks.get()
            try:
                func(*args, **kargs)
            except Exception, e:
                print e
            self.tasks.task_done()

class ThreadPool:
    """Pool of threads consuming tasks from a queue"""
    def __init__(self, num_threads):
        self.tasks = Queue(num_threads)
        for _ in range(num_threads):
            Worker(self.tasks)

    def add_task(self, func, *args, **kargs):
        """Add a task to the queue"""
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        """Wait for completion of all the tasks in the queue"""
        self.tasks.join()
In other classes in my module, I call the ThreadPool class from above to
create a new pool of threads. I then perform operations. Here is an example:
def upload_images(self):
    '''batch uploads images to s3 via multi-threading'''
    num_threads = min(500, len(pictures))
    pool = ThreadPool(num_threads)
    for p in pictures:
        pool.add_task(p.get_set_upload_img)
    pool.wait_completion()
The problem I am having is that these threads are not being garbage collected.
After a few runs, here is my error:
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 495, in start
_start_new_thread(self.__bootstrap, ())
thread.error: can't start new thread
Which means I have hit the thread limit of 2047.
Any ideas? Thanks.

Your worker thread never returns from run, so your thread never ends.
Perhaps something like the following for your run method?
def run(self):
    while True:
        try:
            # note: a blocking get() never raises Empty; a timeout (or
            # get_nowait()) is needed for the loop to end once the queue drains
            func, args, kargs = self.tasks.get(timeout=1)
        except Queue.Empty:  # requires "import Queue" (the module), not just the Queue class
            break
        try:
            func(*args, **kargs)
        except Exception, e:
            print e
        self.tasks.task_done()

def run(self):
    while True:
        func, args, kargs = self.tasks.get()
        try:
            func(*args, **kargs)
        except Exception, e:
            print e
        self.tasks.task_done()
This looks like an infinite loop; could that be the reason? All the threads stay alive, so they can never be garbage collected.
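For completeness, here is a minimal sketch of the same pool with an explicit shutdown path; it is not from the question or either answer, and it is written for Python 3. Each worker leaves its run() method when it pulls a sentinel off the queue, so the threads actually finish and can be reclaimed instead of accumulating toward the 2047-thread limit.

import sys
from queue import Queue
from threading import Thread

_SENTINEL = object()  # enqueued once per worker to request shutdown

class Worker(Thread):
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        while True:
            item = self.tasks.get()
            if item is _SENTINEL:
                self.tasks.task_done()
                break                       # leave run(); the thread ends here
            func, args, kwargs = item
            try:
                func(*args, **kwargs)
            except Exception as exc:
                print(exc, file=sys.stderr)
            finally:
                self.tasks.task_done()

class ThreadPool:
    def __init__(self, num_threads):
        self.tasks = Queue(num_threads)
        self.workers = [Worker(self.tasks) for _ in range(num_threads)]

    def add_task(self, func, *args, **kwargs):
        self.tasks.put((func, args, kwargs))

    def wait_completion(self):
        self.tasks.join()

    def shutdown(self):
        """Stop every worker and wait for the threads to exit."""
        for _ in self.workers:
            self.tasks.put(_SENTINEL)
        for w in self.workers:
            w.join()

With this shape, upload_images() would call pool.shutdown() after pool.wait_completion(), so each batch's threads are gone before the next batch starts.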

Related

Python multiprocessing Queue child class losing attributes in process

I am trying to implement a child class of the multiprocessing Queue in Python. The child class contains a simple Boolean flag, "ready". When I send the queue to a new process, the ready attribute disappears. The following code demonstrates the problem:
import multiprocessing
import multiprocessing.queues

class ReadyQueue(multiprocessing.queues.Queue):
    def __init__(self, ctx, *args, **kwargs):
        super(ReadyQueue, self).__init__(ctx=ctx, *args, **kwargs)
        self.ready = False

def ready_queue(*args, **kwargs):
    return ReadyQueue(ctx=multiprocessing.get_context(), *args, **kwargs)

def foo(q):
    print(q.ready)

if __name__ == "__main__":
    my_queue = ready_queue()
    print(my_queue.ready)
    p = multiprocessing.Process(target=foo, args=(my_queue,))
    p.start()
    p.join()
With the output:
False
Process Process-1:
Traceback (most recent call last):
File "C:\Users\acre018\Anaconda3\envs\EIT_Qt\lib\multiprocessing\process.py", line 315, in _bootstrap
self.run()
File "C:\Users\acre018\Anaconda3\envs\EIT_Qt\lib\multiprocessing\process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "C:\Users\acre018\github\EIT_Qt\Experiments\ready_queue_test.py", line 16, in foo
print(q.ready)
AttributeError: 'ReadyQueue' object has no attribute 'ready'
I implemented this workaround:
import multiprocessing
from queue import Empty
import time
import ctypes

class ReadyQueue:
    def __init__(self, *args, **kwargs):
        self.queue = multiprocessing.Queue(*args, **kwargs)
        self._ready = multiprocessing.Value(ctypes.c_bool, False)

    def set_ready(self):
        self._ready.value = True

    def set_not_ready(self):
        self._ready.value = False
        self.clear()

    def is_ready(self):
        return self._ready.value

    def clear(self):
        try:
            while True:
                self.queue.get(block=False)
        except Empty:
            pass

    def get(self, block=True, timeout=None):
        return self.queue.get(block, timeout)

    def put(self, obj, block=True, timeout=None):
        return self.queue.put(obj, block, timeout)

    def full(self):
        return self.queue.full()

    def empty(self):
        return self.queue.empty()

    def qsize(self):
        return self.queue.qsize()

def foo(q):
    while q.is_ready():
        time.sleep(1)
        q.put("hello from foo")
    print("q no longer ready, foo loop finished")

if __name__ == "__main__":
    my_queue = ReadyQueue()
    my_queue.set_ready()
    p = multiprocessing.Process(target=foo, args=(my_queue,))
    p.start()

    for i in range(2):
        print(my_queue.get())
        time.sleep(2)

    print("my_queue._ready = %s, qsize: %d. Setting not ready.." % (str(my_queue.is_ready()), my_queue.qsize()))
    my_queue.set_not_ready()
    print("my_queue._ready = %s, qusize: %d" % (str(my_queue.is_ready()), my_queue.qsize()))
With the output:
C:\Users\acre018\Anaconda3\envs\test_pyqt\python.exe C:/Users/acre018/github/EIT_Qt/Experiments/ready_queue_test2.py
hello from foo
hello from foo
my_queue._ready = True, qsize: 2. Setting not ready..
my_queue._ready = False, qusize: 0
q no longer ready, foo loop finished
Process finished with exit code 0
The workaround is to have my ReadyQueue class not inherit from multiprocessing.queues.Queue, but hold a queue as an attribute instead. For convenience I implemented the methods I need from Queue as simple pass-throughs to that attribute, and I also added a clear() method.
Note that in my first example I neglected to make self.ready a multiprocessing.Value, so I wouldn't have been able to change it across processes anyway, but I tested after fixing that and it was not the source of the issue.
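For what it's worth, the attribute disappears because multiprocessing.queues.Queue pickles itself through __getstate__/__setstate__ when it is sent to the child process, and those methods only carry the queue's own internals. A hedged sketch (my illustration, not part of the question) of a subclass that forwards an extra attribute through that mechanism; note the copy in the child is still per-process, not shared, so the Value-based workaround above remains the right tool for cross-process updates:

import multiprocessing
import multiprocessing.queues

class ReadyQueue(multiprocessing.queues.Queue):
    def __init__(self, ctx, *args, **kwargs):
        super(ReadyQueue, self).__init__(ctx=ctx, *args, **kwargs)
        self.ready = False

    def __getstate__(self):
        # bundle the extra flag with the queue's own pickled state
        return (super().__getstate__(), self.ready)

    def __setstate__(self, state):
        queue_state, self.ready = state
        super().__setstate__(queue_state)

def ready_queue(*args, **kwargs):
    return ReadyQueue(multiprocessing.get_context(), *args, **kwargs)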

Python threading.join() hangs

My problem is as follows:
I have a class that inherits from threading.Thread which I want to be able to stop gracefully. This class also has a Queue it gets its work from.
Since quite a few classes in my project should have this behaviour, I've created some superclasses to reduce duplicate code, like this:
Thread related behaviour:
class StoppableThread(Thread):
    def __init__(self):
        Thread.__init__(self)
        self._stop = Event()

    def stop(self):
        self._stop.set()

    def stopped(self):
        return self._stop.isSet()
Queue related behaviour:
class Queueable():
    def __init__(self):
        self._queue = Queue()

    def append_to_job_queue(self, job):
        self._queue.put(job)
Combining the two above and adding queue.join() to the stop() call:
class StoppableQueueThread(StoppableThread, Queueable):
    def __init__(self):
        StoppableThread.__init__(self)
        Queueable.__init__(self)

    def stop(self):
        super(StoppableQueueThread, self).stop()
        self._queue.join()
A base class for a datasource:
class DataSource(StoppableThread, ABC):
    def __init__(self, data_parser):
        StoppableThread.__init__(self)
        self.setName("DataSource")
        ABC.__init__(self)
        self._data_parser = data_parser

    def run(self):
        while not self.stopped():
            record = self._fetch_data()
            self._data_parser.append_to_job_queue(record)

    @abstractmethod
    def _fetch_data(self):
        """implement logic here for obtaining a data piece
        should return the fetched data"""
An implementation for a datasource:
class CSVDataSource(DataSource):
    def __init__(self, data_parser, file_path):
        DataSource.__init__(self, data_parser)
        self.file_path = file_path
        self.csv_data = Queue()
        print('loading csv')
        self.load_csv()
        print('done loading csv')

    def load_csv(self):
        """Loops through csv and adds data to a queue"""
        with open(self.file_path, 'r') as f:
            self.reader = reader(f)
            next(self.reader, None)  # skip header
            for row in self.reader:
                self.csv_data.put(row)

    def _fetch_data(self):
        """Returns next item of the queue"""
        item = self.csv_data.get()
        self.csv_data.task_done()
        print(self.csv_data.qsize())
        return item
Suppose there is a CSVDataSource instance called ds; if I want to stop the thread, I call:
ds.stop()
ds.join()
The ds.join() call, however, never returns. I'm not sure why this is, because the run() method does check whether the stop event is set.
Any Ideas?
Update
A little more clarity as requested: the application is built up out of several threads. The RealStrategy thread (below) is the owner of all the other threads and is responsible for starting and terminating them. I haven't set the daemon flag for any of the threads, so they should be non-daemonic by default.
The main thread looks like this:
if __name__ == '__main__':
    def exit_handler(signal, frame):
        rs.stop_engine()
        rs.join()
        sys.exit(0)

    signal.signal(signal.SIGINT, exit_handler)

    rs = RealStrategy()
    rs.run_engine()
And here are the rs.run_engine() and rs.stop_engine() methods that are called in main:
class RealStrategy(Thread):
    .....
    .....

    def run_engine(self):
        self.on_start()
        self._order_handler.start()
        self._data_parser.start()
        self._data_source.start()
        self.start()

    def stop_engine(self):
        self._data_source.stop()
        self._data_parser.stop()
        self._order_handler.stop()
        self._data_source.join()
        self._data_parser.join()
        self._order_handler.join()
        self.stop()
If you want to use queue.Queue.join, then you must also use queue.Queue.task_done. You can read the linked documentation, or see the following excerpt from it:
Queue.task_done()
Indicate that a formerly enqueued task is complete.
Used by queue consumer threads. For each get() used to fetch a task, a
subsequent call to task_done() tells the queue that the processing on
the task is complete.
If a join() is currently blocking, it will resume when all items have
been processed (meaning that a task_done() call was received for every
item that had been put() into the queue).
Raises a ValueError if called more times than there were items placed
in the queue.
Queue.join()
Blocks until all items in the queue have been gotten and processed.
The count of unfinished tasks goes up whenever an item is added to the
queue. The count goes down whenever a consumer thread calls
task_done() to indicate that the item was retrieved and all work on it
is complete. When the count of unfinished tasks drops to zero, join()
unblocks.
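In short, every put() must eventually be matched by a get() followed by task_done(), or join() blocks forever. A minimal illustration of that contract (my sketch, not part of the answer):

from queue import Queue
from threading import Thread

q = Queue()

def consumer():
    while True:
        item = q.get()
        if item is None:          # sentinel: stop consuming
            q.task_done()
            break
        # ... process item ...
        q.task_done()             # one task_done() per completed get()

t = Thread(target=consumer, daemon=True)
t.start()
for i in range(10):
    q.put(i)
q.put(None)
q.join()                          # returns once every put() has been matched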
To test your problem, an example implementation was created to find out what was going on. It is slightly different from how your program works, but it demonstrates one way to solve your problem:
#! /usr/bin/env python3
import abc
import csv
import pathlib
import queue
import sys
import threading
import time

def main():
    source_path = pathlib.Path(r'C:\path\to\file.csv')
    data_source = CSVDataSource(source_path)
    data_source.start()
    processor = StoppableThread(target=consumer, args=[data_source])
    processor.start()
    time.sleep(0.1)
    data_source.stop()

def consumer(data_source):
    while data_source.empty:
        time.sleep(0.001)
    while not data_source.empty:
        task = data_source.get_from_queue(True, 0.1)
        print(*task.data, sep=', ', flush=True)
        task.done()

class StopThread(StopIteration):
    pass

threading.SystemExit = SystemExit, StopThread
class StoppableThread(threading.Thread):
    def _bootstrap(self, stop=False):
        # noinspection PyProtectedMember
        if threading._trace_hook:
            raise RuntimeError('cannot run thread with tracing')

        def terminate():
            nonlocal stop
            stop = True
        self.__terminate = terminate

        # noinspection PyUnusedLocal
        def trace(frame, event, arg):
            if stop:
                raise StopThread
        sys.settrace(trace)
        super()._bootstrap()

    def terminate(self):
        try:
            self.__terminate()
        except AttributeError:
            raise RuntimeError('cannot terminate thread '
                               'before it is started') from None
class Queryable:
    def __init__(self, maxsize=1 << 10):
        self.__queue = queue.Queue(maxsize)

    def add_to_queue(self, item):
        self.__queue.put(item)

    def get_from_queue(self, block=True, timeout=None):
        return self.__queue.get(block, timeout)

    @property
    def empty(self):
        return self.__queue.empty()

    @property
    def full(self):
        return self.__queue.full()

    def task_done(self):
        self.__queue.task_done()

    def join_queue(self):
        self.__queue.join()

class StoppableQueryThread(StoppableThread, Queryable):
    def __init__(self, target=None, name=None, args=(), kwargs=None,
                 *, daemon=None, maxsize=1 << 10):
        super().__init__(None, target, name, args, kwargs, daemon=daemon)
        Queryable.__init__(self, maxsize)

    def stop(self):
        self.terminate()
        self.join_queue()
class DataSource(StoppableQueryThread, abc.ABC):
    @abc.abstractmethod
    def __init__(self, maxsize=1 << 10):
        super().__init__(None, 'DataSource', maxsize=maxsize)

    def run(self):
        while True:
            record = self._fetch_data()
            self.add_to_queue(record)

    @abc.abstractmethod
    def _fetch_data(self):
        pass

class CSVDataSource(DataSource):
    def __init__(self, source_path):
        super().__init__()
        self.__data_parser = self.__build_data_parser(source_path)

    @staticmethod
    def __build_data_parser(source_path):
        with source_path.open(newline='') as source:
            parser = csv.reader(source)
            next(parser, None)
            yield from parser

    def _fetch_data(self):
        try:
            return Task(next(self.__data_parser), self.task_done)
        except StopIteration:
            raise StopThread from None

class Task:
    def __init__(self, data, callback):
        self.__data = data
        self.__callback = callback

    @property
    def data(self):
        return self.__data

    def done(self):
        self.__callback()

if __name__ == '__main__':
    main()

Python multiprocessing: Kill producer and consumer processes with KeyboardInterrupt

I want the consumer and producer processes in the following Python script to stop when CTRL+C is pressed. But the processes do not stop; the keyboard interrupt is not passed to them. Also, the except block of the main process is never entered.
import time
import multiprocessing as mp
from multiprocessing.managers import SyncManager
import signal

class Consumer(mp.Process):
    def __init__(self, **kwargs):
        mp.Process.__init__(self, **kwargs)

    def run(self):
        proc_name = self.name
        try:
            while True:
                print("{}".format(proc_name))
                time.sleep(3)
        except KeyboardInterrupt:
            print("{} stopped".format(proc_name))  # never printed
            return

class Producer(mp.Process):
    def __init__(self, **kwargs):
        mp.Process.__init__(self, **kwargs)

    def run(self):
        try:
            while True:
                time.sleep(3)
                print("Producer here.")
        except KeyboardInterrupt:
            print("Producer stopped.")  # never printed
            return

def main():
    def __init_worker():
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        print('init')  # not printed!!??

    # manager = SyncManager()  # does not change anything
    # manager.start(__init_worker)

    consumers = [Consumer(target=__init_worker) for i in xrange(3)]
    producer = Producer(target=__init_worker)

    producer.daemon = True  # does not change anything
    producer.start()
    for c in consumers:
        c.daemon = True
        c.start()

    try:
        producer.join()
        for c in consumers:
            c.join()
    except Exception as e:
        print('STOP')  # never printed
        raise e

if __name__ == '__main__':
    main()
There might also be a solution for my task using multiprocessing.Pool for the consumers and letting the main process work as the producer, but I would like to know why my implementation is not working as intended and what I need to adjust.
I realised that __init_worker does not seem to be executed (it makes no difference whether it is located outside of main). Maybe that is the reason the KeyboardInterrupt is not passed to the consumer and producer processes?
Based on eryksun's comments I improved my code and now use multiprocessing.Event, and the script now works as expected. I also removed some lines which I think are no longer necessary. Since I did not find any similar solution when searching the web, here is my code:
import time
import multiprocessing as mp

class Consumer(mp.Process):
    def __init__(self, quit_event, **kwargs):
        mp.Process.__init__(self, **kwargs)
        self.quit_event = quit_event

    def run(self):
        proc_name = self.name
        while not self.quit_event.is_set():
            print("{}".format(proc_name))
            time.sleep(3)
        print("{} stopped".format(proc_name))
        return

class Producer(mp.Process):
    def __init__(self, quit_event, **kwargs):
        mp.Process.__init__(self, **kwargs)
        self.quit_event = quit_event

    def run(self):
        while not self.quit_event.is_set():
            print("Producer here.")
            time.sleep(3)
        print("Producer stopped")
        return

def main():
    quit_event = mp.Event()

    consumers = [Consumer(quit_event) for i in xrange(3)]
    producer = Producer(quit_event)

    producer.start()
    for c in consumers:
        c.start()

    try:
        producer.join()
        for c in consumers:
            c.join()
    except KeyboardInterrupt as e:
        print('\nSTOP')
        quit_event.set()
    except Exception as e:
        quit_event.set()
        raise e
    finally:
        producer.terminate()
        producer.join()
        for c in consumers:
            c.terminate()
            c.join()

if __name__ == '__main__':
    main()
Hoping, that it helps somebody.
Edit: Swapped the terminate and join statements.
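For reference, here is a minimal sketch of the multiprocessing.Pool alternative mentioned in the question; it is not part of the question or the answer above, and the function names are made up for illustration. The workers ignore SIGINT via an initializer, and the main process stays interruptible by waiting on the AsyncResult with a timeout.

import multiprocessing as mp
import signal
import time

def init_worker():
    # worker processes ignore SIGINT; only the main process handles CTRL+C
    signal.signal(signal.SIGINT, signal.SIG_IGN)

def consume(item):
    time.sleep(1)
    return item

if __name__ == '__main__':
    pool = mp.Pool(3, initializer=init_worker)
    try:
        # get() with a timeout keeps the main process responsive to CTRL+C
        results = pool.map_async(consume, range(100)).get(timeout=3600)
    except KeyboardInterrupt:
        print('\nSTOP')
        pool.terminate()
    else:
        pool.close()
    finally:
        pool.join()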

Exiting a Thread in Python

I'm trying to write a program that crawls through a website and downloads all the videos it has. I'm facing a problem where the number of threads continuously increases, even after the downloads of individual videos are done.
Here is the code for the individual Worker object, which is queued and then joined later. This is the only part of the code where I create a Thread. What I don't understand is how there can be any threads remaining, given that the object implements self.stop() and the while loop then exits.
class Worker(Thread):
    def __init__(self, thread_pool):
        Thread.__init__(self)
        self.tasks = thread_pool.tasks
        self.tasks_info = thread_pool.tasks_info
        self.daemon = True
        self._is_running = True
        self.start()

    def stop(self):
        self._is_running = False

    def run(self):
        while self._is_running:
            func, args, kargs = self.tasks.get()
            try:
                func(*args, **kargs)
            except Exception:
                print("\nError: Threadpool error.")
                sys.exit(1)
            self.tasks_info['num_tasks_complete'] += 1
            self.tasks.task_done()
        self.stop()
I've used the threading functions to check which threads are alive, and it turns out it is indeed mostly the Worker threads, as well as other objects called Thread(SockThread) and _MainThread, which I do not know how to close.
Please advise on 1. why the Worker thread is not ending and 2. how to get rid of the Thread(SockThread) as well as the _MainThread.
Thank you!
edit 1
class ThreadPool:
    def __init__(self, name, num_threads, num_tasks):
        self.tasks = Queue(num_threads)
        self.num_threads = num_threads
        self.tasks_info = {
            'name': name,
            'num_tasks': num_tasks,
            'num_tasks_complete': 0
        }
        for _ in range(num_threads):
            Worker(self)
        print(threading.active_count())

    def add_task(self, func, *args, **kwargs):
        self.tasks.put((func, args, kwargs))

    def wait_completion(self):
        print("at the beginning of wait_completion:")
        print(threading.active_count())
Looking at your code, you initialize the thread and immediately call start() from inside __init__, which is not the proper way to set it up. Your code should be along the following lines, using an Event to signal shutdown:
import sys
from threading import Thread, Event

class Worker(Thread):
    def __init__(self, thread_pool):
        self.tasks = thread_pool.tasks
        self.tasks_info = thread_pool.tasks_info
        self.exit = Event()
        super(Worker, self).__init__()  # initialize the Thread base class

    def shutdown(self):
        self.exit.set()

    def run(self):
        while not self.exit.is_set():
            func, args, kargs = self.tasks.get()
            try:
                func(*args, **kargs)
            except Exception:
                print("\nError: Threadpool error.")
                # use shutdown method for error
                self.shutdown()
                sys.exit(1)
            self.tasks_info['num_tasks_complete'] += 1
            self.tasks.task_done()
        self.shutdown()
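One caveat (my note, not part of the answer): a worker blocked in tasks.get() will not re-check the exit flag until another task arrives, so shutdown() alone may not unblock it. A hedged variant of run() that polls the flag with a timed get, intended as a drop-in replacement for the method above:

from queue import Empty  # raised when the timed get() expires

class Worker(Thread):
    ...  # __init__ and shutdown() as in the answer above

    def run(self):
        while not self.exit.is_set():
            try:
                # wake up periodically so the exit flag is re-checked even
                # when the queue stays empty
                func, args, kargs = self.tasks.get(timeout=0.5)
            except Empty:
                continue
            try:
                func(*args, **kargs)
            except Exception:
                print("\nError: Threadpool error.")
            finally:
                self.tasks_info['num_tasks_complete'] += 1
                self.tasks.task_done()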

Checking a lot of URLs to see if they return 200. What's the cleverest way?

I need to check a lot (~10 million) of URLs to see if they exist (return 200). I've written the following code to do this per-URL, but to do all of the URLs will take approximately forever.
def is_200(url):
    try:
        parsed = urlparse(url)
        conn = httplib.HTTPConnection(parsed.netloc)
        conn.request("HEAD", parsed.path)
        res = conn.getresponse()
        return res.status == 200
    except KeyboardInterrupt, e:
        raise e
    except:
        return False
The URLs are spread across about a dozen hosts, so it seems like I should be able to take advantage of this to pipeline my requests and reduce connection overhead. How would you build this? I'm open to any programming/scripting language.
Have a look at urllib3. It supports per-host connection re-using.
Additionally using multiple processes/threads or async I/O would be a good idea.
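A minimal sketch of what that could look like (my illustration, not part of the answer); urllib3's PoolManager keeps a connection pool per host, so repeated checks against the same dozen hosts re-use connections:

import urllib3

http = urllib3.PoolManager(maxsize=10)  # up to 10 pooled connections per host

def is_200(url):
    try:
        response = http.request("HEAD", url, timeout=5.0, retries=False)
        return response.status == 200
    except urllib3.exceptions.HTTPError:
        return False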
All of this is in Python, version 3.x.
I would create worker threads that check for 200. I'll give an example. The threadpool (put in threadpool.py):
# http://code.activestate.com/recipes/577187-python-thread-pool/
from queue import Queue
from threading import Thread

class Worker(Thread):
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        while True:
            func, args, kargs = self.tasks.get()
            try: func(*args, **kargs)
            except Exception as exception: print(exception)
            self.tasks.task_done()

class ThreadPool:
    def __init__(self, num_threads):
        self.tasks = Queue(num_threads)
        for _ in range(num_threads): Worker(self.tasks)

    def add_task(self, func, *args, **kargs):
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        self.tasks.join()
Now, if urllist contains your urls then your main file should be along the lines of this:
numconns = 40
workers = threadpool.ThreadPool(numconns)
results = [None] * len(urllist)

def check200(url, index):
    results[index] = is_200(url)

for index, url in enumerate(urllist):
    try:
        workers.add_task(check200, url, index)
    except KeyboardInterrupt:
        print("Shutting down application, hang on...")
        workers.wait_completion()
        break
Note that this program scales with the other suggestions posted here; it only depends on is_200().
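Since the pool above is written for Python 3 while the question's is_200() uses the Python 2 httplib module, here is a hedged Python 3 counterpart (http.client replaces httplib):

from http.client import HTTPConnection
from urllib.parse import urlparse

def is_200(url):
    parsed = urlparse(url)
    conn = HTTPConnection(parsed.netloc, timeout=10)
    try:
        conn.request("HEAD", parsed.path or "/")
        return conn.getresponse().status == 200
    except Exception:
        return False
    finally:
        conn.close()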
