I'm trying to write a program that crawls through a website and download all the videos it has. I'm facing a problem that the number of threads continuously increases even after the downloading of individual videos are done.
Here is the code for the individual Worker object, which is queued and then joined later. This is the only part of the code at which I generate a Thread. What I don't understand is how there can be remaining threads if given the object, I implement the self.stop() function and the while loop breaks.
class Worker(Thread):
def __init__(self, thread_pool):
Thread.__init__(self)
self.tasks = thread_pool.tasks
self.tasks_info = thread_pool.tasks_info
self.daemon = True
self._is_running=True
self.start()
def stop(self):
self._is_running = False
def run(self):
while self._is_running:
func, args, kargs = self.tasks.get()
try: func(*args, **kargs)
except Exception:
print("\nError: Threadpool error.")
sys.exit(1)
self.tasks_info['num_tasks_complete'] += 1
self.tasks.task_done()
self.stop()
I've used the thread functions to check which threads are alive, and it turns out that it is indeed mostly the worker functions as well as other objects called Thread(SockThread) and _MainThread, which I do not know how to close.
Please advise on 1. why the Worker thread is not ending and 2. how to get rid of the Thread(SockThread) as well as the _MainThread.
Thank you!
edit 1
class ThreadPool:
def __init__(self, name, num_threads, num_tasks):
self.tasks = Queue(num_threads)
self.num_threads=num_threads
self.tasks_info = {
'name': name,
'num_tasks': num_tasks,
'num_tasks_complete': 0
}
for _ in range(num_threads):
Worker(self)
print(threading.active_count)
def add_task(self, func, *args, **kwargs):
self.tasks.put((func, args, kwargs))
def wait_completion(self):
print("at the beginning of wait_completion:")
print(threading.active_count())
By looking at your code it looks like you have initialized the thread which calls the run() method for processing. After that you're even using the start method which is not the proper way. Your code should be as follows:
from threading import Event
class Worker(Thread):
def __init__(self, thread_pool):
self.tasks = thread_pool.tasks
self.tasks_info = thread_pool.tasks_info
self.exit = Event()
super(Thread,self).__init__()
def shutdown(self):
self.exit.set()
def run(self):
while not self.exit.is_set():
func, args, kargs = self.tasks.get()
try:
func(*args, **kargs)
except Exception:
print("\nError: Threadpool error.")
# use shutdown method for error
self.shutdown()
sys.exit(1)
self.tasks_info['num_tasks_complete'] += 1
self.tasks.task_done()
self.shutdown()
Related
I am trying to implement a child class from the multiprocessing Queue in python. The child class contains a simple Boolean flag "ready". When I send the queue to a new process, the ready attribute is disappearing. The following code demonstrates the problem:
import multiprocessing
import multiprocessing.queues
class ReadyQueue(multiprocessing.queues.Queue):
def __init__(self, ctx, *args, **kwargs):
super(ReadyQueue, self).__init__(ctx=ctx, *args, **kwargs)
self.ready = False
def ready_queue(*args, **kwargs):
return ReadyQueue(ctx=multiprocessing.get_context(), *args, **kwargs)
def foo(q):
print(q.ready)
if __name__ == "__main__":
my_queue = ready_queue()
print(my_queue.ready)
p = multiprocessing.Process(target=foo, args=(my_queue,))
p.start()
p.join()
With the output:
False
Process Process-1:
Traceback (most recent call last):
File "C:\Users\acre018\Anaconda3\envs\EIT_Qt\lib\multiprocessing\process.py", line 315, in _bootstrap
self.run()
File "C:\Users\acre018\Anaconda3\envs\EIT_Qt\lib\multiprocessing\process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "C:\Users\acre018\github\EIT_Qt\Experiments\ready_queue_test.py", line 16, in foo
print(q.ready)
AttributeError: 'ReadyQueue' object has no attribute 'ready'
I implemented this workaround:
import multiprocessing
from queue import Empty
import time
import ctypes
class ReadyQueue:
def __init__(self, *args, **kwargs):
self.queue = multiprocessing.Queue(*args, **kwargs)
self._ready = multiprocessing.Value(ctypes.c_bool, False)
def set_ready(self):
self._ready.value = True
def set_not_ready(self):
self._ready.value = False
self.clear()
def is_ready(self):
return self._ready.value
def clear(self):
try:
while True:
self.queue.get(block=False)
except Empty:
pass
def get(self, block=True, timeout=None):
return self.queue.get(block, timeout)
def put(self, obj, block=True, timeout=None):
return self.queue.put(obj, block, timeout)
def full(self):
return self.queue.full()
def empty(self):
return self.queue.empty()
def qsize(self):
return self.queue.qsize()
def foo(q):
while q.is_ready():
time.sleep(1)
q.put("hello from foo")
print("q no longer ready, foo loop finished")
if __name__ == "__main__":
my_queue = ReadyQueue()
my_queue.set_ready()
p = multiprocessing.Process(target=foo, args=(my_queue,))
p.start()
for i in range(2):
print(my_queue.get())
time.sleep(2)
print("my_queue._ready = %s, qsize: %d. Setting not ready.." % (str(my_queue.is_ready()), my_queue.qsize()))
my_queue.set_not_ready()
print("my_queue._ready = %s, qusize: %d" % (str(my_queue.is_ready()), my_queue.qsize()))
With the output:
C:\Users\acre018\Anaconda3\envs\test_pyqt\python.exe C:/Users/acre018/github/EIT_Qt/Experiments/ready_queue_test2.py
hello from foo
hello from foo
my_queue._ready = True, qsize: 2. Setting not ready..
my_queue._ready = False, qusize: 0
q no longer ready, foo loop finished
Process finished with exit code 0
The workaround is to have my ReadyQueue class not inherit from multiprocessing.queues.Queue but have a queue as an attribute. For convenience I implemented the methods that I need from queue, and they just pass through to the queue attribute. I also implemented a clear method.
Note that in my first example I neglected to make self.ready a multiprocessing.Value, so wouldn't have been able to edit it across processes, but I tested after fixing that and it was not the source of the issue.
My problem is as follows:
I have a class that inherits from threading.Thread that I want to be able to stop gracefully. This class also has a Queue it get's its work from.
Since there are quite some classes in my project that should have this behaviour, I've created some superclasses to reduce duplicate code like this:
Thread related behaviour:
class StoppableThread(Thread):
def __init__(self):
Thread.__init__(self)
self._stop = Event()
def stop(self):
self._stop.set()
def stopped(self):
return self._stop.isSet()
Queue related behaviour:
class Queueable():
def __init__(self):
self._queue = Queue()
def append_to_job_queue(self, job):
self._queue.put(job)
Combining the two above and adding queue.join() to the stop() call
class StoppableQueueThread(StoppableThread, Queueable):
def __init__(self):
StoppableThread.__init__(self)
Queueable.__init__(self)
def stop(self):
super(StoppableQueueThread, self).stop()
self._queue.join()
A base class for a datasource:
class DataSource(StoppableThread, ABC):
def __init__(self, data_parser):
StoppableThread.__init__(self)
self.setName("DataSource")
ABC.__init__(self)
self._data_parser = data_parser
def run(self):
while not self.stopped():
record = self._fetch_data()
self._data_parser.append_to_job_queue(record)
#abstractmethod
def _fetch_data(self):
"""implement logic here for obtaining a data piece
should return the fetched data"""
An implementation for a datasource:
class CSVDataSource(DataSource):
def __init__(self, data_parser, file_path):
DataSource.__init__(self, data_parser)
self.file_path = file_path
self.csv_data = Queue()
print('loading csv')
self.load_csv()
print('done loading csv')
def load_csv(self):
"""Loops through csv and adds data to a queue"""
with open(self.file_path, 'r') as f:
self.reader = reader(f)
next(self.reader, None) # skip header
for row in self.reader:
self.csv_data.put(row)
def _fetch_data(self):
"""Returns next item of the queue"""
item = self.csv_data.get()
self.csv_data.task_done()
print(self.csv_data.qsize())
return item
Suppose there is a CSVDataSource instance called ds, if I want to stop the thread I call:
ds.stop()
ds.join()
The ds.join() call however, never returns. I'm not sure why this is, because the run() method does check if the stop event is set.
Any Ideas?
Update
A little more clarity as requested: the applications is build up out of several threads. The RealStrategy thread (below) is the owner of all the other threads and is responsible for starting and terminating them. I haven't set the daemon flag for any of the threads, so they should be non-daemonic by default.
The main thread looks like this:
if __name__ == '__main__':
def exit_handler(signal, frame):
rs.stop_engine()
rs.join()
sys.exit(0)
signal.signal(signal.SIGINT, exit_handler)
rs = RealStrategy()
rs.run_engine()
And here are the rs.run_engine() and rs.stop_engine() methods that are called in main:
class RealStrategy(Thread):
.....
.....
def run_engine(self):
self.on_start()
self._order_handler.start()
self._data_parser.start()
self._data_source.start()
self.start()
def stop_engine(self):
self._data_source.stop()
self._data_parser.stop()
self._order_handler.stop()
self._data_source.join()
self._data_parser.join()
self._order_handler.join()
self.stop()
If you want to use queue.Queue.join, then you must also use queue.Queue.task_done. You can read the linked documentation or see the following copied from information available online:
Queue.task_done()
Indicate that a formerly enqueued task is complete.
Used by queue consumer threads. For each get() used to fetch a task, a
subsequent call to task_done() tells the queue that the processing on
the task is complete.
If a join() is currently blocking, it will resume when all items have
been processed (meaning that a task_done() call was received for every
item that had been put() into the queue).
Raises a ValueError if called more times than there were items placed
in the queue.
Queue.join()
Blocks until all items in the queue have been gotten and processed.
The count of unfinished tasks goes up whenever an item is added to the
queue. The count goes down whenever a consumer thread calls
task_done() to indicate that the item was retrieved and all work on it
is complete. When the count of unfinished tasks drops to zero, join()
unblocks.
To test your problem, an example implementation was created to find out what was going on. It is slightly different from how your program works but demonstrates a method to solving your problem:
#! /usr/bin/env python3
import abc
import csv
import pathlib
import queue
import sys
import threading
import time
def main():
source_path = pathlib.Path(r'C:\path\to\file.csv')
data_source = CSVDataSource(source_path)
data_source.start()
processor = StoppableThread(target=consumer, args=[data_source])
processor.start()
time.sleep(0.1)
data_source.stop()
def consumer(data_source):
while data_source.empty:
time.sleep(0.001)
while not data_source.empty:
task = data_source.get_from_queue(True, 0.1)
print(*task.data, sep=', ', flush=True)
task.done()
class StopThread(StopIteration):
pass
threading.SystemExit = SystemExit, StopThread
class StoppableThread(threading.Thread):
def _bootstrap(self, stop=False):
# noinspection PyProtectedMember
if threading._trace_hook:
raise RuntimeError('cannot run thread with tracing')
def terminate():
nonlocal stop
stop = True
self.__terminate = terminate
# noinspection PyUnusedLocal
def trace(frame, event, arg):
if stop:
raise StopThread
sys.settrace(trace)
super()._bootstrap()
def terminate(self):
try:
self.__terminate()
except AttributeError:
raise RuntimeError('cannot terminate thread '
'before it is started') from None
class Queryable:
def __init__(self, maxsize=1 << 10):
self.__queue = queue.Queue(maxsize)
def add_to_queue(self, item):
self.__queue.put(item)
def get_from_queue(self, block=True, timeout=None):
return self.__queue.get(block, timeout)
#property
def empty(self):
return self.__queue.empty()
#property
def full(self):
return self.__queue.full()
def task_done(self):
self.__queue.task_done()
def join_queue(self):
self.__queue.join()
class StoppableQueryThread(StoppableThread, Queryable):
def __init__(self, target=None, name=None, args=(), kwargs=None,
*, daemon=None, maxsize=1 << 10):
super().__init__(None, target, name, args, kwargs, daemon=daemon)
Queryable.__init__(self, maxsize)
def stop(self):
self.terminate()
self.join_queue()
class DataSource(StoppableQueryThread, abc.ABC):
#abc.abstractmethod
def __init__(self, maxsize=1 << 10):
super().__init__(None, 'DataSource', maxsize=maxsize)
def run(self):
while True:
record = self._fetch_data()
self.add_to_queue(record)
#abc.abstractmethod
def _fetch_data(self):
pass
class CSVDataSource(DataSource):
def __init__(self, source_path):
super().__init__()
self.__data_parser = self.__build_data_parser(source_path)
#staticmethod
def __build_data_parser(source_path):
with source_path.open(newline='') as source:
parser = csv.reader(source)
next(parser, None)
yield from parser
def _fetch_data(self):
try:
return Task(next(self.__data_parser), self.task_done)
except StopIteration:
raise StopThread from None
class Task:
def __init__(self, data, callback):
self.__data = data
self.__callback = callback
#property
def data(self):
return self.__data
def done(self):
self.__callback()
if __name__ == '__main__':
main()
I am trying to combine the answers I got from two different python questions.
Here is the the first question and answer. Basically I just wanted to spawn two threads, one to powerDown() and the other to powerUp(), where powerUp() pends on powerDown()
How to spawn a thread inside another thread in the same object in python?
import threading
class Server(threading.Thread):
# some code
def run(self):
self.reboot()
# This is the top level function called by other objects
def reboot(self):
# perhaps add a lock
if not hasattr(self, "_down"):
self._down = threading.Thread(target=self.__powerDown)
self._down.start()
up = threading.Thread(target=self.__powerUp)
up.start()
def __powerDown(self):
# do something
def __powerUp(self):
if not hasattr(self, "_down"):
return
self._down.join()
# do something
del self._down
Here is the the second question and answer. Basically I wanted to start a thread, and then call a function of the object.
How to call a function on a running Python thread
import queue
import threading
class SomeClass(threading.Thread):
def __init__(self, q, loop_time = 1.0/60):
self.q = q
self.timeout = loop_time
super(SomeClass, self).__init__()
def onThread(self, function, *args, **kwargs):
self.q.put((function, args, kwargs))
def run(self):
while True:
try:
function, args, kwargs = self.q.get(timeout=self.timeout)
function(*args, **kwargs)
except queue.Empty:
self.idle()
def idle(self):
# put the code you would have put in the `run` loop here
def doSomething(self):
pass
def doSomethingElse(self):
pass
Here is combined idea code. Basically I wanted to spawn a thread, then then queue up a functions to execute, which in this case is reboot(). reboot() in turns creates two threads, the powerDown() and powerUp() threads, where powerDown() pends on powerUp()
import threading
import Queue
class Server(threading.Thread):
def __init__(self, q, loop_time = 1.0/60):
self.q = q
self.timeout = loop_time
super(Server, self).__init__()
def run(self):
while True:
try:
function, args, kwargs = self.q.get(timeout=self.timeout)
function(*args, **kwargs)
except queue.Empty:
self.idle()
def idle(self):
# put the code you would have put in the `run` loop here
# This is the top level function called by other objects
def reboot(self):
self.__onthread(self.__reboot)
def __reboot(self):
if not hasattr(self, "_down"):
self._down = threading.Thread(target=self.__powerDown)
self._down.start()
up = threading.Thread(target=self.__powerUp)
up.start()
def __onThread(self, function, *args, **kwargs):
self.q.put((function, args, kwargs))
def __powerDown(self):
# do something
def __powerUp(self):
if not hasattr(self, "_down"):
return
self._down.join()
# do something
del self._down
All work, except when I create two Server subclasses.
class ServerA(Server):
pass
class ServerB(Server):
pass
Here is the code that instatiats both subclasses, and call the start() and reboot functions
serverA = ServerA(None)
serverB = ServerB(None)
serverA.start()
serverB.start()
serverA.reboot()
serverB.reboot()
I expect serverA.reboot() and serverB.reboot() to happen concurrently, which is what I want, but they DO NOT! serverB.reboot() gets executed after serverA.reboot() is done. That is, if I put print statements, I get
serverA started
serverB started
serverA.reboot() called
serverA.__powerDown called
serverA.__powerUp called
serverB.reboot() called
serverB.__powerDown called
serverB.__powerUp called
I know for a fact that it takes longer for ServerA to reboot, so I expect something like this
serverA started
serverB started
serverA.reboot() called
serverB.reboot() called
serverA.__powerDown called
serverB.__powerDown called
serverB.__powerUp called
serverA.__powerUp called
I hope that makes sense. If it does, why aren't my reboot() functions happening simultaneously?
Why are you sending None while you are expecting a queue object in the first place ? This causes an exception which complains that None type object doesn't have a get method. Besides that the exception you want to be handled in the run method is Queue.Empty and not queue.Empty.
Here is the revised code and its output on my machine:
import threading
import Queue
class Server(threading.Thread):
def __init__(self, title, q, loop_time = 1.0/60):
self.title = title
self.q = q
self.timeout = loop_time
super(Server, self).__init__()
def run(self):
print "%s started" % self.title
while True:
try:
function, args, kwargs = self.q.get(timeout=self.timeout)
function(*args, **kwargs)
except Queue.Empty:
# print "empty"
self.idle()
def idle(self):
pass
# put the code you would have put in the `run` loop here
# This is the top level function called by other objects
def reboot(self):
self.__onThread(self.__reboot)
def __reboot(self):
if not hasattr(self, "_down"):
self._down = threading.Thread(target=self.__powerDown)
self._down.start()
up = threading.Thread(target=self.__powerUp)
up.start()
def __onThread(self, function, *args, **kwargs):
self.q.put((function, args, kwargs))
def __powerDown(self):
# do something
print "%s power down" % self.title
pass
def __powerUp(self):
print "%s power up" % self.title
if not hasattr(self, "_down"):
return
self._down.join()
# do something
del self._down
class ServerA(Server):
pass
class ServerB(Server):
pass
def main():
serverA = ServerA("A", Queue.Queue())
serverB = ServerB("B", Queue.Queue())
serverA.start()
serverB.start()
serverA.reboot()
serverB.reboot()
if __name__ == '__main__':
main()
Output:
A started
B started
B power down
A power down
B power up
A power up
Here is my threading setup. On my machine the maximum number of threads is 2047.
class Worker(Thread):
"""Thread executing tasks from a given tasks queue"""
def __init__(self, tasks):
Thread.__init__(self)
self.tasks = tasks
self.daemon = True
self.start()
def run(self):
while True:
func, args, kargs = self.tasks.get()
try:
func(*args, **kargs)
except Exception, e:
print e
self.tasks.task_done()
class ThreadPool:
"""Pool of threads consuming tasks from a queue"""
def __init__(self, num_threads):
self.tasks = Queue(num_threads)
for _ in range(num_threads):
Worker(self.tasks)
def add_task(self, func, *args, **kargs):
"""Add a task to the queue"""
self.tasks.put((func, args, kargs))
def wait_completion(self):
"""Wait for completion of all the tasks in the queue"""
self.tasks.join()
In other classes in my module, I call the ThreadPool class from above to
create a new pool of threads. I then perform operations. Here is an example:
def upload_images(self):
'''batch uploads images to s3 via multi-threading'''
num_threads = min(500, len(pictures))
pool = ThreadPool(num_threads)
for p in pictures:
pool.add_task(p.get_set_upload_img)
pool.wait_completion()
The problem I am having is that these threads are not being garbage collected.
After a few runs, here is my error:
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 495, in start
_start_new_thread(self.__bootstrap, ())
thread.error: can't start new thread
Which means I have hit the thread limit of 2047.
Any ideas? Thanks.
Your worker thread never returns from run, so your thread never ends.
Perhaps something like the following for your run method?
def run(self):
while True:
try:
func, args, kargs = self.tasks.get()
except Queue.Empty:
break
try:
func(*args, **kargs)
except Exception, e:
print e
self.tasks.task_done()
def run(self):
while True:
func, args, kargs = self.tasks.get()
try:
func(*args, **kargs)
except Exception, e:
print e
self.tasks.task_done()
it looks like an infinite loop, could it be the reason? all threads are alive so they can't be gc collected.
I need to check a lot (~10 million) of URLs to see if they exist (return 200). I've written the following code to do this per-URL, but to do all of the URLs will take approximately forever.
def is_200(url):
try:
parsed = urlparse(url)
conn = httplib.HTTPConnection(parsed.netloc)
conn.request("HEAD", parsed.path)
res = conn.getresponse()
return res.status == 200
except KeyboardInterrupt, e:
raise e
except:
return False
The URLs are spread across about a dozen hosts, so it seems like I should be able to take advantage of this to pipeline my requests and reduce connection overhead. How would you build this? I'm open to any programming/scripting language.
Have a look at urllib3. It supports per-host connection re-using.
Additionally using multiple processes/threads or async I/O would be a good idea.
All of this is in Python, version 3.x.
I would create worker threads that check for 200. I'll give an example. The threadpool (put in threadpool.py):
# http://code.activestate.com/recipes/577187-python-thread-pool/
from queue import Queue
from threading import Thread
class Worker(Thread):
def __init__(self, tasks):
Thread.__init__(self)
self.tasks = tasks
self.daemon = True
self.start()
def run(self):
while True:
func, args, kargs = self.tasks.get()
try: func(*args, **kargs)
except Exception as exception: print(exception)
self.tasks.task_done()
class ThreadPool:
def __init__(self, num_threads):
self.tasks = Queue(num_threads)
for _ in range(num_threads): Worker(self.tasks)
def add_task(self, func, *args, **kargs):
self.tasks.put((func, args, kargs))
def wait_completion(self):
self.tasks.join()
Now, if urllist contains your urls then your main file should be along the lines of this:
numconns = 40
workers = threadpool.ThreadPool(numconns)
results = [None] * len(urllist)
def check200(url, index):
results[index] = is_200(url)
for index, url in enumerate(urllist):
try:
workers.add_task(check200, url, index)
except KeyboardInterrupt:
print("Shutting down application, hang on...")
workers.wait_completion()
break
Note that this program scales with the other suggestions posted here, this is only dependent on is_200().