Queue Python threading (Segmentation fault: 11)

I have a queue set up like this. I want to run through all the items from a DB query and pass them into my Downloader class. My DB connection keeps going away, and eventually the program just dies, I suspect because too many threads are open.
I get this error: Segmentation fault: 11
There are 100K+ items.
How can I fix this properly so it processes a few items at a time, and speed the whole thing up?
class Downloader(threading.Thread):
    """Threaded File Downloader"""

    def __init__(self, queue, db):
        threading.Thread.__init__(self)
        self.queue = queue
        self.db = db

    def remove_unicode(self, title):
        try:
            return unicodedata.normalize('NFKD', title).encode('ascii', 'ignore')
        except:
            return title

    def run(self):
        while True:
            # gets the url from the queue
            row = self.queue.get()
            title = row[0]
            etc...

def main(urls):
    queue = Queue.Queue()

    # create a thread pool and give them a queue
    for i in range(5):
        t = Downloader(queue, db)
        t.setDaemon(True)
        t.start()

    # give the queue some data
    i = 1
    for url in urls:
        print i
        queue.put(url)
        i += 1

    # wait for the queue to finish
    queue.join()

if __name__ == "__main__":
    db = DatabaseUtil()
    sql = 'SELECT `Title`, `Site` from `XYZ`'
    titles = db.query(sql)
    main(titles)
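A likely culprit is the single db handle shared by all five threads (most DB drivers are not thread-safe), combined with pushing all 100K+ rows into an unbounded queue. A minimal sketch of one way to restructure it, where process() and the per-thread DatabaseUtil() are hypothetical stand-ins for the elided work:

import threading
import Queue

class Downloader(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.daemon = True
        self.queue = queue
        self.db = DatabaseUtil()  # hypothetical: one connection per thread

    def run(self):
        while True:
            row = self.queue.get()
            try:
                self.process(row)  # hypothetical stand-in for the real work
            finally:
                self.queue.task_done()

def main(rows, num_workers=5):
    queue = Queue.Queue(maxsize=50)  # bounded, so puts block instead of
                                     # buffering all 100K rows in memory
    for _ in range(num_workers):
        Downloader(queue).start()
    for row in rows:
        queue.put(row)
    queue.join()  # wait until every row has been processed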

Related

Send data to Python thread, then read a response using Queue

It's quite easy to send or receive data through threads using the Queue module when doing one thing at a time, but I couldn't figure out how to send something to a thread and then properly wait for its reply.
In the example below, I was expecting to send something to the thread to be processed, then harvest the result, but the t.queue.get() in the main function receives what was just sent above instead of waiting for the thread to reply. How can I get around this?
#!/usr/bin/env python3
from threading import Thread
from queue import Queue

class MyThread(Thread):
    queue: Queue

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.queue = Queue()
        self.daemon = True

    # receives a name, then prints "Hello, name!"
    def run(self):
        while True:
            val = self.queue.get()
            if not val:
                break
            self.queue.put(f'Hello, {val}!')

def main():
    t = MyThread()
    t.start()

    # sends string to thread
    t.queue.put('Jurandir')

    # expects to receive "Hello, Jurandir!",
    # but "Jurandir" is immediately returned
    ret = t.queue.get()
    print(ret)

if __name__ == '__main__':
    main()
The thing is that you are getting the alleged result from the queue immediately, before the worker has added it. You can split it into an "input queue" and a "results queue", and then block in the main thread until the worker puts its output on the results queue.
#!/usr/bin/env python3
from threading import Thread
from queue import Queue

class MyThread(Thread):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.input_queue = Queue()
        self.results_queue = Queue()
        self.daemon = True

    # receives a name, then replies "Hello, name!"
    def run(self):
        while True:
            val = self.input_queue.get()
            if not val:
                break
            self.results_queue.put(f'Hello, {val}!')

def main():
    t = MyThread()
    t.start()

    # sends string to thread
    t.input_queue.put('Jurandir')

    # get() blocks until the worker has put a result
    ret = t.results_queue.get()
    print(ret)

if __name__ == '__main__':
    main()
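As an aside (not part of the original answer): for a one-shot request/response round trip like this, the standard library's concurrent.futures gives you the same input-queue/results-queue machinery for free, with a Future standing in for the results queue:

#!/usr/bin/env python3
from concurrent.futures import ThreadPoolExecutor

def greet(val):
    return f'Hello, {val}!'

def main():
    with ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(greet, 'Jurandir')  # runs in a worker thread
        print(future.result())                   # blocks until the worker replies

if __name__ == '__main__':
    main()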

Avoid waiting for threads to finish in Python

I wrote this script to read data from a txt file and process it. But it seems that if I give it a big file and a high number of threads, the more it reads from the list, the slower the script gets.
Is there a way to avoid waiting for all the threads to finish and start a new one whenever a thread is done with its work?
Also, it seems that when it finishes processing, the script doesn't exit.
import threading, Queue, time

class Work(threading.Thread):
    def __init__(self, jobs):
        threading.Thread.__init__(self)
        self.Lock = threading.Lock()
        self.jobs = jobs

    def myFunction(self):
        # simulate work
        self.Lock.acquire()
        print("Firstname: " + self.firstname + " Lastname: " + self.lastname)
        self.Lock.release()
        time.sleep(3)

    def run(self):
        while True:
            self.item = self.jobs.get().rstrip()
            self.firstname = self.item.split(":")[0]
            self.lastname = self.item.split(":")[1]
            self.myFunction()
            self.jobs.task_done()

def main(file):
    jobs = Queue.Queue()
    myList = open(file, "r").readlines()
    MAX_THREADS = 10
    pool = [Work(jobs) for i in range(MAX_THREADS)]
    for thread in pool:
        thread.start()
    for item in myList:
        jobs.put(item)
    for thread in pool:
        thread.join()

if __name__ == '__main__':
    main('list.txt')
The script probably seems to take longer on larger inputs because there's a 3-second pause between each batch of printing.
The issue with the script not finishing is that, since you are using a Queue, you need to call join() on the Queue, not on the individual threads. To make sure the script returns when the jobs have stopped running, you should also set daemon = True.
The Lock will also not work in the current code, because threading.Lock() produces a new lock each time; all the jobs need to share the same lock.
If you want to use this in Python 3 (which you should), note that the Queue module has been renamed to queue.
import threading, Queue, time

lock = threading.Lock()  # One lock

class Work(threading.Thread):
    def __init__(self, jobs):
        threading.Thread.__init__(self)
        self.daemon = True  # set daemon
        self.jobs = jobs

    def myFunction(self):
        # simulate work
        lock.acquire()  # All jobs share the one lock
        print("Firstname: " + self.firstname + " Lastname: " + self.lastname)
        lock.release()
        time.sleep(3)

    def run(self):
        while True:
            self.item = self.jobs.get().rstrip()
            self.firstname = self.item.split(":")[0]
            self.lastname = self.item.split(":")[1]
            self.myFunction()
            self.jobs.task_done()

def main(file):
    jobs = Queue.Queue()
    with open(file, 'r') as fp:  # Close the file when we're done
        myList = fp.readlines()
    MAX_THREADS = 10
    pool = [Work(jobs) for i in range(MAX_THREADS)]
    for thread in pool:
        thread.start()
    for item in myList:
        jobs.put(item)
    jobs.join()  # Join the Queue, not the threads

if __name__ == '__main__':
    main('list.txt')
Simpler example (based on an example from the Python docs)
import threading
import time
from Queue import Queue  # Py2
# from queue import Queue  # Py3

lock = threading.Lock()

def worker():
    while True:
        item = jobs.get()
        if item is None:
            break
        firstname, lastname = item.split(':')
        lock.acquire()
        print("Firstname: " + firstname + " Lastname: " + lastname)
        lock.release()
        time.sleep(3)
        jobs.task_done()

jobs = Queue()
pool = []
MAX_THREADS = 10
for i in range(MAX_THREADS):
    thread = threading.Thread(target=worker)
    thread.start()
    pool.append(thread)

with open('list.txt') as fp:
    for line in fp:
        jobs.put(line.rstrip())

# block until all tasks are done
jobs.join()

# stop workers
for i in range(MAX_THREADS):
    jobs.put(None)
for thread in pool:
    thread.join()
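As a further aside (not from the original answer): in Python 3, the pool/sentinel bookkeeping above can be replaced entirely by concurrent.futures.ThreadPoolExecutor, which creates the workers, feeds them items, and waits for completion when the with-block exits. A rough equivalent, dropping the explicit Lock for brevity:

import time
from concurrent.futures import ThreadPoolExecutor

def work(item):
    firstname, lastname = item.split(':')
    print("Firstname: " + firstname + " Lastname: " + lastname)
    time.sleep(3)  # simulate work

with open('list.txt') as fp:
    lines = [line.rstrip() for line in fp]

# map() submits every line to the pool; list() consumes the results,
# and the with-block blocks until all workers have finished
with ThreadPoolExecutor(max_workers=10) as pool:
    list(pool.map(work, lines))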

Regularly check whether a webserver is up with a Thread

I wrote a Thread subclass which tests whether a webserver is up or not.
import urllib
import threading
import time
import Queue

class Thread_CheckDeviceState(threading.Thread):
    def __init__(self, device_ip, queue, inter=0.1):
        self._run = True
        self._codes = {}
        self._queue = queue
        self._device_ip = device_ip
        self._inter = inter
        self._elapsed = 0
        threading.Thread.__init__(self)

    def stop(self):
        self._run = False

    def run(self):
        start = time.time()
        while self._run:
            try:
                code = urllib.urlopen(self._device_ip).getcode()
            except Exception:
                code = "nope"
            finally:
                measure = time.time()
                self._elapsed += measure - start
                print self._elapsed, code
                self._codes.update(
                    {self._elapsed: code}
                )
            time.sleep(self._inter)
        self._queue.put(self._codes)

q = Queue.Queue()
thread = Thread_CheckDeviceState("http://192.168.1.3", q)
thread.start()
time.sleep(10)
thread.stop()
print q.get()
It works fine, until I disconnect my PC from the network. From that moment on, the thread just does nothing until it is stopped. I would expect it to just continue and set the code to "nope", like I wrote in the exception handler. Why doesn't it work?
You need to use urllib2 instead, and specify a timeout parameter when you call urlopen().
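For illustration, a minimal sketch of the suggested change: import urllib2 instead of urllib, and the try block inside run() becomes (the 2-second timeout is an arbitrary choice):

try:
    # the timeout (in seconds) makes urlopen() raise instead of
    # blocking forever when the network is unreachable
    code = urllib2.urlopen(self._device_ip, timeout=2).getcode()
except Exception:
    code = "nope"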

Python multiprocessing exiting cleanly

I've got a daemon that runs a number of child processes intended to maintain telnet connections to collect data from a bunch of weather stations. I've set it up so that these child processes read from the telnet connection forever, passing the weather readings back to the parent process via a multiprocessing.Queue. I can't seem to get these child processes to exit cleanly when I stop the daemon with ./test.py stop. Is there an easy way to close the child processes on exit? A quick Google search mentioned someone using multiprocessing.Event; what's the best way to set this event on exit to ensure the processes terminate? Here's our current code:
from daemon import runner
from multiprocessing import Process, Queue
import telnetlib
from django.utils.encoding import force_text
from observations.weather.models import WeatherStation
import os

os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'

def read_weather_data(name, ip_address, port, queue):
    print "Started process to get data for", name
    client = telnetlib.Telnet(ip_address, port)
    while True:
        response = client.read_until('\r\n'.encode('utf8'))
        queue.put((name, force_text(response)))
    client.close()

class App(object):
    def __init__(self):
        self.stdin_path = '/dev/null'
        self.stdout_path = '/dev/tty'
        self.stderr_path = '/dev/tty'
        self.pidfile_path = '/tmp/process_weather.pid'
        self.pidfile_timeout = 5

    def run(self):
        queue = Queue()
        for station in WeatherStation.objects.filter(active=True):
            p = Process(target=read_weather_data,
                        args=(station.name, station.ip_address, station.port,
                              queue,))
            p.start()
        while True:
            name, data = queue.get()
            print "Received data from ", name
            print data

app = App()
daemon_runner = runner.DaemonRunner(app)
daemon_runner.do_action()
I seem to have found a way to do this, but I'm unsure whether it is the best approach to take.
from daemon import runner
from multiprocessing import Process, Queue, Event
import telnetlib
from django.utils.encoding import force_text
from observations.weather.models import WeatherStation
import os
import signal
import errno

os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'

def read_weather_data(name, ip_address, port, queue, exit):
    print "Started process to get data for", name
    client = telnetlib.Telnet(ip_address, port)
    while not exit.is_set():
        response = client.read_until('\r\n'.encode('utf8'))
        queue.put((name, force_text(response)))
    print "exit called for", name
    client.close()

class App(object):
    def __init__(self):
        self.stdin_path = '/dev/null'
        self.stdout_path = '/dev/tty'
        self.stderr_path = '/dev/tty'
        self.pidfile_path = '/tmp/process_weather.pid'
        self.pidfile_timeout = 5

    def run(self):
        exit = Event()

        def exit_handler(signum, frame):
            print "exiting..."
            exit.set()

        signal.signal(signal.SIGTERM, exit_handler)
        queue = Queue()
        workers = []
        for station in WeatherStation.objects.filter(active=True):
            p = Process(target=read_weather_data,
                        args=(station.name, station.ip_address, station.port,
                              queue, exit))
            workers.append(p)
        for worker in workers:
            worker.start()
        while True:
            try:
                name, data = queue.get()
            except IOError as e:
                # we received a signal whilst waiting for I/O
                if e.errno != errno.EINTR:
                    raise
                else:
                    break
            print "Received data from ", name
            print data
        for worker in workers:
            worker.join()

app = App()
daemon_runner = runner.DaemonRunner(app)
daemon_runner.do_action()
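One refinement worth noting (my suggestion, not from the original post): Telnet.read_until() accepts a timeout argument, so each worker can re-check the exit event periodically instead of blocking forever on a quiet connection:

def read_weather_data(name, ip_address, port, queue, exit):
    client = telnetlib.Telnet(ip_address, port)
    while not exit.is_set():
        # with a timeout, read_until() returns whatever has arrived
        # (possibly nothing, or a partial line) after a second, so the
        # exit flag is checked regularly even when the station is silent
        response = client.read_until('\r\n'.encode('utf8'), timeout=1)
        if response:
            queue.put((name, force_text(response)))
    client.close()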

python can't start a new thread

I am building a multithreaded application.
I have set up a thread pool: a queue of size N, and N workers that get data from the queue.
When all tasks are done I call
tasks.join()
where tasks is the queue.
The application seems to run smoothly until, at some point (after 20 minutes, for example), it terminates with the error
thread.error: can't start new thread
Any ideas?
Edit: The threads are daemon threads, and the code looks like this:
while True:
    t0 = time.time()
    keyword_statuses = DBSession.query(KeywordStatus).filter(KeywordStatus.status==0).options(joinedload(KeywordStatus.keyword)).with_lockmode("update").limit(100)
    if keyword_statuses.count() == 0:
        DBSession.commit()
        break
    for kw_status in keyword_statuses:
        kw_status.status = 1
    DBSession.commit()
    t0 = time.time()
    w = SWorker(threads_no=32, network_server='http://192.168.1.242:8180/', keywords=keyword_statuses, cities=cities, saver=MySqlRawSave(DBSession), loglevel='debug')
    w.work()
print 'finished'
When are the daemon threads killed? When the application finishes, or when work() finishes?
Look at the thread pool and the worker (it's from a recipe):
from Queue import Queue
from threading import Thread, Event, current_thread
import time

event = Event()

class Worker(Thread):
    """Thread executing tasks from a given tasks queue"""
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        '''Start processing tasks from the queue'''
        while True:
            event.wait()
            #time.sleep(0.1)
            try:
                func, args, callback = self.tasks.get()
            except Exception, e:
                print str(e)
                return
            else:
                if callback is None:
                    func(args)
                else:
                    callback(func(args))
                self.tasks.task_done()

class ThreadPool:
    """Pool of threads consuming tasks from a queue"""
    def __init__(self, num_threads):
        self.tasks = Queue(num_threads)
        for _ in range(num_threads):
            Worker(self.tasks)

    def add_task(self, func, args=None, callback=None):
        '''Add a task to the queue'''
        self.tasks.put((func, args, callback))

    def wait_completion(self):
        '''Wait for completion of all the tasks in the queue'''
        self.tasks.join()

    def broadcast_block_event(self):
        '''blocks running threads'''
        event.clear()

    def broadcast_unblock_event(self):
        '''unblocks running threads'''
        event.set()

    def get_event(self):
        '''returns the event object'''
        return event
Also, maybe the problem is that I create SWorker objects in a loop? What happens to the old SWorker (garbage collection?)?
There is still not enough code to localize the problem, but I'm sure this is because you don't reuse your threads and start too many of them. Did you see the canonical example in the Queue module documentation, http://docs.python.org/library/queue.html (bottom of the page)?
I can reproduce your problem with the following code:
import threading
import Queue

q = Queue.Queue()

def worker():
    item = q.get(block=True)  # sleeps forever for now
    do_work(item)
    q.task_done()

# creates an infinite number of worker threads and fails
# after some time with "error: can't start new thread"
while True:
    t = threading.Thread(target=worker)
    t.start()

q.join()  # never reached
Instead, you must create a pool with a known number of threads and put your data into the queue, like this:
q = Queue()

def worker():
    while True:
        item = q.get()
        do_work(item)
        q.task_done()

for i in range(num_worker_threads):
    t = Thread(target=worker)
    t.daemon = True
    t.start()

for item in source():
    q.put(item)

q.join()  # block until all tasks are done
UPD: If you need to stop a thread, you can add a flag to it, or send a special sentinel value that means "stop" to break out of the while loop:
class Worker(Thread):
    break_msg = object()  # just a unique sentinel object

    def __init__(self):
        Thread.__init__(self)
        self.keep_running = True  # 'continue' is a reserved word in Python,
                                  # so the flag needs another name

    def run(self):
        while self.keep_running:  # can stop and destroy the thread (variant 1)
            msg = queue.get(block=True)
            if msg is self.break_msg:
                return  # will stop and destroy the thread (variant 2)
            do_work(msg)
            queue.task_done()

workers = [Worker() for _ in xrange(num_workers)]
for w in workers:
    w.start()

for task in tasks:
    queue.put(task)

for _ in xrange(num_workers):
    # stop the threads after all tasks are done; you need as many
    # sentinel messages as you have threads
    queue.put(Worker.break_msg)
OR
queue.join()  # wait until all tasks are done
for w in workers:
    w.keep_running = False
    queue.put(None)  # wake up any worker blocked on get()
