I am looking for a way to have a background thread queue run indefinitely. The code below is what I came up with from my research, but I am limited by the number of threads I create.
In my research I have not been able to figure out how to keep just one or two threads always available, running a set function whenever I add an item to the queue.
The goal of this sample app is to check the status of a site every 10 seconds; if the response code is not 200, run the code in the notify function.
What happens now is that the code works correctly until I hit the limit of threads I created, 5 in this case. The main while loop keeps working, but the code that needs to execute when there is a failure stops because there are no more threads.
import urllib2, time
from threading import Thread
from Queue import Queue
# Set up global variables
num_threads = 5
queue = Queue()
urlList = [
"http://google.com",
"http://googleeeeeee1111.com"
]
def notify(i, q):
print "Thread %s: started" % i
url = q.get()
print "Thread %s: notification sent for site: %s" % (i, url)
q.task_done()
def make_requests():
while True:
for url in urlList:
try:
request = urllib2.urlopen(url)
responseCode = request.getcode()
# If the response code was 200, do something
if responseCode == 200:
print "URL: %s - Success %d" % (url, responseCode)
else:
print "Bad response code for %s - %d " % (url, responseCode)
queue.put(url)
except Exception, e:
print "ERROR MAKING REQUEST TO %s - %s" % (url, e)
queue.put(url)
time.sleep(10) # wait 10 seconds and start again
if __name__ == '__main__':
# Set up some threads to fetch the enclosures
for i in range(num_threads):
worker = Thread(target=notify, args=(i, queue, ))
worker.setDaemon(True)
worker.start()
make_requests()
Before starting, see the documentation on Python's GIL and how it affects threads; for I/O-bound work like this, threads still work well, since the GIL is released while waiting on the network.
I'm not sure if this is what you're looking for, but you could wrap notify in a never-ending loop. I modified your code to do that, plus some small corrections that don't have much to do with the functionality:
import urllib2, time
from threading import Thread
from Queue import Queue
# Set up global variables
num_threads = 3 #you can set it to any number of threads (although 1 would be enough)
queue = Queue()
url_list = [
"http://google.com",
"http://googleeeeeee1111.com"
]
def notify(i, q):
url = q.get()
print "Thread %d: notification sent for site: %s" % (i, url)
q.task_done()
def thread_func(i, q):
print "Thread %d: started" % i
while True:
notify(i, q)
print "Thread %d: ending" % i
def make_requests():
while True:
for url in url_list:
try:
request = urllib2.urlopen(url)
response_code = request.getcode()
# If the response code was 200, do something
if response_code == 200:
print "URL: %s - Success %d" % (url, response_code)
else:
print "Bad response code for %s - %d " % (url, response_code)
queue.put(url)
except Exception as e:
print "ERROR MAKING REQUEST TO %s - %s" % (url, e)
queue.put(url)
time.sleep(10) # wait 10 seconds and start again
if __name__ == "__main__":
# Set up some threads to fetch the enclosures
for i in xrange(num_threads):
worker = Thread(target=thread_func, args=(i, queue))
worker.setDaemon(True)
worker.start()
make_requests()
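A note on the design: because Queue.get() blocks, the idle workers simply sleep until make_requests puts a URL on the queue, so even a single worker keeps up unless notify itself does slow work. If you are on Python 3, the same pattern only needs the renamed modules; a minimal sketch (the thread count and URLs are just placeholders):
import queue
import threading
import time
import urllib.request

work = queue.Queue()

def notify_worker(i, q):
    # Blocks on q.get(), so the thread sleeps until a URL arrives.
    while True:
        url = q.get()
        print("Thread %d: notification sent for site: %s" % (i, url))
        q.task_done()

def make_requests(url_list):
    while True:
        for url in url_list:
            try:
                code = urllib.request.urlopen(url).getcode()
                if code != 200:
                    work.put(url)
            except Exception as e:
                print("ERROR MAKING REQUEST TO %s - %s" % (url, e))
                work.put(url)
        time.sleep(10)  # wait 10 seconds and start again

if __name__ == "__main__":
    for i in range(2):
        threading.Thread(target=notify_worker, args=(i, work), daemon=True).start()
    make_requests(["http://google.com", "http://googleeeeeee1111.com"])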
I have a list with a lot of links and I want to use multiprocessing to speed up the process; here is a simplified version. I need the output ordered like this:
I tried a lot of things: Process, Pool, etc. I always got errors. I need to do it with 4 or 8 threads and keep the output ordered like this. Thank you for any help. Here is the code:
from bs4 import BeautifulSoup
import requests
import time
links = ["http://www.tennisexplorer.com/match-detail/?id=1672704",
         "http://www.tennisexplorer.com/match-detail/?id=1699387",
         "http://www.tennisexplorer.com/match-detail/?id=1698990",
         "http://www.tennisexplorer.com/match-detail/?id=1696623",
         "http://www.tennisexplorer.com/match-detail/?id=1688719",
         "http://www.tennisexplorer.com/match-detail/?id=1686305"]
data = []
def essa(match, omega):
aaa = BeautifulSoup(requests.get(match).text, "lxml")
center = aaa.find("div", id="center")
p1_l = center.find_all("th", class_="plName")[0].find("a").get("href")
p2_l = center.find_all("th", class_="plName")[1].find("a").get("href")
return p1_l + " - " + p2_l + " - " + str(omega)
i = 1
start_time = time.clock()
for link in links:
data.append(essa(link, i))
i += 1
for d in data:
print(d)
print(time.clock() - start_time, "seconds")
Spawn several threads of the function and join them together:
from threading import Thread
def essa(match, omega):
aaa = BeautifulSoup(requests.get(match).text, "lxml")
center = aaa.find("div", id="center")
p1_l = center.find_all("th", class_="plName")[0].find("a").get("href")
p2_l = center.find_all("th", class_="plName")[1].find("a").get("href")
print p1_l + " - " + p2_l + " - " + str(omega)
if __name__ == '__main__':
threadlist = []
for index, url in enumerate(links):
t= Thread(target=essa,args=(url, index))
t.start()
threadlist.append(t)
for b in threadlist:
b.join()
You won't get them to print in order, for the simple reason that some HTTP responses take longer than others.
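If ordered output matters more than streaming it as it arrives, a small variation works: store each result at its index and print everything after the threads have joined. A sketch, assuming the essa from the question (the version that returns the formatted string rather than printing it):
from threading import Thread

def worker(results, index, url):
    # essa() is the function from the question that returns the formatted string
    results[index] = essa(url, index + 1)

if __name__ == '__main__':
    results = [None] * len(links)
    threads = []
    for index, url in enumerate(links):
        t = Thread(target=worker, args=(results, index, url))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    for line in results:
        print(line)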
As far as I understand, you have a list of links and want to make the requests concurrently to speed up the process. Here is sample code for multithreading with concurrent.futures; I hope it helps. Read the documentation on concurrent futures for details.
import concurrent.futures
import urllib.request
URLS = ['http://www.foxnews.com/',
'http://www.cnn.com/',
'http://europe.wsj.com/',
'http://www.bbc.co.uk/',
'http://some-made-up-domain.com/']
# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
with urllib.request.urlopen(url, timeout=timeout) as conn:
return conn.read()
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
# Start the load operations and mark each future with its URL
future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
try:
data = future.result()
except Exception as exc:
print('%r generated an exception: %s' % (url, exc))
else:
print('%r page is %d bytes' % (url, len(data)))
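If you also need the results in the same order as URLS, Executor.map returns them in input order, which removes the as_completed bookkeeping. A sketch reusing load_url and URLS from the example above (note that map re-raises a worker's exception when its result is reached, so a failing URL will stop the loop unless you handle it):
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # map() yields results in the same order as URLS, unlike as_completed()
    results = executor.map(lambda url: load_url(url, 60), URLS)
    for url, data in zip(URLS, results):
        print('%r page is %d bytes' % (url, len(data)))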
I am on OS X and I am trying to exit the program by pressing CTRL+C.
However, even with a signal handler registered in the main thread, the program does not exit on CTRL+C while a thread is executing.
Here is a piece of code where I am trying to download 3 MP3 files from the internet, each in a separate thread.
import Queue
import urllib2
import os
import signal
import sys
import time
import threading
from socket import error as _SocketError
urls = ["http://broadcast.lds.org/churchmusic/MP3/1/2/nowords/271.mp3",
"http://s1.fans.ge/mp3/201109/08/John_Legend_So_High_Remix(fans_ge).mp3",
"http://megaboon.com/common/preview/track/786203.mp3"]
queue = Queue.Queue()
def do_exit(sigNum, stack):
# handle the received Unix signal and exit
sys.stderr.write("Received signal %d " % (sigNum))
raise SystemExit("Exiting")
class ThreadedFetch(threading.Thread):
""" docstring for ThreadedFetch
"""
def __init__(self, queue, count = 1):
super(ThreadedFetch, self).__init__()
self.queue = queue
def run(self):
while True:
# grabs url of link and path to saveTo and save to lst
host = self.queue.get()
# submit the url for download and location where to save.
self._downloadFile(host[0], host[1])
def _downloadFile(self, url, saveTo=None):
file_name = url.split('/')[-1]
self.setName("Parent_%s_thread" % file_name.split(".")[0])
if not saveTo:
saveTo = '/Users/sanjeevkumar/Desktop'
try:
u = urllib2.urlopen(url)
except urllib2.URLError , er:
print("%s for %s failed to download." % (er.reason, file_name))
self.queue.task_done()
print "Exiting: %s" % self.getName()
except _SocketError , err:
print("%s \n %s failed to download." % (err, file_name))
self.queue.task_done()
else:
th = threading.Thread(
target=self._fileWriteToDisk,
args=(saveTo, u, file_name),
name="fileWrite_Child_of_%s" % self.getName(),
)
# if the user clicks close while the thread is still running,
# then the programme will wait till the save is done,
# then it will close.
th.daemon = False
th.start()
time.sleep(0.1)
print "Writing to disk using child: %s " % th.name
def _fileWriteToDisk(self, saveTo, urlObject, file_name):
path = os.path.join(saveTo, file_name)
try:
f = open(path, 'wb')
except IOError , er:
self.queue.task_done()
print er
return
meta = urlObject.info()
file_size = int(meta.getheaders("Content-Length")[0])
print "Downloading: %s : %s " % (file_name, file_size)
file_size_dl = 0
block_sz = 8192
while True:
buffer = urlObject.read(block_sz)
if not buffer:
break
file_size_dl += len(buffer)
f.write(buffer)
status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
status = status + chr(8)*(len(status)+1)
sys.stdout.write('%s\r' % status)
time.sleep(.05)
sys.stdout.flush()
if file_size_dl == file_size:
print r"Download Completed %s%% for file %s, saved to %s" % (file_size_dl * 100. / file_size, file_name, saveTo)
f.close()
# signals to queue job is done
self.queue.task_done()
def main():
# Register signal in main thread
signal.signal(signal.SIGINT, do_exit)
try:
# spawn a pool of threads, and pass them queue instance
for i in range(len(urls)):
t = ThreadedFetch(queue)
t.setDaemon(True)
time.sleep(0.1)
t.start()
urls_saveTo = {urls[0]: None, urls[1]: None, urls[2]: None}
# populate queue with data
for item, value in urls_saveTo.iteritems():
queue.put([item, value])
# wait on the queue until everything has been processed
queue.join()
print '*** Done'
except (KeyboardInterrupt, SystemExit):
print '\n! Received keyboard interrupt, quitting threads.\n'
if __name__ == "__main__":
main()
Feel free to point out anything else that could be improved strategically.
Any help would be greatly appreciated.
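One thing worth trying, as a sketch rather than a full fix: on Python 2 the main thread will not get a chance to run the SIGINT handler while it is blocked inside queue.join(), so replacing that call with a short polling loop keeps the main thread responsive to CTRL+C:
# instead of queue.join(), poll so the main thread can still handle SIGINT;
# Queue.unfinished_tasks is an implementation detail, used here only for the sketch
while queue.unfinished_tasks:
    time.sleep(0.5)
print '*** Done'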
I've read many answers, but I have not found a proper solution.
The problem: I'm reading mixed/replace HTTP streams that will not expire or end by default.
You can try it by yourself using curl:
curl http://agent.mtconnect.org/sample\?interval\=0
So, now I'm using Python threads and requests to read data from multiple streams.
import requests
import uuid
from threading import Thread
tasks = ['http://agent.mtconnect.org/sample?interval=5000',
'http://agent.mtconnect.org/sample?interval=10000']
thread_id = []
def http_handler(thread_id, url, flag):
print 'Starting task %s' % thread_id
try:
requests_stream = requests.get(url, stream=True, timeout=2)
for line in requests_stream.iter_lines():
if line:
print line
if flag and line.endswith('</MTConnectStreams>'):
# Wait until XML message end is reached to receive the full message
break
except requests.exceptions.RequestException as e:
print('error: ', e)
except BaseException as e:
print e
if __name__ == '__main__':
for task in tasks:
uid = str(uuid.uuid4())
thread_id.append(uid)
t = Thread(target=http_handler, args=(uid, task, False), name=uid)
t.start()
print thread_id
# Wait Time X or until user is doing something
# Send flag = to desired thread to indicate the loop should stop after reaching the end.
Any suggestions? What is the best solution? I don't want to kill the thread, because I would like to read to the end of the current message so that I have a complete XML document.
I found a solution using the threading module and threading.Event. It may not be the best solution, but it works fine for now.
import logging
import threading
import time
import uuid
import requests
logging.basicConfig(level=logging.DEBUG, format='(%(threadName)-10s) %(message)s', )
tasks = ['http://agent.mtconnect.org/sample?interval=5000',
'http://agent.mtconnect.org/sample?interval=10000']
d = dict()
def http_handler(e, url):
logging.debug('wait_for_event starting')
message_buffer = []
filter_namespace = True
try:
requests_stream = requests.get(url, stream=True, timeout=2)
for line in requests_stream.iter_lines():
if line:
message_buffer.append(line)
if e.isSet() and line.endswith('</MTConnectStreams>'):
logging.debug(len(message_buffer))
break
except requests.exceptions.RequestException as e:
print('error: ', e)
except BaseException as e:
print e
if __name__ == '__main__':
logging.debug('Waiting before calling Event.set()')
for task in tasks:
uid = str(uuid.uuid4())
e = threading.Event()
d[uid] = {"stop_event": e}
t = threading.Thread(name=uid,
target=http_handler,
args=(e, task))
t.start()
logging.debug('Waiting 3 seconds before calling Event.set()')
for key in d:
time.sleep(3)
logging.debug(threading.enumerate())
logging.debug(d[key])
d[key]['stop_event'].set()
logging.debug('bye')
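A small follow-up to the sketch above: the main thread can also join the worker threads explicitly, so the final log line is only printed after every stream has delivered its last complete message. A snippet that could go just before that last logging.debug call (threading.enumerate() includes the main thread, so it is skipped):
    # wait for every stream thread to read up to its closing </MTConnectStreams>
    for t in threading.enumerate():
        if t is not threading.current_thread():
            t.join()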
I'm reading XML events with the requests library, as shown in the code below. How do I raise a connection-lost error once the request has started? The server is emulating HTTP push / long polling (http://en.wikipedia.org/wiki/Push_technology#Long_polling) and the response will not end by default.
If there is no new message for 10 minutes, the while loop should be exited.
import requests
from time import time
if __name__ == '__main__':
#: Set a default content-length
content_length = 512
try:
requests_stream = requests.get('http://agent.mtconnect.org:80/sample?interval=0', stream=True, timeout=2)
while True:
start_time = time()
#: Read three lines to determine the content-length
for line in requests_stream.iter_lines(3, decode_unicode=None):
if line.startswith('Content-length'):
content_length = int(''.join(x for x in line if x.isdigit()))
#: pause the generator
break
#: Continue the generator and read the exact amount of the body.
for xml in requests_stream.iter_content(content_length):
print "Received XML document with content length of %s in %s seconds" % (len(xml), time() - start_time)
break
except requests.exceptions.RequestException as e:
print('error: ', e)
The server push could be tested with curl via command line:
curl http://agent.mtconnect.org:80/sample\?interval\=0
This might not be the best method, but you can use multiprocessing to run the requests in a separate process.
Something like this should work:
import multiprocessing
import requests
import time
class RequestClient(multiprocessing.Process):
def run(self):
# Write all your code to process the requests here
content_length = 512
try:
requests_stream = requests.get('http://agent.mtconnect.org:80/sample?interval=0', stream=True, timeout=2)
start_time = time.time()
for line in requests_stream.iter_lines(3, decode_unicode=None):
if line.startswith('Content-length'):
content_length = int(''.join(x for x in line if x.isdigit()))
break
for xml in requests_stream.iter_content(content_length):
print "Received XML document with content length of %s in %s seconds" % (len(xml), time.time() - start_time)
break
except requests.exceptions.RequestException as e:
print('error: ', e)
while True:
childProcess = RequestClient()
childProcess.start()
# Wait for 10mins
start_time = time.time()
while time.time() - start_time <= 600:
# Check if the process is still active
if not childProcess.is_alive():
# Request completed
break
time.sleep(5) # Give the system some breathing time
# Check if the process is still active after 10mins.
if childProcess.is_alive():
# Shutdown the process
childProcess.terminate()
raise RuntimeError("Connection Timed-out")
Not the perfect code for your problem, but you get the idea.
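A lighter-weight alternative that might be enough, depending on your requests version: when streaming, the read part of the timeout applies to each socket read, so a (connect, read) timeout tuple of (5, 600) makes the iteration raise if the server stays silent for more than 10 minutes. A sketch under that assumption:
import requests

try:
    # timeout=(connect, read): the read timeout applies to each chunk while streaming
    requests_stream = requests.get('http://agent.mtconnect.org:80/sample?interval=0',
                                   stream=True, timeout=(5, 600))
    for line in requests_stream.iter_lines():
        if line:
            print(line)
except requests.exceptions.RequestException as e:
    print('connection lost or no data for 10 minutes: %s' % e)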
The purpose of my program is to download a file with threads. I define a unit size and use len/unit threads, where len is the length of the file to be downloaded.
With my program the file does get downloaded, but the threads do not stop. I can't find the reason why.
This is my code...
#! /usr/bin/python
import urllib2
import threading
import os
from time import ctime
class MyThread(threading.Thread):
def __init__(self,func,args,name=''):
threading.Thread.__init__(self);
self.func = func;
self.args = args;
self.name = name;
def run(self):
apply(self.func,self.args);
url = 'http://ubuntuone.com/1SHQeCAQWgIjUP2945hkZF';
request = urllib2.Request(url);
response = urllib2.urlopen(request);
meta = response.info();
response.close();
unit = 1000000;
flen = int(meta.getheaders('Content-Length')[0]);
print flen;
if flen%unit == 0:
bs = flen/unit;
else :
bs = flen/unit+1;
blocks = range(bs);
cnt = {};
for i in blocks:
cnt[i]=i;
def getStr(i):
try:
print 'Thread %d start.'%(i,);
fout = open('a.zip','wb');
fout.seek(i*unit,0);
if (i+1)*unit > flen:
request.add_header('Range','bytes=%d-%d'%(i*unit,flen-1));
else :
request.add_header('Range','bytes=%d-%d'%(i*unit,(i+1)*unit-1));
#opener = urllib2.build_opener();
#buf = opener.open(request).read();
resp = urllib2.urlopen(request);
buf = resp.read();
fout.write(buf);
except BaseException:
print 'Error';
finally :
#opener.close();
fout.flush();
fout.close();
del cnt[i];
# filelen = os.path.getsize('a.zip');
print 'Thread %d ended.'%(i),
print cnt;
# print 'progress : %4.2f'%(filelen*100.0/flen,),'%';
def main():
print 'download at:',ctime();
threads = [];
for i in blocks:
t = MyThread(getStr,(blocks[i],),getStr.__name__);
threads.append(t);
for i in blocks:
threads[i].start();
for i in blocks:
# print 'this is the %d thread;'%(i,);
threads[i].join();
#print 'size:',os.path.getsize('a.zip');
print 'download done at:',ctime();
if __name__=='__main__':
main();
Could someone please help me understand why the threads aren't stopping?
I can't really address your code example because it is quite messy and hard to follow, but a potential reason you are seeing the threads not end is that a request will stall out and never finish. urllib2 allows you to specify timeouts for how long you will allow the request to take.
What I would recommend for your own code is that you split your work up into a queue, start a fixed number of threads (instead of a variable number), and let the worker threads pick up work until it is done. Give the HTTP requests a timeout. If the timeout expires, try again or put the work back into the queue.
Here is a generic example of how to use a queue, a fixed number of workers and a sync primitive between them:
import threading
import time
from Queue import Queue
def worker(queue, results, lock):
local_results = []
while True:
val = queue.get()
if val is None:
break
# pretend to do work
time.sleep(.1)
local_results.append(val)
with lock:
results.extend(local_results)
print threading.current_thread().name, "Done!"
num_workers = 4
threads = []
queue = Queue()
lock = threading.Lock()
results = []
for i in xrange(100):
queue.put(i)
for _ in xrange(num_workers):
# Use None as a sentinel to signal the threads to end
queue.put(None)
t = threading.Thread(target=worker, args=(queue,results,lock))
t.start()
threads.append(t)
for t in threads:
t.join()
print sorted(results)
print "All done"