How to quit the program when all the threads have finished? - python

#!/usr/bin/env python
import threading
import urllib, sys, os
import Queue

concurrent = 200
queue = Queue.Queue(concurrent * 2)
try:
    aim = sys.argv[1].lower()
    dic = open(sys.argv[2], 'r')
except:
    print "Usage: %s url wordlist" % sys.argv[0]
    sys.exit(1)

class Scanner(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            self.path = self.queue.get()
            self.geturl = urllib.urlopen(aim + '/' + self.path)
            self.status = self.geturl.getcode()
            self.url = aim + self.path
            self.result = self.url + '=>' + str(self.status)
            print self.result
            self.writeresult(self.result)
            self.queue.task_done()

    def writeresult(self, result):
        fp = open('result.txt', 'a+')
        fp.write(result + '\n')
        fp.close()

def main():
    for i in range(concurrent):
        t = Scanner(queue)
        t.setDaemon(True)
        t.start()
    for path in dic.readlines():
        queue.put(path.strip())
    queue.join()

if __name__ == '__main__':
    main()
This is a Python program that scans the directories of a website. When the scanning finishes, the program does not quit, not even with Ctrl+C.
I want to know how to make the program quit automatically once the scan is done.
Also, while it is running, errors like this sometimes appear:
Exception in thread Thread-130:
Traceback (most recent call last):
File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 551, in __bootstrap_inner
self.run()
File "tt.py", line 28, in run
self.geturl = urllib.urlopen(aim+'/'+self.path)
File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 86, in urlopen
return opener.open(url)
File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 207, in open
return getattr(self, name)(url)
File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 344, in open_http
h.endheaders(data)
File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 954, in endheaders
self._send_output(message_body)
File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 814, in _send_output
self.send(msg)
File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 776, in send
self.connect()
File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 757, in connect
self.timeout, self.source_address)
File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.py", line 553, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
IOError: [Errno socket error] [Errno 8] nodename nor servname provided, or not known

I wanted some practice, so I tried this out and changed a lot. Does it get you a full set of results? You will need to replace paths with your original argument reading.
With 200 threads, you may be getting unhandled exceptions that result in missing results, so I added a mechanism to catch any errors during reading and pass them to the result writer.
Appending to a file from multiple threads is probably OK, but I added a dedicated writer thread to manage the file more cleanly.
Most of the assignments to self were unnecessary.
If you still get socket errors, check the paths in the result file and decide how you want to handle those results, if at all.
I'm no expert, so don't take this as best practice.
import threading
import urllib
import Queue

concurrent = 5
aim = 'http://edition.cnn.com'
paths = ['2013/10/12/opinion/kazin-tea-party/index.html?hpt=hp_t5',
         '2013/10/11/opinion/opinion-hay-nobel-opcw/index.html?hpt=hp_t5',
         '2013/10/11/opinion/rosin-women-in-charge/index.html?hpt=hp_t5',
         'some invalid path',
         '2013']  # also an invalid path

def main():
    work_q = Queue.Queue()
    result_q = Queue.Queue()

    # start the scanners and the result writer
    scanners = [Scanner(work_q, result_q) for i in range(concurrent)]
    for s in scanners:
        s.start()
    results_file_path = 'results.txt'
    result_writer = ResultWriter(result_q, results_file_path)
    result_writer.start()

    # send all the work and wait for it to be completed
    for path in paths:
        work_q.put(path.strip())
    work_q.join()

    # tell everyone to stop
    # you could just kill the threads, but your writer needs to close the file
    for s in scanners:
        work_q.put(Scanner.STOP_TOKEN)
    result_q.put(ResultWriter.STOP_TOKEN)  # make sure the file gets closed

    # wait for everyone to actually stop
    for s in scanners:
        s.join()
    result_writer.join()

    print 'the scan has finished and results are in {}'.format(results_file_path)

class Scanner(threading.Thread):
    STOP_TOKEN = '<<stop>>'

    def __init__(self, work_q, result_q):
        threading.Thread.__init__(self)
        self.work_q = work_q
        self.result_q = result_q

    def run(self):
        while True:
            path = status = None  # reset in case of error
            try:
                try:
                    path = self.work_q.get(timeout=0.00001)
                except Queue.Empty:
                    continue
                if path == self.STOP_TOKEN:
                    break  # stop looking for work
                get_url = urllib.urlopen(aim + '/' + path)
                status = get_url.getcode()
            except Exception as e:
                status = 'unhandled error ({})'.format(e)
            self.result_q.put((path, status))
            self.work_q.task_done()

class ResultWriter(threading.Thread):
    STOP_TOKEN = '<<stop>>'

    def __init__(self, result_q, results_file_path):
        threading.Thread.__init__(self)
        self.result_q = result_q
        self.results_file_path = results_file_path

    def run(self):
        with open(self.results_file_path, 'w') as results_file:
            while True:
                try:
                    result = self.result_q.get(timeout=0.00001)
                except Queue.Empty:
                    continue
                if result == self.STOP_TOKEN:
                    break  # stop looking for results
                path, status = result
                results_file.write('{}=>{}\n'.format(path, status))

if __name__ == '__main__':
    main()
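A note on one design choice above: the very short get(timeout=0.00001) calls make the threads poll in a tight loop. Since the stop token arrives through the same queue as the work, a plain blocking get() would do; a minimal sketch of the scanner loop under that assumption:

def run(self):
    while True:
        path = self.work_q.get()  # block until work or the stop token arrives
        if path == self.STOP_TOKEN:
            break  # stop tokens are sent after join() returns, so no task_done needed
        try:
            status = urllib.urlopen(aim + '/' + path).getcode()
        except Exception as e:
            status = 'unhandled error ({})'.format(e)
        self.result_q.put((path, status))
        self.work_q.task_done()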

The program, as it is, will close when all the threads have finished.
To easily get rid of all those errors, in the run method of your class, wrap everything that follows while True: in a try/except clause, like this:
try:
    code
except:
    pass
It's not exactly the cleanest way to do it, but considering what you are after, it will do the job and will get rid of those exceptions, which, by the way, mean that some URLs have timed out.
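For reference, a minimal sketch of what that looks like in the question's Scanner.run. Note that task_done() is deliberately kept outside the except: in the original code an exception skipped task_done(), so queue.join() waited forever, which is why the program never quit:

def run(self):
    while True:
        path = self.queue.get()
        try:
            status = urllib.urlopen(aim + '/' + path).getcode()
            result = aim + path + '=>' + str(status)
            print result
            self.writeresult(result)
        except:
            pass  # swallow socket errors, timeouts, etc.
        self.queue.task_done()  # always acknowledge, or queue.join() never returns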

Related

Gracefully terminate multiprocessing based program

I am working on a Python service that spawns a Process to handle the workload. Since I don't know at the start of the service how many workers I need, I chose not to use Pool. The following is a simplified version:
import multiprocessing as mp
import time
from datetime import datetime

def _print(s):  # just my cheap logging utility
    print(f'{datetime.now()} - {s}')

def run_in_process(q, evt):
    _print(f'starting process job')
    while not evt.is_set():  # True
        try:
            x = q.get(timeout=2)
            _print(f'received {x}')
        except:
            _print(f'timed-out')

if __name__ == '__main__':
    with mp.Manager() as manager:
        q = manager.Queue()
        evt = manager.Event()

        p = mp.Process(target=run_in_process, args=(q, evt))
        p.start()
        time.sleep(2)

        data = 100
        while True:
            try:
                q.put(data)
                time.sleep(0.5)
                data += 1
                if data > 110:
                    break
            except KeyboardInterrupt:
                _print('finishing...')
                # p.terminate()
                break
        time.sleep(3)
        _print('setting event 0')
        evt.set()
        _print('joining process')
        p.join()
        _print('done')
The program works and exits gracefully, without any error messages. However, if I use Ctrl-C before I have all 10 events processed, I get the following error before it exits.
2022-04-01 12:41:06.866484 - received 101
2022-04-01 12:41:07.367628 - received 102
^C2022-04-01 12:41:07.507805 - timed-out
2022-04-01 12:41:07.507886 - finishing...
Process Process-2:
Traceback (most recent call last):
File "/<path-omitted>/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File "/<path-omitted>/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "mp.py", line 10, in run_in_process
while not evt.is_set(): # True
File "/<path-omitted>/python3.7/multiprocessing/managers.py", line 1088, in is_set
return self._callmethod('is_set')
File "/<path-omitted>/python3.7/multiprocessing/managers.py", line 819, in _callmethod
kind, result = conn.recv()
File "/<path-omitted>/python3.7/multiprocessing/connection.py", line 250, in recv
buf = self._recv_bytes()
File "/<path-omitted>/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/<path-omitted>/python3.7/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
2022-04-01 12:41:10.511334 - setting event 0
Traceback (most recent call last):
File "mp.py", line 42, in <module>
evt.set()
File "/<path-omitted>/python3.7/multiprocessing/managers.py", line 1090, in set
return self._callmethod('set')
File "/<path-omitted>/python3.7/multiprocessing/managers.py", line 818, in _callmethod
conn.send((self._id, methodname, args, kwds))
File "/<path-omitted>/python3.7/multiprocessing/connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
File "/<path-omitted>/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
self._send(header + buf)
File "/<path-omitted>/python3.7/multiprocessing/connection.py", line 368, in _send
n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
A few observations:
The double error message looks exactly the same as when I press Ctrl-C in my actual project, so I think this is a good representation of my problem.
If I add p.terminate(), the behavior doesn't change when the program is left to finish by itself. But if I press Ctrl-C halfway through, I encounter the error message only once; I guess it's from the main thread/process.
If I change while not evt.is_set(): in run_in_process to an infinite loop (while True:) and let the program run its course, I keep seeing the periodic time-out prints, which makes sense. What I don't understand is that, if I press Ctrl-C, the terminal starts spewing time-out messages with no gap between them. What happened?
My ultimate question is: what is the correct way to construct this program so that when Ctrl-C is used (or a termination signal is sent to the program, for that matter), it stops gracefully?
I found a solution to this problem myself using the signal module.
The idea is to set up a signal handler to catch specific signals, such as signal.SIGINT and signal.SIGTERM.
import multiprocessing as mp
from threading import Event
import signal

if __name__ == '__main__':
    main_evt = Event()

    def stop_main_handler(signum, frame):
        if not main_evt.is_set():
            main_evt.set()

    signal.signal(signal.SIGINT, stop_main_handler)

    with mp.Manager() as manager:
        # creating mp queue, event and process
        q = manager.Queue()
        evt = manager.Event()
        p = mp.Process(target=..., args=(q, evt))
        p.start()

        while not main_evt.is_set():
            ...  # processing data

        # cleanup
        evt.set()
        p.join()
Or you can wrap it in an object-oriented fashion:
import signal
import time
from threading import Event

class SignalCatcher(object):
    def __init__(self):
        self._main_evt = Event()
        # register the handler at creation time
        signal.signal(signal.SIGINT, self._stop_handler)
        signal.signal(signal.SIGTERM, self._stop_handler)

    def _stop_handler(self, signum, frame):
        if not self._main_evt.is_set():
            self._main_evt.set()

    def block_until_signaled(self):
        while not self._main_evt.is_set():
            time.sleep(2)
Then you can use it as follows:
if __name__ == '__main__':
    sc = SignalCatcher()
    # this has to be outside the with-context. It seems another process is
    # created by the multiprocessing library; if you put the sc creation
    # inside the with-context, it fails to signal each process.
    with mp.Manager() as manager:
        # creating process and starting it
        # ...
        sc.block_until_signaled()
        # cleanup
        # ...
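Putting the pieces together, a minimal runnable sketch under the same assumptions; the worker body and the queue traffic are placeholders, not the original workload:

import multiprocessing as mp
import signal
import time
from threading import Event

class SignalCatcher(object):
    def __init__(self):
        self._main_evt = Event()
        signal.signal(signal.SIGINT, self._stop_handler)
        signal.signal(signal.SIGTERM, self._stop_handler)

    def _stop_handler(self, signum, frame):
        self._main_evt.set()

    def block_until_signaled(self):
        while not self._main_evt.is_set():
            time.sleep(2)

def worker(q, evt):
    while not evt.is_set():
        try:
            print(f'received {q.get(timeout=2)}')
        except Exception:
            pass  # queue timeout; loop around and re-check the event

if __name__ == '__main__':
    sc = SignalCatcher()  # created before the Manager, so child processes inherit the handler
    with mp.Manager() as manager:
        q = manager.Queue()
        evt = manager.Event()
        p = mp.Process(target=worker, args=(q, evt))
        p.start()
        sc.block_until_signaled()  # returns after Ctrl-C or SIGTERM
        evt.set()                  # ask the worker to finish
        p.join()

Because the handler is installed before the Manager and worker are spawned, Ctrl-C no longer raises KeyboardInterrupt inside them mid-IPC, which is what produced the ConnectionResetError/BrokenPipeError pair above.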

How to send a message from the server to the client in a web socket

I'm writing a small program that receives messages from RabbitMQ and forwards them to the WebSocket clients registered with the same index. The problem is sending the message to the client: I'm reusing code that worked on Python 3.6.9 (I don't remember the Tornado version, the websocket library I use), but I changed machines and reinstalled everything, so I now have the newest versions of Python and the library.
I'll post my old code because it's easier to understand, and it produces the same error.
import tornado.web
import tornado.websocket
import tornado.ioloop
import threading
import pika
import json

def verificar_novo(se):
    for i in range(0, len(conexao_lista)):
        if se == conexao_lista[i]["endereco"]:
            return 0
    return 1

def excluir_conexao(endereco):
    for i in range(0, len(conexao_lista)):
        if conexao_lista[i]["endereco"] == endereco:
            del(conexao_lista[i])
            break

""" Function that fetches messages from RabbitMQ and returns them to the clients """
def callback(ch, method, properties, body):
    menssagem_rabbit = json.loads(body)
    threading.Lock()
    for i in range(0, len(conexao_lista)):
        if (conexao_lista[i]["configuracao"]["ras_eve_id_indice"]) == (menssagem_rabbit["ras_eve_id_indice"]):
            conexao_lista[i]["endereco"].write_message(menssagem_rabbit)
            break
    threading.RLock()

""" Client connection class """
class WebSocketHandler(tornado.websocket.WebSocketHandler):
    def open(self):
        print("New client connected")

    def on_close(self):
        print("Client disconnected")
        excluir_conexao(self)

    def on_message(self, message):
        n = verificar_novo(self)
        if n == 0:
            self.write_message(u"Your message: " + message)
        else:
            dados_json = json.loads(message)
            conexao_dicionario["endereco"] = self
            conexao_dicionario["configuracao"] = dados_json
            conexao_lista.append(conexao_dicionario.copy())
            self.write_message(u"User connected " + dados_json["id_usuario"])

    def check_origin(self, origin):
        return True

""" Function the thread runs to consume messages in the background """
def procurar_mensagens():
    connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
    channel = connection.channel()
    channel.basic_consume(queue='testerenan', on_message_callback=callback, auto_ack=True)
    channel.start_consuming()

""" Variables """
conexao_lista = []
conexao_dicionario = {"endereco": "", "configuracao": ""}

""" Starting the thread """
threading.Thread(target=procurar_mensagens, args=()).start()

""" WebSocket connection """
application = tornado.web.Application([(r"/", WebSocketHandler),])

if __name__ == "__main__":
    application.listen(8888)
    tornado.ioloop.IOLoop.instance().start()
The error that appears:
Exception in thread Thread-1:
Traceback (most recent call last):
File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/usr/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/renan/Área de Trabalho/Projeto-WebSocket/Servidor.py", line 63, in procurar_mensagens
channel.start_consuming()
File "/usr/local/lib/python3.8/dist-packages/pika/adapters/blocking_connection.py", line 1866, in start_consuming
self._process_data_events(time_limit=None)
File "/usr/local/lib/python3.8/dist-packages/pika/adapters/blocking_connection.py", line 2027, in _process_data_events
self.connection.process_data_events(time_limit=time_limit)
File "/usr/local/lib/python3.8/dist-packages/pika/adapters/blocking_connection.py", line 834, in process_data_events
self._dispatch_channel_events()
File "/usr/local/lib/python3.8/dist-packages/pika/adapters/blocking_connection.py", line 566, in _dispatch_channel_events
impl_channel._get_cookie()._dispatch_events()
File "/usr/local/lib/python3.8/dist-packages/pika/adapters/blocking_connection.py", line 1493, in _dispatch_events
consumer_info.on_message_callback(self, evt.method,
File "/home/renan/Área de Trabalho/Projeto-WebSocket/Servidor.py", line 26, in callback
conexao_lista[i]["endereco"].write_message(menssagem_rabbit)
File "/home/renan/.local/lib/python3.8/site-packages/tornado/websocket.py", line 342, in write_message
return self.ws_connection.write_message(message, binary=binary)
File "/home/renan/.local/lib/python3.8/site-packages/tornado/websocket.py", line 1098, in write_message
fut = self._write_frame(True, opcode, message, flags=flags)
File "/home/renan/.local/lib/python3.8/site-packages/tornado/websocket.py", line 1075, in _write_frame
return self.stream.write(frame)
File "/home/renan/.local/lib/python3.8/site-packages/tornado/iostream.py", line 555, in write
future = Future() # type: Future[None]
File "/usr/lib/python3.8/asyncio/events.py", line 639, in get_event_loop
raise RuntimeError('There is no current event loop in thread %r.'
RuntimeError: There is no current event loop in thread 'Thread-1'.
I'll leave the project for download if anyone wants to take a look:
https://github.com/Renan-Sacca/Projeto-WebSocket
In general, you have to be very careful when mixing threads and Tornado: you can't call most Tornado methods from other threads (this has always been true, but the library got stricter about enforcing it in Tornado 5.0). In particular, this includes write_message. So in callback, instead of calling write_message, you have to ask the IOLoop to call it for you.
In the main block, do global main_io_loop; main_io_loop = IOLoop.current() to save the main thread's IOLoop so you can refer to it later. Then in callback, replace the call to write_message with
main_io_loop.add_callback(conexao_lista[i]["endereco"].write_message, menssagem_rabbit)
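For illustration, a sketch of the relevant parts with those two changes applied, keeping the names from the question's code:

from tornado.ioloop import IOLoop

def callback(ch, method, properties, body):
    menssagem_rabbit = json.loads(body)
    for conexao in conexao_lista:
        if conexao["configuracao"]["ras_eve_id_indice"] == menssagem_rabbit["ras_eve_id_indice"]:
            # schedule write_message on the main thread's IOLoop instead of
            # calling it directly from the pika consumer thread
            main_io_loop.add_callback(conexao["endereco"].write_message, menssagem_rabbit)
            break

if __name__ == "__main__":
    global main_io_loop
    main_io_loop = IOLoop.current()  # saved while still on the main thread
    application.listen(8888)
    main_io_loop.start()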

Try block not catching - Am I making inadvertent internet access?

I accidentally disconnected my internet connection and received the error below. However, why did this line trigger the error?
self.content += tuple(subreddit_posts)
Or perhaps I should ask, why did the following line not lead to a sys.exit? It seems it should catch all errors:
try:
    subreddit_posts = self.r.get_content(url, limit=10)
except:
    print '*** Could not connect to Reddit.'
    sys.exit()
Does this mean I am inadvertently hitting reddit's network twice?
FYI, praw is a Reddit API client, and get_content() fetches a subreddit's posts/submissions as a generator object.
The error message:
Traceback (most recent call last):
File "beam.py", line 49, in <module>
main()
File "beam.py", line 44, in main
scan.scanNSFW()
File "beam.py", line 37, in scanNSFW
map(self.getSub, self.nsfw)
File "beam.py", line 26, in getSub
self.content += tuple(subreddit_posts)
File "/Library/Python/2.7/site-packages/praw/__init__.py", line 504, in get_co
page_data = self.request_json(url, params=params)
File "/Library/Python/2.7/site-packages/praw/decorators.py", line 163, in wrap
return_value = function(reddit_session, *args, **kwargs)
File "/Library/Python/2.7/site-packages/praw/__init__.py", line 557, in reques
retry_on_error=retry_on_error)
File "/Library/Python/2.7/site-packages/praw/__init__.py", line 399, in _reque
_raise_response_exceptions(response)
File "/Library/Python/2.7/site-packages/praw/internal.py", line 178, in _raise
response.raise_for_status()
File "/Library/Python/2.7/site-packages/requests/models.py", line 831, in rais
raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 503 Server Error: Service Unavailable
The script (it's short):
import sys, os, pprint, praw

class Scanner(object):
    ''' A scanner object. '''
    def __init__(self):
        self.user_agent = 'debian.22990.myapp'
        self.r = praw.Reddit(user_agent=self.user_agent)
        self.nsfw = ('funny', 'nsfw')
        self.nsfw_posters = set()
        self.content = ()

    def getSub(self, subreddit):
        ''' Accepts a subreddit. Connects to subreddit and retrieves content.
        Unpacks generator object containing content into tuple. '''
        url = 'http://www.reddit.com/r/{sub}/'.format(sub=subreddit)
        print 'Scanning:', subreddit
        try:
            subreddit_posts = self.r.get_content(url, limit=10)
        except:
            print '*** Could not connect to Reddit.'
            sys.exit()
        print 'Constructing list.',
        self.content += tuple(subreddit_posts)
        print 'Done.'

    def addNSFWPoster(self, post):
        print 'Parsing author and adding to posters.'
        self.nsfw_posters.add(str(post.author))

    def scanNSFW(self):
        ''' Scans all NSFW subreddits. Makes list of posters. '''
        # Get content from all nsfw subreddits
        print 'Executing map function.'
        map(self.getSub, self.nsfw)
        # Scan content and get authors
        print 'Executing list comprehension.'
        [self.addNSFWPoster(post) for post in self.content]

def main():
    scan = Scanner()
    scan.scanNSFW()
    for i in scan.nsfw_posters:
        print i
    print len(scan.content)

main()
It looks like praw fetches objects lazily, so the request is only made when you actually consume subreddit_posts, which explains why it blows up on that line rather than inside the try block.
See: https://praw.readthedocs.org/en/v2.1.20/pages/lazy-loading.html
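So to actually catch the network error, the generator has to be consumed inside the try block; a minimal sketch of getSub with that change:

def getSub(self, subreddit):
    url = 'http://www.reddit.com/r/{sub}/'.format(sub=subreddit)
    print 'Scanning:', subreddit
    try:
        subreddit_posts = self.r.get_content(url, limit=10)
        # the HTTP request only happens when the generator is consumed,
        # so the tuple() call must be inside the try as well
        new_content = tuple(subreddit_posts)
    except:
        print '*** Could not connect to Reddit.'
        sys.exit()
    self.content += new_content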

Getting error: [Errno 10053] while trying to send a file in http response

I am trying to send a big file in an HTTP response by writing to the wfile attribute of BaseHTTPRequestHandler in Python, but when I do, I always end up with the exception below in my Python code.
error: [Errno 10053] An established connection was aborted by the software in your machine
Can anyone help me resolve this? Why am I getting the error?
If sending a large file in an HTTP response this way is not a good approach, please suggest where I can look instead.
Thanks in advance!
import os
import urlparse
import BaseHTTPServer
from SocketServer import ThreadingMixIn

class Handler(BaseHTTPServer.BaseHTTPRequestHandler):
    def handle(self):
        BaseHTTPServer.BaseHTTPRequestHandler.handle(self)

    def sendError(self, errorCode, errorMessage):
        self.send_response(errorCode, errorMessage)
        self.send_header("Content-type", "text/plain")
        self.send_header("Content-Length", str(len(errorMessage)))
        self.end_headers()
        self.wfile.write(errorMessage)

    def do_GET(self):
        scm, netloc, path, params, query, fragment = urlparse.urlparse(self.path, 'http')
        if path.find(".ld") > 0:
            filename = path.rpartition("/")[2]
            try:
                with open(filename, 'rb') as f:
                    self.send_response(200, "Ok")
                    self.send_header("Content-type", "application/octet-stream")
                    total_size = os.path.getsize(filename)
                    self.send_header("Content-Length", total_size)
                    self.end_headers()
                    self.wfile.write(f.read())
            except IOError:
                self.sendError(404, "Not Found")

class ThreadedHTTPServer(ThreadingMixIn, BaseHTTPServer.HTTPServer):
    def __init__(self, server_address, RequestHandlerClass, bind_and_activate=True):
        BaseHTTPServer.HTTPServer.__init__(self, server_address, RequestHandlerClass, bind_and_activate)

def main():
    Handler.close_connection = 0
    Handler.protocol_version = 'HTTP/1.1'
    global httpd
    httpd = ThreadedHTTPServer(("", 8900), Handler)
    httpd.daemon_threads = True
    httpd.serve_forever()

if __name__ == "__main__":
    main()
Error Trace:
Exception happened during processing of request from ('172.24.128.21', 19418)
Traceback (most recent call last):
File "C:\Python27\lib\SocketServer.py", line 593, in process_request_thread
self.finish_request(request, client_address)
File "C:\Python27\lib\SocketServer.py", line 334, in finish_request
self.RequestHandlerClass(request, client_address, self)
File "C:\Python27\lib\SocketServer.py", line 649, in __init__
self.handle()
File "simple.py", line 10, in handle
BaseHTTPServer.BaseHTTPRequestHandler.handle(self)
File "C:\Python27\lib\BaseHTTPServer.py", line 342, in handle
self.handle_one_request()
File "C:\Python27\lib\BaseHTTPServer.py", line 310, in handle_one_request
self.raw_requestline = self.rfile.readline(65537)
File "C:\Python27\lib\socket.py", line 476, in readline
data = self._sock.recv(self._rbufsize)
error: [Errno 10054] An existing connection was forcibly closed by the remote host
OK, so I tried your code after cleaning it up a bit:
def do_GET(self):
    _, _, path, _, _, _ = urlparse.urlparse(self.path, 'http')
    if path.find(".ld") > 0:
        filename = path.rpartition("/")[2]
        try:
            with open(filename, 'rb') as f:
                self.send_response(200, "Ok")
                self.send_header("Content-type", self.headers.getheader("Content-type", ""))
                total_size = os.path.getsize(filename)
                self.send_header("Content-Length", total_size)
                self.end_headers()
                self.wfile.write(f.read())
        except IOError:
            self.sendError(404, "Not Found")
I put a foo.pl file in the same folder; then, when doing curl http://localhost/foo.pl, I get a nice response with the content of the file, and no errors whatsoever.
I must also say that it looks like you're just trying to get the file name without the path part using path.rpartition("/")[2], but for that, you should just use os.path.basename:
>>> os.path.basename('foo/bar/baz.pl')
'baz.pl'
Also, path.find(".ld") > 0 should probably be path.endswith(".ld") instead.
EDIT: to support large files (efficiently):
CHUNK_SIZE = 1024 * 100  # 100kB chunks
while True:
    chunk = f.read(CHUNK_SIZE)
    if not chunk:
        break
    self.wfile.write(chunk)
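In context, that loop replaces the single self.wfile.write(f.read()) inside the with block, so the whole file never has to fit in memory at once; a sketch of the body of do_GET with that change:

with open(filename, 'rb') as f:
    self.send_response(200, "Ok")
    self.send_header("Content-type", "application/octet-stream")
    self.send_header("Content-Length", os.path.getsize(filename))
    self.end_headers()
    while True:
        chunk = f.read(CHUNK_SIZE)  # stream the file 100kB at a time
        if not chunk:
            break  # end of file
        self.wfile.write(chunk)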

How do I restart the ioloop in tornado when fetching twitter stream api?

I'm using TweetStream (https://github.com/joshmarshall/TweetStream), a Tornado-based Twitter streaming module, to monitor the Streaming API.
I would like to know how I can restart the fetch process if I want to change the tracked words.
My current solution (not exactly a solution) is giving me some errors.
stream = tweetstream.TweetStream(configuration, ioloop=main_io_loop)
stream.fetch("/1.1/statuses/filter.json?track=" + tornado.escape.url_escape(words), callback=callback)

def check_words():
    global words
    with open('words.txt') as file:
        newwords = file.read()
        if words != newwords:
            words = newwords
            try:
                print newwords
                stream.fetch("/1.1/statuses/filter.json?track=" + tornado.escape.url_escape(words), callback=callback)
            except:
                pass
    file.close()

interval_ms = 1000 * 10
scheduler = tornado.ioloop.PeriodicCallback(check_words, interval_ms, io_loop=main_io_loop)
scheduler.start()
main_io_loop.start()
Here is the error I'm getting:
ERROR:root:Uncaught exception, closing connection.
Traceback (most recent call last):
File "/home/user/PycharmProjects/observrenv/local/lib/python2.7/site-packages/tornado/iostream.py", line 305, in wrapper
callback(*args)
File "/home/user/PycharmProjects/observrenv/src/tweetstream/tweetstream.py", line 155, in on_connect
self._twitter_stream.read_until("\r\n\r\n", self.on_headers)
File "/home/user/PycharmProjects/observrenv/local/lib/python2.7/site-packages/tornado/iostream.py", line 151, in read_until
self._set_read_callback(callback)
File "/home/user/PycharmProjects/observrenv/local/lib/python2.7/site-packages/tornado/iostream.py", line 369, in _set_read_callback
assert not self._read_callback, "Already reading"
AssertionError: Already reading
ERROR:root:Exception in callback <tornado.stack_context._StackContextWrapper object at 0x2415cb0>
Traceback (most recent call last):
File "/home/user/PycharmProjects/observrenv/local/lib/python2.7/site-packages/tornado/ioloop.py", line 421, in _run_callback
callback()
File "/home/user/PycharmProjects/observrenv/local/lib/python2.7/site-packages/tornado/iostream.py", line 305, in wrapper
callback(*args)
File "/home/user/PycharmProjects/observrenv/src/tweetstream/tweetstream.py", line 155, in on_connect
self._twitter_stream.read_until("\r\n\r\n", self.on_headers)
File "/home/user/PycharmProjects/observrenv/local/lib/python2.7/site-packages/tornado/iostream.py", line 151, in read_until
self._set_read_callback(callback)
File "/home/user/PycharmProjects/observrenv/local/lib/python2.7/site-packages/tornado/iostream.py", line 369, in _set_read_callback
assert not self._read_callback, "Already reading"
AssertionError: Already reading
I achieved better results (not the best) by starting the ioloop again when calling check_words.
stream = tweetstream.TweetStream(configuration, ioloop=main_io_loop)
stream.fetch("/1.1/statuses/filter.json?track=" + tornado.escape.url_escape(words), callback=callback)

def check_words():
    global words, stream
    with open('words.txt') as file:
        newwords = file.read()
        if words != newwords:
            words = newwords
            print newwords
            try:
                stream = tweetstream.TweetStream(configuration, ioloop=main_io_loop)
                stream.fetch("/1.1/statuses/filter.json?track=" + tornado.escape.url_escape(words), callback=callback)
                interval_ms = 1000 * 10
                scheduler = tornado.ioloop.PeriodicCallback(check_words, interval_ms, io_loop=main_io_loop)
                scheduler.start()
                main_io_loop.start()
            except:
                pass
    file.close()

interval_ms = 1000 * 10
scheduler = tornado.ioloop.PeriodicCallback(check_words, interval_ms, io_loop=main_io_loop)
scheduler.start()
main_io_loop.start()
As a Twitter employee said here, the recommended approach is what I am already doing, but in a more moderate way: just reconnect once in a while when your query terms change; otherwise keep the connection open. It's also important to monitor the errors Twitter may send you, or you might get banned.
Looks like you are missing the main idea of the Streaming API: the connection to it is opened permanently.
stream = tweetstream.TweetStream(configuration, ioloop=main_io_loop)
# What are you doing in callback?
stream.fetch("/1.1/statuses/filter.json?track=" + tornado.escape.url_escape(words), callback=callback)

def check_words():
    # I guess, don't do it at all.
    #global words
    #with open('words.txt') as file:
    #    newwords = file.read()
    #    if words != newwords:
    #        words = newwords
    #        try:
    #            # Don't open a new stream here
    #            print newwords
    #        except:
    #            pass
    #    file.close()
    pass

interval_ms = 1000 * 10
scheduler = tornado.ioloop.PeriodicCallback(check_words, interval_ms, io_loop=main_io_loop)
scheduler.start()
main_io_loop.start()
Looking at your code, I think you should just handle the new-words routine inside callback.
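For illustration, a hedged sketch of that idea: reload the word list inside callback and only reconnect when the terms actually changed. Whether TweetStream exposes a close() method is an assumption here, hence the hasattr guard; check the library before relying on it. In practice you would also throttle the file check rather than doing it on every tweet.

def callback(tweet):
    global words, stream
    # ... process the tweet as before ...
    with open('words.txt') as f:
        newwords = f.read()
    if newwords != words:  # terms changed: reconnect once, not on a timer
        words = newwords
        if hasattr(stream, 'close'):  # hypothetical teardown hook
            stream.close()
        stream = tweetstream.TweetStream(configuration, ioloop=main_io_loop)
        stream.fetch("/1.1/statuses/filter.json?track=" + tornado.escape.url_escape(words),
                     callback=callback)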
