I used a python script to stream tweets and store them in a Mongodb database using tweepy. Everything was working just fine, but when I tried to set a limit for the number of tweets, tweets are no longer retrieved, I can't see the error, could you please help?
class MyListener(StreamListener):
def __init__(self, num_tweets, max_tweets=60):
self.numTweets = num_tweets
self.limit = max_tweets
def on_data(self, data):
print("tweet")
while self.numTweets < self.limit:
#self.numTweets+=1
try:
client = MongoClient()
db = client['twitter1_db']
collection = db['twitter1_collection']
tweet = json.loads(data)
collection.insert_one(tweet)
print(tweet)
self.numTweets= self.numTweets+ 1
return True
except BaseException as e:
print("Error on_data: %s" % str(e))
return True
def on_error(self, status):
print(status)`
twitter_stream = Stream(auth, MyListener(num_tweets, max_tweets))
twitter_stream.filter(track=Keywords_list)
Related
I am consuming low latency market data and I'm trying to measure how many streams I can consume without my code slowing down due to the websocket message queue building up. My understanding is that messages are received by the websocket and queued until ws.recv() is called, which processes them one at a time in the order they were received. Under normal circumstances, my code is definitely fast enough to handle the messages, but when a burst of messages comes all at once I would imagine that the queue fills up. I would expect that the queue would only be filled up for 5 or 10 milliseconds, but it is very important that I know this. Is there a way to measure how many messages are waiting in the queue?
I'm attaching a snippet of the code I'm using for context, but the relevant part is just looping over
data = self.ws.recv()
class WebsocketClient(object):
def __init__(
self,
url=""
products=None,
message_type="subscribe",
should_print=True,
self.url = url
self.products = products
self.channels = channels
self.type = message_type
self.stop = True
self.error = None
self.ws = None
self.thread = None
self.auth = auth
self.api_key = api_key
self.api_secret = api_secret
self.api_passphrase = api_passphrase
self.should_print = should_print
def start(self):
def _go():
self._connect()
self._listen()
self._disconnect()
self.stop = False
self.on_open()
self.thread = Thread(target=_go)
self.keepalive = Thread(target=self._keepalive)
self.thread.start()
def _connect(self):
if self.products is None:
self.products = []
elif not isinstance(self.products, list):
self.products = [self.products]
if self.url[-1] == "/":
self.url = self.url[:-1]
if self.channels is None:
self.channels = [{"name": "ticker", "product_ids": [product_id for product_id in self.products]}]
sub_params = {'type': 'subscribe', 'product_ids': self.products, 'channels': self.channels}
else:
sub_params = {'type': 'subscribe', 'product_ids': self.products, 'channels': self.channels}
if self.auth:
#timestamp = int(time.time())
#message = timestamp + 'GET' + '/users/self/verify'
auth_headers = get_auth_headers('/users/self/verify','GET','')
#print(auth_headers)
sub_params['signature'] = auth_headers['CB-ACCESS-SIGN']
sub_params['key'] = auth_headers['CB-ACCESS-KEY']
sub_params['passphrase'] = auth_headers['CB-ACCESS-PASSPHRASE']
sub_params['timestamp'] = auth_headers['CB-ACCESS-TIMESTAMP']
try:
self.ws = create_connection(self.url)
self.ws.send(json.dumps(sub_params))
except:
traceback.print_exc()
self.stop = True
def _keepalive(self, interval=10):
while self.ws.connected:
self.ws.ping("keepalive")
time.sleep(interval)
def _listen(self):
self.keepalive.start()
while not self.stop:
try:
data = self.ws.recv()
msg = json.loads(data)
except ValueError as e:
self.on_error(e)
except Exception as e:
self.on_error(e)
else:
self.on_message(msg)
def _disconnect(self):
try:
if self.ws:
self.ws.close()
except WebSocketConnectionClosedException as e:
pass
finally:
self.keepalive.join()
self.on_close()
def close(self):
self.stop = True # will only disconnect after next msg recv
self._disconnect() # force disconnect so threads can join
self.thread.join()
def on_open(self):
if self.should_print:
print("-- Subscribed! --\n")
def on_close(self):
if self.should_print:
print("\n-- Socket Closed --")
def on_message(self, msg):
*** my logic ***
def on_error(self, e, data=None):
self.error = e
self.stop = True
print('{} - data: {}'.format(e, data))
You can measure length of incomming messages buffer by calling
len(self.ws.messages)
There is a background asyncio task, that reads StreamReader bytes buffer and put messages to ws.messages deque.
Messages deque is limited by max_queue parameter of client.connect method:
https://websockets.readthedocs.io/en/stable/reference/client.html#websockets.client.connect
Here are the details:
https://websockets.readthedocs.io/en/stable/topics/design.html#backpressure
I need to get the tweets from a single user in a streaming format. However, it still displays all tweets that retweet this user or are a reply to the tweets.
topic = "tweets"
accounts = ['user_id1', 'user_id2']
class TwitterStreamer():
def __init__(self):
pass
def stream_tweets(self, topic, accounts):
listener = StreamListener(topic)
auth = tweepy.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_secret_token)
stream = tweepy.Stream(auth, listener)
stream.filter(follow=accounts)
class StreamListener(tweepy.StreamListener):
def __init__(self, file_prefix):
self.prefix = file_prefix
#property
def fetched_tweets_filename(self):
topic
date = datetime.datetime.now().strftime("%Y-%m-%d")
return f"{self.prefix}_{date}.txt"
def on_data(self, data):
try:
print(data)
with open(self.fetched_tweets_filename, 'a') as tf:
tf.write(data)
return True
except BaseException as e:
print("Error on_data %s" % str(e))
return True
def on_exception(self, exception):
print('exception', exception)
stream_tweets(topic, accounts)
def on_status(self, accounts, status):
if status.user.id_str != accounts:
return
print(status.text)
def stream_tweets(topic, accounts):
listener = StreamListener(topic)
auth = tweepy.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_secret_token)
stream = tweepy.Stream(auth, listener)
stream.filter(track=accounts)
if __name__ == '__main__':
twitter_streamer = TwitterStreamer()
twitter_streamer.stream_tweets(topic, accounts)
I don't know what I'm doing wrong but I feel like the on_status command does not work at all.
Thanks for your help!
Don't change the parameters for on_status. Your accounts variable is a global variable and you should use it as such. Also, status.user.id_str is a str but accounts is a List[str]. You need the not ... in ... operators as opposed to !=. In other words, try out the changes below:
def on_status(self, status):
if not status.user.id_str in accounts:
return
print(status.text)
I am trying to stop the streaming after 100 tweets. Any help will be appreciated. I just want 100 tweets
class listener(StreamListener):
def on_data(self, data):
all_data = json.loads(data)
tweet = all_data["text"]
username = all_data["user"]["screen_name"]
return(True)
def on_error(self, status):
print(status)
My application is to retweet a hashtag using Tweepy. Retweeting the hashtag works, I am having trouble getting errors 2 and 3 to work.
Errors
1. Your own id (done)
2. If tweet has already been RTd
3. If tweet to RT comes from protected source
Accessing api.retweet(doTweet) within the StdOutListener does not allow it to fall through to on_error(). How else can I do this? I am Python noob.
class StdOutListener(tweepy.StreamListener):
def on_data(self, data):
all_data = json.loads(data)
username = all_data["user"]["screen_name"]
doTweet = all_data["id"]
if username != our_own_id:
#make sure you haven't already retweeted
#make sure tweets aren't protected
print(username) # just so we know it's working
api.retweet(doTweet)
return True
def on_error(self, status_code):
print('error')
read_error = json.loads(status_code)
print('Got an error with status code: ' + str(read_error))
return True # To continue listening
def on_timeout(self):
print('Timeout...')
return True # To continue listening
try:
if __name__ == '__main__':
listener = StdOutListener()
stream = tweepy.Stream(auth, listener)
stream.filter(track=['#love'])
except KeyboardInterrupt:
sys.exit()
You could modify your call to api.retweet in your on_data method as follows:
...
if username != our_own_id:
print(username) # just so we know it's working
try:
api.retweet(doTweet)
except tweepy.TweepError as e:
# add here a more complex error handling
print(e)
Hope it helps.
I am trying to stream twitter data for a period of time of say 5 minutes, using the Stream.filter() method. I am storing the retrieved tweets in a JSON file. The problem is I am unable to stop the filter() method from within the program. I need to stop the execution manually. I tried stopping the data based on system time using the time package. I was able to stop writing tweets to the JSON file but the stream method is still going on, but It was not able to continue to the next line of code.
I am using IPython notebook to write and execute the code.
Here's the code:
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth)
from tweepy import Stream
from tweepy.streaming import StreamListener
class MyListener(StreamListener):
def __init__(self, start_time, time_limit=60):
self.time = start_time
self.limit = time_limit
def on_data(self, data):
while (time.time() - self.time) < self.limit:
try:
saveFile = open('abcd.json', 'a')
saveFile.write(data)
saveFile.write('\n')
saveFile.close()
return True
except BaseException as e:
print 'failed ondata,', str(e)
time.sleep(5)
return True
def on_status(self, status):
if (time.time() - self.time) >= self.limit:
print 'time is over'
return false
def on_error(self, status):
if (time.time() - self.time) >= self.limit:
print 'time is over'
return false
else:
print(status)
return True
start_time = time.time()
stream_data = Stream(auth, MyListener(start_time,20))
stream_data.filter(track=['name1','name2',...list ...,'name n'])#list of the strings I want to track
These links are similar but I does not answer my question directly
Tweepy: Stream data for X minutes?
Stopping Tweepy steam after a duration parameter (# lines, seconds, #Tweets, etc)
Tweepy Streaming - Stop collecting tweets at x amount
I used this link as my reference,
http://stats.seandolinar.com/collecting-twitter-data-using-a-python-stream-listener/
In order to close the stream you need to return False from on_data(), or on_status().
Because tweepy.Stream() runs a while loop itself, you don't need the while loop in on_data().
When initializing MyListener, you didn't call the parent's class __init__ method, so it wasn't initialized properly.
So for what you're trying to do, the code should be something like:
class MyStreamListener(tweepy.StreamListener):
def __init__(self, time_limit=60):
self.start_time = time.time()
self.limit = time_limit
self.saveFile = open('abcd.json', 'a')
super(MyStreamListener, self).__init__()
def on_data(self, data):
if (time.time() - self.start_time) < self.limit:
self.saveFile.write(data)
self.saveFile.write('\n')
return True
else:
self.saveFile.close()
return False
myStream = tweepy.Stream(auth=api.auth, listener=MyStreamListener(time_limit=20))
myStream.filter(track=['test'])
Access the variable myListener.running but instead of passing MyListener directly to Stream create a variable as follows:
myListener = MyListener()
timeout code here... suchas time.sleep(20)
myListener.running = False
So, I was having this issue as well. Fortunately Tweepy is open source so it's easy so dig into the problem.
Basically the important part is this here:
def _data(self, data):
if self.listener.on_data(data) is False:
self.running = False
On Stream class in streaming.py
That means, to close the connection you just have to return false on the listener's on_data() method.
For those who are trying with Twitter api V2 (StreamingClient class), here is the solution:
client.disconnect()