I made a Python script which uses the tweepy streaming module to stream mentions of a Twitter account and carry out some functions based on the status text.
I wanted it to stream until a mention is made, then stop streaming, carry out some functions based on the status text, and start streaming again.
This is my code:
class StdOutListener(tweepy.StreamListener):
    """Stream listener that captures a single tweet and then stops the stream."""

    def on_data(self, data):
        """Decode the raw JSON payload into the module-level ``d`` and stop.

        Returns:
            False, so that the stream stops after this one tweet.
        """
        global d
        d = json.loads(data.strip())
        return False  # stops streaming after a tweet is fed to it

    def on_error(self, status_code):
        """Print the HTTP status code, wait two minutes, then keep listening."""
        print(status_code)
        time.sleep(120)
        return True  # To continue listening

    def on_timeout(self):
        """Wait two minutes after a timeout, then keep listening."""
        time.sleep(120)
        return True  # To continue listening
# Capture one tweet per connection, process it, then connect again.
# NOTE(review): every pass through this loop opens a fresh streaming connection
# with no pause in between; too many connection attempts in a window of time
# are answered with HTTP 420.
while True:
    d = {}  # reset the slot that StdOutListener.on_data fills in
    listener = StdOutListener()
    stream = tweepy.Stream(twitter_auth(tokens), listener)
    stream.filter(track=['#xxx'])  # blocks until the listener returns False
    stream.disconnect()
    doSomething(d)
But it only works for one loop and later shows 420 (Exceeding Rate Limit) errors, even though I just take in a single tweet (per stream, if I'm not wrong).
Can anyone please explain where I'm doing it wrong? And also when should we use async mode in tweepy stream listener?
Related
I'm trying to generate keys for every message in Kafka, for that purpose I want to create a key generator that joins the topic first two characters and the tweet id.
Here is an example of the messages that get sent in kafka:
{"data":{"created_at":"2022-03-18T09:51:12.000Z","id":"1504757303811231755","text":"#Danielog111 #POTUS #NATO #UNPeacekeeping #UN Yes! Not to minimize Ukraine at all, but to bring attention to a horrific crisis and Tigrayan genocide that targets 7M people, longer time frame, and is largely unacknowledged by western news agencies. And people are being eaten-literally! #maddow #JoyAnnReid help Ethiopians!"},"matching_rules":[{"id":"1502932028618072070","tag":"NATO"},{"id":"1502932021731115013","tag":"Biden"}]}'
And here is my code modified to try generating partition keys (I'm using PyKafka):
from dotenv import load_dotenv
import os
import json
import tweepy
from pykafka import KafkaClient
# Getting credentials:
# load_dotenv is imported above but was never called; without the call a local
# .env file is not read and the lookup below only sees the process environment.
load_dotenv()
BEARER_TOKEN = os.getenv("BEARER_TOKEN")
# Setting up pykafka:
def get_kafka_client():
    """Return a ``KafkaClient`` configured with the three localhost brokers."""
    return KafkaClient(hosts='localhost:9092,localhost:9093,localhost:9094')
def send_message(data, name_topic, id):
    """Produce ``data`` to the Kafka topic called ``name_topic``.

    The partition key is built from the first two characters of the topic
    name followed by ``id``.
    """
    client = get_kafka_client()
    topic = client.topics[name_topic]
    producer = topic.get_sync_producer()
    # NOTE(review): the f-string below is a ``str``; pykafka's Producer.produce
    # raises TypeError unless partition_key is a bytes object.
    producer.produce(data, partition_key=f"{name_topic[:2]}{id}")
# Creating a Twitter stream listener:
class Listener(tweepy.StreamingClient):
    """Streaming client that forwards each payload to Kafka, once per matching rule."""

    def on_data(self, data):
        """Print the raw payload and send it to the topic named by each rule tag."""
        print(data)
        message = json.loads(data)
        for rule in message['matching_rules']:
            # NOTE(review): the id is encoded to bytes here, but send_message
            # interpolates it into an f-string, so the final key is still a str.
            send_message(data, rule['tag'], message['data']['id'].encode())
        return True

    def on_error(self, status):
        """Print the reported status."""
        print(status)
# Start streaming: build the listener from the bearer token and open the
# filtered stream, requesting the created_at field for each tweet.
Listener(BEARER_TOKEN).filter(tweet_fields=['created_at'])
And this is the error I'm getting:
File "/Users/mac/.local/share/virtualenvs/tweepy_step-Ck3DvAWI/lib/python3.9/site-packages/pykafka/producer.py", line 372, in produce
raise TypeError("Producer.produce accepts a bytes object as partition_key, "
TypeError: ("Producer.produce accepts a bytes object as partition_key, but it got '%s'", <class 'str'>)
I've also tried not encoding it and trying to fetch the id just using the data (that comes in bytes) but none of these options work.
I found the error, I should've been encoding the partition key and not the json id:
def send_message(data, name_topic, id):
    """Produce ``data`` to the Kafka topic called ``name_topic``.

    The partition key is the first two characters of the topic name followed
    by ``id``, encoded to bytes as pykafka requires.
    """
    client = get_kafka_client()
    topic = client.topics[name_topic]
    partition_key = f"{name_topic[:2]}{id}".encode()
    # Use the producer as a context manager so it is stopped once the message
    # has been sent, instead of leaving a new producer open on every call.
    with topic.get_sync_producer() as producer:
        producer.produce(data, partition_key=partition_key)
# Creating a Twitter stream listener:
class Listener(tweepy.StreamingClient):
    """Streaming client that forwards each payload to Kafka, once per matching rule."""

    def on_data(self, data):
        """Print the raw payload and send it to the topic named by each rule tag.

        The tweet id is passed on as a plain string; send_message builds and
        encodes the partition key.
        """
        print(data)
        message = json.loads(data)
        tweet_id = message['data']['id']
        for rule in message['matching_rules']:
            send_message(data, rule['tag'], tweet_id)
        return True

    def on_error(self, status):
        """Print the reported status."""
        # NOTE(review): confirm that tweepy.StreamingClient actually invokes a
        # hook named on_error; its HTTP-error hook appears to be on_request_error.
        print(status)
I'm trying to run AsyncStream Tweepy, but I ran into a problem
My code
from __future__ import absolute_import, print_function
from tweepy.streaming import Stream
from tweepy import OAuthHandler
from tweepy import Stream
from pprint import pprint
from tweepy.asynchronous import AsyncStream
import asyncio
async def main():
    """Create the stream listener and follow a single user id."""
    stream = StdOutListener(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    await stream.filter(follow=['1082189695252074496'])
    await asyncio.sleep(1.5)
class StdOutListener(AsyncStream):
    """Asynchronous stream listener that prints statuses and error codes."""

    async def on_status(self, status):
        """Print the received status."""
        # NOTE(review): `status_json` is not defined anywhere in this snippet, so
        # this line raises NameError; `status._json` is presumably what was meant.
        print(status_json)

    async def on_error(self, status):
        """Print the reported error status."""
        print(status)
# Run the coroutine only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())
When I run it in .py file, it doesn't work and returns the error "An HTTP: 420 error occurred in the stream".
I also ran the code in a Jupyter Notebook, only instead of asyncio.run(main()) I wrote await main(). It also returns this error, BUT the stream works and it returns a response.
Why does it work in the Jupyter Notebook but not in the .py file? How can this be fixed?
According to the Tweepy documentation section on Handling Errors:
If clients exceed a limited number of attempts to connect to the streaming API in a window of time, they will receive error 420. The amount of time a client has to wait after receiving error 420 will increase exponentially each time they make a failed attempt.
Tweepy’s Stream Listener passes error codes to an on_error stub. The default implementation returns False for all codes, but we can override it to allow Tweepy to reconnect for some or all codes...
Here's another reference to the Twitter API documentation section on HTTP Error Codes.
The first issue is that print(status_json) should be print(status._json).
The second issue is that the on_status method needs to conditionally return True or False based on status._json, like so:
async def on_status(self, status):
    """Print the status payload, or stop the stream when there is none.

    Returns:
        True when ``status`` has a ``_json`` attribute (after printing it),
        False otherwise.
    """
    if not hasattr(status, "_json"):
        # returning False disconnects the stream
        return False
    print(status._json)
    # returning non-False continues the stream
    return True
The third issue is that the on_error method needs to conditionally return True or False based on the value of status, like so:
async def on_error(self, status):
    """Decide whether the stream should continue after an error status.

    Returns:
        False for status 420, True for every other status.
    """
    if status == 420:
        # returning False disconnects the stream
        return False
    # returning non-False continues the stream
    return True
Trying to check whether connection established or not, but nothing happened
I used on_connect to understand but got nothing:
import tweepy
import time
class InOutStreamListener(tweepy.StreamListener):
    """Listener that reports the connection and prints the first friends entry."""

    def on_connect(self):
        """Announce that the stream connection was established."""
        # print() with one argument behaves the same on Python 2 and Python 3.
        print('Connected')

    def disconnect(self):
        """Mark the listener as not running; do nothing if already stopped."""
        # NOTE(review): `running` is not assigned anywhere in this class —
        # presumably it is expected to be set elsewhere; verify before relying on it.
        if self.running is False:
            return
        self.running = False

    def on_friends(self, friends):
        """Print the first element of ``friends``."""
        print(friends[0])
# Build the OAuth handler and attach the access token.
auth = tweepy.OAuthHandler('code', 'code')
auth.set_access_token('code', 'code')
l = InOutStreamListener()
# Bind the listener to a Stream object.
streamer = tweepy.Stream(auth, l)
# NOTE(review): no streaming method (e.g. filter) is called on `streamer`;
# the script only waits 15 seconds and then disconnects.
time.sleep(15)
streamer.disconnect()
You only created a Stream, you didn't start it, see the docs.
In this example we will use filter to stream all tweets containing the
word python. The track parameter is an array of search terms to
stream.
myStream.filter(track=['python'])
I'm using Flask and Tweepy to search for live tweets. On the front-end I have a user text input, and button called "Search". Ideally, when a user gives a search-term into the input and clicks the "Search" button, the Tweepy should listen for the new search-term and stop the previous search-term stream. When the "Search" button is clicked it executes this function:
@app.route('/search', methods=['POST'])
# gets search-keyword and starts stream
def streamTweets():
    """Start a non-blocking tweepy stream for the submitted search term.

    Reads the term from the ``tweet`` form field and renders ``index.html``.
    """
    search_term = request.form['tweet']
    search_term_hashtag = '#' + search_term
    # instantiate listener
    listener = StdOutListener()
    # stream object uses listener we instantiated above to listen for data
    stream = tweepy.Stream(auth, listener)

    # NOTE(review): `stream` was created just above, so this branch always runs
    # and disconnects the new, not-yet-started stream; this function keeps no
    # reference to a stream started by an earlier request.
    if stream is not None:
        print("Stream disconnected...")
        stream.disconnect()

    # `async` is a reserved word from Python 3.7 on; tweepy 3.7 renamed the
    # keyword argument to `is_async`.
    # NOTE(review): `search_term or search_term_hashtag` yields search_term
    # whenever it is non-empty, so the hashtag form is only used for an empty term.
    stream.filter(track=[search_term or search_term_hashtag], is_async=True)

    redirect('/stream')  # execute '/stream' sse
    return render_template('index.html')
The /stream route that is executed in the second to last line in above code is as follows:
@app.route('/stream')
def stream():
    """Serve messages from the ``tweet_stream`` channel as server-sent events."""
    # we will use Pub/Sub process to send real-time tweets to client
    def event_stream():
        """Yield one ``data:`` event per message received on the subscription."""
        # instantiate pubsub
        pubsub = red.pubsub()
        # subscribe to tweet_stream channel
        pubsub.subscribe('tweet_stream')
        # initiate server-sent events on messages pushed to channel
        for message in pubsub.listen():
            yield 'data: %s\n\n' % message['data']

    return Response(stream_with_context(event_stream()), mimetype="text/event-stream")
My code works fine, in the sense that it starts a new stream and searches for a given term whenever the "Search" button is clicked, but it does not stop the previous search. For example, if my first search term was "NYC" and then I wanted to search for a different term, say "Los Angeles", it will give me results for both "NYC" and "Los Angeles", which is not what I want. I want just "Los Angeles" to be searched. How do I fix this? In other words, how do I stop the previous stream? I looked through other previous threads, and I know I have to use stream.disconnect(), but I'm not sure how to implement this in my code. Any help or input would be greatly appreciated. Thanks so much!!
Below is some code that will cancel old streams when a new stream is created. It works by adding new streams to a global list, and then calling stream.disconnect() on all streams in the list whenever a new stream is created.
diff --git a/app.py b/app.py
index 1e3ed10..f416ddc 100755
--- a/app.py
+++ b/app.py
@@ -23,6 +23,8 @@ auth.set_access_token(access_token, access_token_secret)
app = Flask(__name__)
red = redis.StrictRedis()
+# Add a place to keep track of current streams
+streams = []
@app.route('/')
def index():
@@ -32,12 +34,18 @@ def index():
@app.route('/search', methods=['POST'])
# gets search-keyword and starts stream
def streamTweets():
+ # cancel old streams
+ for stream in streams:
+ stream.disconnect()
+
search_term = request.form['tweet']
search_term_hashtag = '#' + search_term
# instantiate listener
listener = StdOutListener()
# stream object uses listener we instantiated above to listen for data
stream = tweepy.Stream(auth, listener)
+ # add this stream to the global list
+ streams.append(stream)
stream.filter(track=[search_term or search_term_hashtag],
async=True) # make sure stream is non-blocking
redirect('/stream') # execute '/stream' sse
What this does not solve is the problem of session management. With your current setup a search by one user will affect the searches of all users. This can be avoided by giving your users some identifier and storing their streams along with their identifier. The easiest way to do this is likely to use Flask's session support. You could also do this with a requestId as Pierre suggested. In either case you will also need code to notice when a user has closed the page and close their stream.
Disclaimer: I know nothing about Tweepy, but this appears to be a design issue.
Are you trying to add state to a RESTful API? You may have a design problem.
As JRichardSnape answered, your API shouldn't be the one taking care of canceling a request; it should be done in the front-end. What I mean here is in the javascript / AJAX / etc calling this function, add another call, to the new function
@app.route('/cancelSearch', methods=['POST'])
With the "POST" that has the search terms. So long as you don't have state, you can't really do this safely in an async call: Imagine someone else makes the same search at the same time then canceling one will cancel both (remember, you don't have state so you don't know who you're canceling). Perhaps you do need state with your design.
If you must keep using this and don't mind breaking the "stateless" rule, then add a "state" to your request. In this case it's not so bad because you could launch a thread and name it with the userId, then kill the thread every new search
def streamTweets():
    """Sketch of a per-user search handler: read the search term and user id."""
    search_term = request.form['tweet']
    # If your limit is one request per user at a time. If multiple windows can be
    # opened and you want to follow this limit, store userId in a cookie.
    userId = request.form['userId']
    #Look for any request currently running with this ID, and cancel them
Alternatively, you could return a requestId, which you would then keep in the front-end so that it can call cancelSearch?requestId=$requestId. In cancelSearch, you would have to find the pending request (sounds like that's in tweepy since you're not using your own threads) and disconnect it.
Out of curiosity I just watched what happens when you search on Google, and it uses a GET request. Have a look (debug tools -> Network; then enter some text and see the autofill). Google uses a token sent with every request (every time you type something)). It doesn't mean it's used for this, but that's basically what I described. If you don't want a session, then use a unique identifier.
Well, I solved it by using a timer method, but I'm still looking for a more Pythonic way.
from streamer import StreamListener
def stream():
    """Start the tweet stream and run cleanup when a 300-second alarm fires.

    NOTE(review): the indentation of this snippet was lost; nesting the helpers
    and the timer logic inside this function is a reconstruction — confirm
    against the author's intent.
    """
    hashtag = input
    #assign each user an ID ( for pubsub )
    StreamListener.userid = random_user_id

    def handler(signum, frame):
        # SIGALRM handler: report and abort by raising.
        print("Forever is over")
        raise Exception("end of time")

    def main_stream():
        stream = tweepy.Stream(auth, StreamListener())
        # `async` is a reserved word from Python 3.7 on; tweepy 3.7 renamed the
        # keyword argument to `is_async`.
        stream.filter(track=track, is_async=True)
        redirect(url_for('map_stream'))

    def close_stream():
        # this is for closing client list in redis but don't know it's working
        obj = redis.client_list(tweet_stream)
        redis_client_list = obj[0]['addr']
        redis.client_kill(redis_client_list)
        # NOTE(review): this builds a brand-new Stream and disconnects it; it is
        # not the object created in main_stream.
        stream = tweepy.Stream(auth, StreamListener())
        stream.disconnect()

    import signal
    signal.signal(signal.SIGALRM, handler)
    signal.alarm(300)
    try:
        main_stream()
    except Exception:
        close_stream()
        print("function terminate")
So far I have the following code that works and inserts the tweets into my mongodb but I had a few questions.
class CustomStreamListener(tweepy.StreamListener):
    """Stream listener that stores every received message in MongoDB."""

    def __init__(self, api):
        # Name this class in super() so that StreamListener.__init__ itself runs;
        # super(tweepy.StreamListener, self) starts the lookup *after*
        # StreamListener and therefore skips its initializer.
        super(CustomStreamListener, self).__init__(api)
        self.api = api
        self.db = pymongo.MongoClient().test

    def on_data(self, tweet):
        """Insert the decoded JSON message into the ``tweets`` collection."""
        # NOTE(review): Collection.insert is presumably deprecated in newer
        # pymongo releases in favour of insert_one — confirm the installed version.
        self.db.tweets.insert(json.loads(tweet))

    def on_error(self, status_code):
        """Ignore the error status and keep the stream alive."""
        return True  # Don't kill the stream

    def on_timeout(self):
        """Keep the stream alive after a timeout."""
        return True  # Don't kill the stream
# Bind the MongoDB-backed listener to a stream.
sapi = tweepy.streaming.Stream(auth, CustomStreamListener(api))
# Start filtering on the track term 'arsenal' with languages restricted to 'en'.
sapi.filter(track=['arsenal'] , languages = ['en'])
Could someone explain how I can get only certain parts of the tweet inserted into the database ie. just the tweet text and location.
Does the Twitter streaming API allow displaying just tweets, with no @reply tweets?
json.loads(tweet) is just a dictionary, you can freely choose what parts of its key-values you process.
You can filter tweets by conditioning them either way you like:
tweet_obj = json.loads(tweet)
# Tweets that are not replies have `None` in this field. .get() also covers
# stream messages that do not carry the key at all, instead of raising KeyError.
if not tweet_obj.get('in_reply_to_user_id'):
    pass  # add some processing here