I'm trying to run Tweepy's AsyncStream, but I ran into a problem.
My code
from __future__ import absolute_import, print_function
from tweepy.streaming import Stream
from tweepy import OAuthHandler
from tweepy import Stream
from pprint import pprint
from tweepy.asynchronous import AsyncStream
import asyncio
async def main():
    """Create the stream listener and filter on a single followed user id."""
    listener = StdOutListener(
        CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
    )
    followed_user_ids = ['1082189695252074496']
    await listener.filter(follow=followed_user_ids)
    # Short pause after the awaited filter call has returned.
    await asyncio.sleep(1.5)
class StdOutListener(AsyncStream):
    """Stream subclass whose callbacks print what they receive to stdout."""

    async def on_status(self, status):
        # NOTE(review): `status_json` is not defined anywhere in this snippet,
        # so this line raises NameError when the callback runs;
        # `status._json` is presumably what was intended.
        print(status_json)

    async def on_error(self, status):
        # Prints the value handed to the callback (presumably an HTTP status
        # code such as 420 -- confirm against the Tweepy version in use).
        print(status)
# Start the event loop and run main() only when executed as a script.
if __name__ == '__main__':
    asyncio.run(main())
When I run it as a .py file, it doesn't work and returns the error "An HTTP: 420 error occurred in the stream".
I also ran the code in Jupyter Notebook, only instead of asyncio.run(main()) I wrote await main(). It also returns this error, BUT the stream works and it returns a response.
Why does it work in Jupyter Notebook but not in the .py file? How can this be fixed?
According to the Tweepy documentation section on Handling Errors:
If clients exceed a limited number of attempts to connect to the streaming API in a window of time, they will receive error 420. The amount of time a client has to wait after receiving error 420 will increase exponentially each time they make a failed attempt.
Tweepy’s Stream Listener passes error codes to an on_error stub. The default implementation returns False for all codes, but we can override it to allow Tweepy to reconnect for some or all codes...
Here's another reference to the Twitter API documentation section on HTTP Error Codes.
The first issue is that print(status_json) should be print(status._json).
The second issue is that the on_status method needs to conditionally return True or False based on status._json, like so:
async def on_status(self, status):
    """Print the status payload and report whether streaming should go on."""
    if not hasattr(status, "_json"):
        # returning False disconnects the stream
        return False
    print(status._json)
    # returning non-False continues the stream
    return True
The third issue is that the on_error method needs to conditionally return True or False based on the value of status, like so:
async def on_error(self, status):
    """Decide from the error status whether the stream keeps running."""
    # returning False disconnects the stream (only for 420);
    # returning non-False continues the stream
    return status != 420
I'm watching this series https://www.youtube.com/watch?v=wlnx-7cm4Gg&list=PL5tcWHG-UPH2zBfOz40HSzcGUPAVOOnu1 which is about mining tweets with tweepy (Python). The author stores the tweets with everything (such as created_at, id, id_str, text) and then uses DataFrames in pandas to keep only the text. Is this approach efficient? How can I store only the "text" in the JSON file instead of all the other details?
The code:
# Twitter API credentials used by TwitterAuthenticator below.
# The values here are masked placeholders.
# NOTE(review): presumably real keys should be loaded from outside the
# source file (see the commented-out `twitter_credentials` import) -- confirm.
ACCESS_TOKEN = "xxxxxxxxxxxxxxxxxxxxx"
ACCESS_TOKEN_SECRET = "xxxxxxxxxxxxxxxxxxxxxxxxx"
CONSUMER_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
CONSUMER_SECRET = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
import tweepy
import numpy as np
import pandas as pd
# import twitter_credentials
class TwitterAuthenticator:
    """Builds the OAuth handler from the module-level credentials."""

    def authenticate_twitter_app(self):
        """Return a tweepy OAuthHandler with the access token already set."""
        oauth_handler = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
        oauth_handler.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        return oauth_handler
class TwitterStreamer:
    """
    Streams live tweets and hands them to a TwitterListener.
    """

    def __init__(self):
        self.twitter_authenticator = TwitterAuthenticator()

    def stream_tweets(self, fetched_tweets_filename, hash_tag):
        """Open a filtered stream.

        fetched_tweets_filename is forwarded to the TwitterListener;
        hash_tag is passed to the stream as its `track` filter.
        """
        # Handles Twitter authentication and the connection to the
        # Twitter Streaming API.
        tweet_listener = TwitterListener(fetched_tweets_filename)
        credentials = self.twitter_authenticator.authenticate_twitter_app()
        live_stream = tweepy.Stream(credentials, tweet_listener)
        live_stream.filter(track=hash_tag)
class TwitterListener(tweepy.StreamListener):
    """
    Listener that prints each received message and appends it to a file.
    """

    def __init__(self, fetched_tweets_filename):
        # Run the base-class initializer so the listener is fully set up
        # before our own state is added.
        super(TwitterListener, self).__init__()
        self.fetched_tweets_filename = fetched_tweets_filename

    def on_data(self, data):
        """Print `data` and append it unchanged to the output file.

        Always returns True, so a failed write is reported but does not
        stop the stream.
        """
        try:
            print(data)
            with open(self.fetched_tweets_filename, 'a') as tf:
                tf.write(data)
            return True
        except Exception as e:
            # Exception, not BaseException: KeyboardInterrupt and SystemExit
            # must still be able to stop the program.
            print("Error on_data %s" % str(e))
        return True

    def on_status(self, status):
        print(status)

    def on_error(self, status):
        """Return False on status 420; otherwise print the status."""
        if status == 420:
            # Returning False here stops the stream when the rate limit is hit.
            return False
        print(status)
# public_tweets = api.home_timeline()
# for tweet in public_tweets:
# print tweet.text
if __name__ == '__main__':
    # Values handed to stream_tweets: the track terms and the output file name.
    hash_tag = ["python"]
    fetched_tweets_filename = "tweets.json"
    twitter_streamer = TwitterStreamer()
    twitter_streamer.stream_tweets(fetched_tweets_filename,hash_tag)
# print stream.text
The tweet stored in the json file:
{"created_at":"Sun Nov 04 18:43:59 +0000 2018","id":1059154305498972160,"id_str":"1059154305498972160","text":"RT #hmason: When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yourself to learn h\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":14858491,"id_str":"14858491","name":"Alexandra Lemus","screen_name":"nankyoku","location":"M\u00e9xico","url":null,"description":"Transitioning into the Permanent Beta state...","translator_type":"none","protected":false,"verified":false,"followers_count":173,"friends_count":585,"listed_count":18,"favourites_count":658,"statuses_count":572,"created_at":"Wed May 21 16:35:49 +0000 2008","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"EDECE9","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_tile":false,"profile_link_color":"088253","profile_sidebar_border_color":"D3D2CF","profile_sidebar_fill_color":"E3E2DE","profile_text_color":"634047","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000575875952\/f00390453684dd243d7ca95c69a05f74_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000575875952\/f00390453684dd243d7ca95c69a05f74_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/14858491\/1381524599","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":
null,"contributors":null,"retweeted_status":{"created_at":"Sat Nov 03 17:36:24 +0000 2018","id":1058774912201035776,"id_str":"1058774912201035776","text":"When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yoursel\u2026 https:\/\/t.co\/9F7SmlGfyf","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":765548,"id_str":"765548","name":"Hilary Mason","screen_name":"hmason","location":"NYC","url":"http:\/\/www.hilarymason.com","description":"GM for Machine Learning at #Cloudera. Founder at #FastForwardLabs. Data Scientist in Residence at #accel. I \u2665 data and cheeseburgers.","translator_type":"none","protected":false,"verified":true,"followers_count":111311,"friends_count":1539,"listed_count":5276,"favourites_count":12049,"statuses_count":17602,"created_at":"Sun Feb 11 21:22:24 +0000 
2007","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"282F8A","profile_sidebar_border_color":"87BC44","profile_sidebar_fill_color":"AB892B","profile_text_color":"000000","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/948689418709323777\/sTBM3vG0_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/948689418709323777\/sTBM3vG0_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/765548\/1353033581","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yourself to learn how it works, and then use a library to benefit from robust code.\n\nHere's one article showing this with neural networks in Python: 
https:\/\/t.co\/3ehO86NFKI","display_text_range":[0,280],"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/3ehO86NFKI","expanded_url":"https:\/\/towardsdatascience.com\/how-to-build-your-own-neural-network-from-scratch-in-python-68998a08e4f6","display_url":"towardsdatascience.com\/how-to-build-y\u2026","indices":[257,280]}],"user_mentions":[],"symbols":[]}},"quote_count":14,"reply_count":8,"retweet_count":290,"favorite_count":1019,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/9F7SmlGfyf","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/1058774912201035776","display_url":"twitter.com\/i\/web\/status\/1\u2026","indices":[117,140]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"hmason","name":"Hilary Mason","id":765548,"id_str":"765548","indices":[3,10]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1541357039223"}
If the question is not clear, please leave a comment and I will try to edit the question.
If you want only the "text" field to be saved in the json file, you can tweak the definition of the TwitterListener.on_data method:
import json
def on_data(self, data):
    """Append only the tweet's "text" field to the output file.

    `data` is the raw JSON text of one stream message. Each stored record
    is written as a JSON object on its own line, so the file can be read
    back one line at a time. Messages that carry no "text" field are
    skipped. Always returns True; failures are printed, not raised.
    """
    try:
        print(data)
        message = json.loads(data)
        text = message.get('text') if isinstance(message, dict) else None
        if text is None:
            # Nothing to store for this message.
            return True
        # Open the file only once there is something to write.
        with open(self.fetched_tweets_filename, 'a') as tf:
            # The trailing newline keeps one record per line; without it
            # consecutive objects run together and cannot be parsed back.
            tf.write(json.dumps({'text': text}) + "\n")
        return True
    except Exception as e:
        # Exception, not BaseException: KeyboardInterrupt and SystemExit
        # must still be able to stop the program.
        print("Error on_data %s" % str(e))
    return True
Fair warning, I don't have tweepy installed/set up, so I was only able to test a version of the above code using the json file you posted above. Let me know if you run into any bugs and I'll see what I can do.
It looks like what you're getting from the API and storing in your variable "data" is unicode text in a json format. You are just writing that text directly to a file. Using the API call you do, you're always going to get all of the data so it isn't that inefficient. If you just wanted to get/write the text of the tweet, try using a json load and then processing from there.
So far I have the following code that works and inserts the tweets into my mongodb but I had a few questions.
class CustomStreamListener(tweepy.StreamListener):
    """Stream listener that inserts every received message into MongoDB."""

    def __init__(self, api):
        # super() must name THIS class: naming tweepy.StreamListener starts
        # the lookup after it and skips StreamListener.__init__ entirely.
        super(CustomStreamListener, self).__init__(api)
        self.api = api
        # Uses the `test` database of a default MongoClient connection.
        self.db = pymongo.MongoClient().test

    def on_data(self, tweet):
        # `tweet` is raw JSON text; the parsed document is stored whole.
        self.db.tweets.insert(json.loads(tweet))

    def on_error(self, status_code):
        return True  # Don't kill the stream

    def on_timeout(self):
        return True  # Don't kill the stream
# Build the stream around the custom listener, then start filtering with
# the given track terms and language list.
sapi = tweepy.streaming.Stream(auth, CustomStreamListener(api))
sapi.filter(track=['arsenal'] , languages = ['en'])
Could someone explain how I can get only certain parts of the tweet inserted into the database, i.e. just the tweet text and location?
Does the Twitter Streaming API allow displaying just tweets, with no @reply tweets?
json.loads(tweet) is just a dictionary, you can freely choose what parts of its key-values you process.
You can filter tweets by applying whatever condition you like to them:
# Parse the raw JSON text into a dictionary before inspecting its fields.
tweet_obj = json.loads(tweet)
if not tweet_obj['in_reply_to_user_id']:  # non-replies have `None` in this field
    pass  # add some processing here