Attempt to extract Twitter data from London location - python

I'm very new to this, but I need Twitter data for the London area from the Streaming API.
This is the code I'm running:
from tweepy import Stream
from tweepy.streaming import StreamListener
class MyListener(StreamListener):
    """Stream listener that appends every raw message to ``london.json``."""

    def on_data(self, data):
        """Append the raw payload ``data`` to ``london.json``.

        Returns True whether or not the write succeeded; a failed write is
        reported on stdout instead of being raised.
        """
        try:
            with open('london.json', 'a') as f:
                f.write(data)
        except Exception as e:
            # Exception (not BaseException) so that Ctrl+C / SystemExit are
            # not swallowed here and can still stop the script.
            print("Error on_data: %s" % str(e))
        return True

    def on_error(self, status):
        """Print the error ``status`` passed in by the stream; return True."""
        print(status)
        return True
# NOTE(review): `auth` is not defined anywhere in this snippet; it has to be
# created before this line runs.
twitter_stream = Stream(auth, MyListener())
# NOTE(review): `locations` is passed as ONE comma-joined string, i.e. a list
# of length 1, which is consistent with the "multiple of 4" error quoted
# below. Presumably four separate numbers per bounding box are expected
# (west longitude, south latitude, east longitude, north latitude) -- confirm
# against the tweepy / Twitter streaming documentation.
twitter_stream.filter(locations=['0.28,51.23,0.25,51.68'])
This is the error I keep getting
TweepError: Wrong number of locations points, it has to be a multiple of 4
I may be getting the bounding box wrong, but I am unsure how to find the correct one for London.
Thanks

Related

except BaseException as e: keeps throwing an error

I'm following a tutorial about analyzing twitter data. I'm wondering why I keep getting a syntax error on line 44: except BaseException as e:
from tweepy import API
from tweepy import Cursor
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import twitter_credentials
# TWITTER AUTHENTICATOR
class TwitterAuthenticator():
    """Builds the OAuth handler used for the Twitter connection."""

    def authenticate_twitter_app(self):
        """Return an ``OAuthHandler`` configured from ``twitter_credentials``.

        The consumer key/secret and the access token/secret are all read
        from attributes of the ``twitter_credentials`` module.
        """
        auth = OAuthHandler(
            twitter_credentials.CONSUMER_KEY,
            twitter_credentials.CONSUMER_SECRET,
        )
        auth.set_access_token(
            twitter_credentials.ACCESS_TOKEN,
            twitter_credentials.ACCESS_TOKEN_SECRET,
        )
        return auth
# TWITTER STREAMER
class TwitterStreamer():
    """Class for streaming and processing live tweets."""

    def __init__(self):
        self.twitter_authenticator = TwitterAuthenticator()

    def stream_tweets(self, fetched_tweets_filename, hash_tag_list):
        """Authenticate, open a stream and filter it by ``hash_tag_list``.

        Args:
            fetched_tweets_filename: Path handed to ``TwitterListener``, which
                appends received data to it.
            hash_tag_list: Keywords passed to ``Stream.filter`` as ``track``.
        """
        # TwitterListener.__init__ requires the output filename; creating it
        # with no arguments would raise TypeError.
        listener = TwitterListener(fetched_tweets_filename)
        auth = self.twitter_authenticator.authenticate_twitter_app()
        stream = Stream(auth, listener)
        stream.filter(track=hash_tag_list)
class TwitterListener(StreamListener):
    """Basic listener that prints received tweets and appends them to a file."""

    def __init__(self, fetched_tweets_filename):
        # Let the tweepy base class run its own setup before adding our state.
        super(TwitterListener, self).__init__()
        self.fetched_tweets_filename = fetched_tweets_filename

    def on_data(self, data):
        """Print ``data`` and append it to ``self.fetched_tweets_filename``.

        Returns True on success and also after a reported failure.
        """
        try:
            print(data)
            with open(self.fetched_tweets_filename, 'a') as tf:
                tf.write(data)
            return True
        except BaseException as e:
            # `except` must sit at the same depth as its `try`; anything
            # deeper is a syntax error.
            # NOTE(review): BaseException also catches KeyboardInterrupt;
            # `Exception` would be the narrower choice.
            print('Error on_data %s' % str(e))
        return True

    def on_error(self, status):
        """Print the error status; return False on 420 (rate limit)."""
        if status == 420:
            return False
        print(status)
if __name__ == '__main__':
    # The original line had no `=`, so it indexed an undefined name
    # (NameError) instead of creating the keyword list.
    hash_tag_list = ['kevin durant', 'steph curry', 'clippers']
    fetched_tweets_filename = 'tweets.json'

    twitter_streamer = TwitterStreamer()
    twitter_streamer.stream_tweets(fetched_tweets_filename, hash_tag_list)
Your except is indented too much. It should be at the same level as try (in on_data()), and the code inside except should be indented the same way as the code inside try.
By the way, that function is written incorrectly: there are potential cases where it returns nothing. You should at least add return False at the end of the function body.
The except should be indented the same as the try, so try the following:
def on_data(self, data):
    """Print ``data`` and append it to ``self.fetched_tweets_filename``.

    Returns True on success and also after a reported failure.
    """
    try:
        print(data)
        with open(self.fetched_tweets_filename, 'a') as tf:
            tf.write(data)
        return True
    except BaseException as e:
        # `except` is aligned with `try`; its body is indented one level,
        # exactly like the body of `try`.
        print('Error on_data %s' % str(e))
    return True

How to store only the text of tweet using Tweepy

I'm watching this series https://www.youtube.com/watch?v=wlnx-7cm4Gg&list=PL5tcWHG-UPH2zBfOz40HSzcGUPAVOOnu1 which is about mining tweets with tweepy (Python). The author stores each tweet with everything (such as created_at, id, id_str, text) and then uses pandas DataFrames to keep only the text. Is this approach efficient? How can I store only the "text" in the JSON file instead of all the other details?
The code:
# Twitter API credentials; the values shown here are placeholders.
# NOTE(review): presumably these normally live in a separate
# `twitter_credentials` module (see the commented-out import below) -- confirm.
ACCESS_TOKEN = "xxxxxxxxxxxxxxxxxxxxx"
ACCESS_TOKEN_SECRET = "xxxxxxxxxxxxxxxxxxxxxxxxx"
CONSUMER_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
CONSUMER_SECRET = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
import tweepy
import numpy as np
import pandas as pd
# import twitter_credentials
class TwitterAuthenticator():
    """
    Builds the OAuth handler from the module-level credential constants.
    """

    def authenticate_twitter_app(self):
        """Return a ``tweepy.OAuthHandler`` with the access token set.

        Reads CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN and
        ACCESS_TOKEN_SECRET from module scope.
        """
        auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        return auth
class TwitterStreamer():
    """
    Class for streaming and processing live tweets.
    """

    def __init__(self):
        self.twitter_authenticator = TwitterAuthenticator()

    def stream_tweets(self, fetched_tweets_filename, hash_tag):
        """Authenticate, open a stream and filter it by ``hash_tag``.

        Args:
            fetched_tweets_filename: Path handed to ``TwitterListener``, which
                appends received data to it.
            hash_tag: Keywords passed to ``Stream.filter`` as ``track``.
        """
        # This handles Twitter authentication and the connection to the
        # Twitter Streaming API.
        listener = TwitterListener(fetched_tweets_filename)
        auth = self.twitter_authenticator.authenticate_twitter_app()
        # api = tweepy.API(auth)
        stream = tweepy.Stream(auth, listener)
        stream.filter(track=hash_tag)
class TwitterListener(tweepy.StreamListener):
    """
    Basic listener that prints each received message to stdout and appends
    it to a file.
    """

    def __init__(self, fetched_tweets_filename):
        # Let the tweepy base class run its own setup before adding our state.
        super(TwitterListener, self).__init__()
        self.fetched_tweets_filename = fetched_tweets_filename

    def on_data(self, data):
        """Print ``data`` and append it to ``self.fetched_tweets_filename``.

        Returns True whether or not the write succeeded; a failure is
        reported on stdout.
        """
        try:
            print(data)
            with open(self.fetched_tweets_filename, 'a') as tf:
                tf.write(data)
        except Exception as e:
            # Exception (not BaseException) so that Ctrl+C / SystemExit are
            # not swallowed here.
            print("Error on_data %s" % str(e))
        return True

    def on_status(self, status):
        """Print the status object.

        NOTE(review): on_data is overridden above, so this hook is presumably
        never reached -- confirm against tweepy's dispatch logic.
        """
        print(status)

    def on_error(self, status):
        """Return False on status 420; otherwise print the status."""
        if status == 420:
            # Returning False from on_error in case the rate limit is hit.
            return False
        print(status)
# public_tweets = api.home_timeline()
# for tweet in public_tweets:
# print tweet.text
if __name__ == '__main__':
    # Keywords to track and the file that collects the received tweets.
    hash_tag = ["python"]
    fetched_tweets_filename = "tweets.json"

    twitter_streamer = TwitterStreamer()
    twitter_streamer.stream_tweets(fetched_tweets_filename, hash_tag)
# print stream.text
The tweet stored in the json file:
{"created_at":"Sun Nov 04 18:43:59 +0000 2018","id":1059154305498972160,"id_str":"1059154305498972160","text":"RT #hmason: When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yourself to learn h\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":14858491,"id_str":"14858491","name":"Alexandra Lemus","screen_name":"nankyoku","location":"M\u00e9xico","url":null,"description":"Transitioning into the Permanent Beta state...","translator_type":"none","protected":false,"verified":false,"followers_count":173,"friends_count":585,"listed_count":18,"favourites_count":658,"statuses_count":572,"created_at":"Wed May 21 16:35:49 +0000 2008","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"EDECE9","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_tile":false,"profile_link_color":"088253","profile_sidebar_border_color":"D3D2CF","profile_sidebar_fill_color":"E3E2DE","profile_text_color":"634047","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000575875952\/f00390453684dd243d7ca95c69a05f74_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000575875952\/f00390453684dd243d7ca95c69a05f74_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/14858491\/1381524599","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":
null,"contributors":null,"retweeted_status":{"created_at":"Sat Nov 03 17:36:24 +0000 2018","id":1058774912201035776,"id_str":"1058774912201035776","text":"When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yoursel\u2026 https:\/\/t.co\/9F7SmlGfyf","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":765548,"id_str":"765548","name":"Hilary Mason","screen_name":"hmason","location":"NYC","url":"http:\/\/www.hilarymason.com","description":"GM for Machine Learning at #Cloudera. Founder at #FastForwardLabs. Data Scientist in Residence at #accel. I \u2665 data and cheeseburgers.","translator_type":"none","protected":false,"verified":true,"followers_count":111311,"friends_count":1539,"listed_count":5276,"favourites_count":12049,"statuses_count":17602,"created_at":"Sun Feb 11 21:22:24 +0000 
2007","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"282F8A","profile_sidebar_border_color":"87BC44","profile_sidebar_fill_color":"AB892B","profile_text_color":"000000","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/948689418709323777\/sTBM3vG0_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/948689418709323777\/sTBM3vG0_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/765548\/1353033581","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yourself to learn how it works, and then use a library to benefit from robust code.\n\nHere's one article showing this with neural networks in Python: 
https:\/\/t.co\/3ehO86NFKI","display_text_range":[0,280],"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/3ehO86NFKI","expanded_url":"https:\/\/towardsdatascience.com\/how-to-build-your-own-neural-network-from-scratch-in-python-68998a08e4f6","display_url":"towardsdatascience.com\/how-to-build-y\u2026","indices":[257,280]}],"user_mentions":[],"symbols":[]}},"quote_count":14,"reply_count":8,"retweet_count":290,"favorite_count":1019,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/9F7SmlGfyf","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/1058774912201035776","display_url":"twitter.com\/i\/web\/status\/1\u2026","indices":[117,140]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"hmason","name":"Hilary Mason","id":765548,"id_str":"765548","indices":[3,10]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1541357039223"}
If the question is not clear, please leave a comment and I will try to edit the question.
If you want only the "text" field to be saved in the json file, you can tweak the definition of the TwitterListener.on_data method:
import json
def on_data(self, data):
    """Append only the ``text`` field of a received tweet to the output file.

    ``data`` is the raw JSON string received from the stream. Each record is
    written as one JSON object per line (``{"text": ...}``) so the file can
    be read back line by line. A payload that is not valid JSON or has no
    ``text`` key is reported on stdout and nothing is written for it.

    Returns True in every case.
    """
    try:
        print(data)
        # Parse before opening the file so a bad payload does not touch it.
        text = {'text': json.loads(data)['text']}
        with open(self.fetched_tweets_filename, 'a') as tf:
            # Newline-delimited: back-to-back JSON objects with no separator
            # cannot be parsed again.
            tf.write(json.dumps(text) + '\n')
    except Exception as e:
        print("Error on_data %s" % str(e))
    return True
Fair warning, I don't have tweepy installed/set up, so I was only able to test a version of the above code using the json file you posted above. Let me know if you run into any bugs and I'll see what I can do.
It looks like what you're getting from the API and storing in your variable "data" is unicode text in a json format. You are just writing that text directly to a file. Using the API call you do, you're always going to get all of the data so it isn't that inefficient. If you just wanted to get/write the text of the tweet, try using a json load and then processing from there.

Twitter Streaming - Find Top 10 trending topics | PySpark

I am doing a project to find the top 10 trending topics or hashtags on Twitter. I am creating a class with the code below:
class TweetsListener(StreamListener):
    """Listener that prints the screen name of each tweet's author."""

    def __init__(self, csocket):
        # Let the tweepy base class run its own setup before adding our state.
        super(TweetsListener, self).__init__()
        # Kept for later use; nothing in this class writes to the socket yet.
        self.client_socket = csocket

    def on_data(self, data):
        """Parse ``data`` as JSON and print ``user.screen_name`` as UTF-8.

        Returns True whether or not parsing succeeded; a failure is reported
        on stdout.
        """
        try:
            msg = json.loads(data)
            print(msg['user']['screen_name'].encode('utf-8'))
        except Exception as e:
            # Exception (not BaseException) so that Ctrl+C / SystemExit are
            # not swallowed here.
            print("Error on_data: %s" % str(e))
        return True

    def on_error(self, status):
        """Print the error ``status`` and return True."""
        print(status)
        return True
Below is the code for sending data:
def sendData(c_socket):
    """Open a Twitter stream filtered on 'india', handled by TweetsListener.

    Args:
        c_socket: Socket object handed to ``TweetsListener``.

    Reads consumer_key, consumer_secret, access_token and access_secret from
    the enclosing scope.
    """
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    twitter_stream = Stream(auth, TweetsListener(c_socket))
    # The closing parenthesis of this call was missing in the original.
    twitter_stream.filter(track=['india'])
Here twitter_stream.filter is filtering messages with the tag India. I want to get all the messages from Twitter; in short, I do not want any filter to be applied. Is there a way to do that?
Any help is appreciated.
- P.S.: I'm a novice in Spark Streaming and PySpark.
Twitter now offers a sample stream: https://developer.twitter.com/en/docs/tweets/sample-realtime/overview/GET_statuse_sample.html
It's fairly new so I'm not sure if the wrappers (looks like you're using Tweepy) have implemented it yet, but it shouldn't be hard to interface with.

Python Tweepy occasional exception during streaming

I'm using this code to capture the content of live streams using the 'text' key, which holds the actual tweet text delivered by Tweepy's streaming capture function. It seems to be working for the most part, but I also receive the occasional "Not Working" message as a result of the exception handler in the code when I run the script (it happens maybe once every 10 seconds or so when I search for a term that is trending heavily). The output file still captures many tweets despite this. I was wondering if anyone knew what could be causing this exception, as I'm worried I may be missing the occasional tweet as a result of it.
class MyListener(StreamListener):
    """Listener that appends the ``text`` field of each message to a file."""

    def on_data(self, data):
        """Parse ``data`` as JSON and append its ``text`` to ``python.json``.

        Returns True whether or not the message could be processed. On
        failure it prints "Not Working" followed by the exception, so the
        cause (for example a message with no ``text`` key) is visible.
        """
        try:
            with open('python.json', 'a') as f:
                datadict = json.loads(data)
                f.write(datadict['text'])
        except Exception as e:
            print("Not Working")
            # Show what actually went wrong instead of discarding it.
            print("  reason: %r" % (e,))
        return True

    def on_error(self, status):
        """Print the error ``status`` and return True."""
        print(status)
        return True
# NOTE(review): `auth` is not defined in this snippet; it has to be created
# before this line runs.
twitter_stream = Stream(auth, MyListener())
# Double quotes: the apostrophe inside the term ended the original
# single-quoted literal early.
twitter_stream.filter(track=["Search_term_go's_here"])
You might have accessed a tweet that is private. You can check the exception by trying this:
except tweepy.TweepError as e:
print e
print type(e)
print e.__dict__
print e.reason
print type(e.reason)

Tweepy stream acting really slow

I'm trying to stream tweets from Twitter using Tweepy for a particular hashtag. The problem I'm facing is that fetching 500 tweets takes around 10-15 minutes. I don't think it is supposed to be that slow. Am I missing anything? Does it have to do with any API rate limits? My tweepy listener looks like this:
class MyListener(StreamListener):
    """Custom StreamListener for streaming data."""

    def __init__(self, lim):
        # Let the tweepy base class run its own setup before adding our state.
        super(MyListener, self).__init__()
        self.count = 0      # messages accepted so far
        self.limit = lim    # stop once this many have been collected

    def on_data(self, data):
        """Append ``data`` to the module-level ``tweets`` list.

        Returns False once ``self.limit`` messages have been collected and
        True otherwise, including after a reported failure.
        """
        global tweets
        if self.count >= self.limit:
            return False
        try:
            self.count += 1
            tweets.append(data)
        except Exception as e:
            # `except ... as e` and print() are valid on Python 2.6+ and 3;
            # the printed text is the same as before.
            print('failed ondata, %s' % str(e))
            time.sleep(5)
        return True

    def on_error(self, status):
        """Print the error ``status`` and return True."""
        print(status)
        return True
You are trying to fetch live tweets. That means the rate at which you collect tweets is the rate at which people post tweets with that hashtag. Try your code with a popular or trending hashtag and you will get output faster.

Categories

Resources