I have the following code that retrieves Twitter streaming data and creates a JSON file. What I'd like is to stop collecting data after, for example, 1000 tweets. How can I change the code to do that?
# Import the necessary methods from the tweepy library
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

# Other libs
import json

# Variables that contain the user credentials to access the Twitter API
access_token = "XXX"
access_token_secret = "XXX"
consumer_key = "XXX"
consumer_secret = "XXX"

# This is a basic listener that appends received tweets to a JSON file.
class StdOutListener(StreamListener):

    def on_data(self, data):
        try:
            tweet = json.loads(data)
            with open('your_data.json', 'a') as my_file:
                json.dump(tweet, my_file)
        except BaseException:
            print('Error')
            pass

    def on_error(self, status):
        print("Error " + str(status))
        if status == 420:
            print("Rate Limited")
            return False

if __name__ == '__main__':
    # This handles Twitter authentication and the connection to the Twitter Streaming API
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, l)
    stream.filter(track=['Euro2016', 'FRA', 'POR'], languages=['en'])
Here is a possible solution:
import sys  # needed for sys.exit() below

class StdOutListener(StreamListener):
    tweet_number = 0  # class variable

    def __init__(self, max_tweets):
        self.max_tweets = max_tweets  # max number of tweets

    def on_data(self, data):
        self.tweet_number += 1
        try:
            tweet = json.loads(data)
            with open('your_data.json', 'a') as my_file:
                json.dump(tweet, my_file)
        except BaseException:
            print('Error')
            pass
        if self.tweet_number >= self.max_tweets:
            sys.exit('Limit of ' + str(self.max_tweets) + ' tweets reached.')

    def on_error(self, status):
        print("Error " + str(status))
        if status == 420:
            print("Rate Limited")
            return False

l = StdOutListener(1000)  # Here you can set your maximum number of tweets (1000 in this example)
After defining the class variable tweet_number, I used the __init__() method to initialize a new StdOutListener object with the maximum number of tweets you want to collect. tweet_number is increased by 1 each time the on_data(data) method is called, so the program terminates once tweet_number >= max_tweets.
P.S. You need to import sys for the code to work.
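If you would rather not kill the interpreter with sys.exit(), a gentler variant (just a sketch, relying on the tweepy 3.x behaviour that returning False from on_data disconnects the stream, and on the same imports as above) is to return False once the limit is reached, so stream.filter() simply returns and the rest of your script keeps running:

class StdOutListener(StreamListener):

    def __init__(self, max_tweets):
        self.max_tweets = max_tweets
        self.tweet_number = 0

    def on_data(self, data):
        self.tweet_number += 1
        try:
            tweet = json.loads(data)
            with open('your_data.json', 'a') as my_file:
                json.dump(tweet, my_file)
        except BaseException:
            print('Error')
        if self.tweet_number >= self.max_tweets:
            print('Limit of ' + str(self.max_tweets) + ' tweets reached.')
            # Returning False tells tweepy to disconnect the stream.
            return False
        return True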
This is the Python 2.7 code I would use -- sorry, I do not know Python 3 as well. I think you want what is on my second line, the .items(1000) part.
Stack Overflow messed up the indentation in my code. I am also using tweepy.
CODE:
results = []

# THE 1000 IS WHERE YOU SAY SEARCH FOR 1000 TWEETS.
for tweet in tweepy.Cursor(api.search, q='%INSERT_SEARCH_VARIABLE HERE').items(1000):
    results.append(tweet)

print type(results)
print len(results)

def toDataFrame(tweets):
    DataSet = pd.DataFrame()

    DataSet['tweetID'] = [tweet.id for tweet in tweets]
    DataSet['tweetText'] = [tweet.text for tweet in tweets]
    DataSet['tweetRetweetCt'] = [tweet.retweet_count for tweet in tweets]
    DataSet['tweetFavoriteCt'] = [tweet.favorite_count for tweet in tweets]
    DataSet['tweetSource'] = [tweet.source for tweet in tweets]
    DataSet['tweetCreated'] = [tweet.created_at for tweet in tweets]

    DataSet['userID'] = [tweet.user.id for tweet in tweets]
    DataSet['userScreen'] = [tweet.user.screen_name for tweet in tweets]
    DataSet['userName'] = [tweet.user.name for tweet in tweets]
    DataSet['userCreateDt'] = [tweet.user.created_at for tweet in tweets]
    DataSet['userDesc'] = [tweet.user.description for tweet in tweets]
    DataSet['userFollowerCt'] = [tweet.user.followers_count for tweet in tweets]
    DataSet['userFriendsCt'] = [tweet.user.friends_count for tweet in tweets]
    DataSet['userLocation'] = [tweet.user.location for tweet in tweets]
    DataSet['userTimezone'] = [tweet.user.time_zone for tweet in tweets]

    return DataSet

# Pass the tweets list to the above function to create a DataFrame
tweet_frame = toDataFrame(results)
tweet_frame[0:999]
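For that snippet to run, it assumes tweepy and pandas are imported and that api is an authenticated client. A minimal setup sketch (the credentials below are placeholders, not real keys) might look like this:

import tweepy
import pandas as pd

# Placeholder credentials -- substitute your own keys.
auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
auth.set_access_token('ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')
api = tweepy.API(auth, wait_on_rate_limit=True)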
Related
I am learning Twitter sentiment analysis in Python using matplotlib. Currently the results (1 for positive, 0 for neutral and -1 for negative) are printed to the console. I want to display these results in graphical form using matplotlib. Can anyone please help me with that?
The code I have is listed below.
from tweepy import API
from tweepy import Cursor
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from textblob import TextBlob

import twitter_credentials
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re


# # # # TWITTER CLIENT # # # #
class TwitterClient:
    def __init__(self, twitter_user=None):
        self.auth = TwitterAuthenticator().authenticate_twitter_app()
        self.twitter_client = API(self.auth)
        self.twitter_user = twitter_user

    def get_twitter_client_api(self):
        return self.twitter_client

    def get_user_timeline_tweets(self, num_tweets):
        tweets = []
        for tweet in Cursor(self.twitter_client.user_timeline,
                            id=self.twitter_user).items(num_tweets):
            tweets.append(tweet)
        return tweets

    def get_friend_list(self, num_friends):
        friend_list = []
        for friend in Cursor(self.twitter_client.friends,
                             id=self.twitter_user).items(num_friends):
            friend_list.append(friend)
        return friend_list

    def get_home_timeline_tweets(self, num_tweets):
        home_timeline_tweets = []
        for tweet in Cursor(self.twitter_client.home_timeline,
                            id=self.twitter_user).items(num_tweets):
            home_timeline_tweets.append(tweet)
        return home_timeline_tweets


# # # # TWITTER AUTHENTICATOR # # # #
class TwitterAuthenticator:
    def authenticate_twitter_app(self):
        auth = OAuthHandler(twitter_credentials.CONSUMER_KEY,
                            twitter_credentials.CONSUMER_SECRET)
        auth.set_access_token(twitter_credentials.ACCESS_TOKEN,
                              twitter_credentials.ACCESS_TOKEN_SECRET)
        return auth


# # # # TWITTER STREAMER # # # #
class TwitterStreamer:
    """
    Class for streaming and processing live tweets.
    """
    def __init__(self):
        self.twitter_autenticator = TwitterAuthenticator()

    def stream_tweets(self, fetched_tweets_filename, hash_tag_list):
        # This handles Twitter authentication and the connection to the Twitter Streaming API
        listener = TwitterListener(fetched_tweets_filename)
        auth = self.twitter_autenticator.authenticate_twitter_app()
        stream = Stream(auth, listener)
        # This line filters Twitter Streams to capture data by the keywords:
        stream.filter(track=hash_tag_list)


# # # # TWITTER STREAM LISTENER # # # #
class TwitterListener(StreamListener):
    """
    This is a basic listener that prints received tweets to stdout and appends them to a file.
    """
    def __init__(self, fetched_tweets_filename):
        self.fetched_tweets_filename = fetched_tweets_filename

    def on_data(self, data):
        try:
            print data
            with open(self.fetched_tweets_filename, 'a') as tf:
                tf.write(data)
            return True
        except BaseException, e:
            print 'Error on_data %s' % str(e)
        return True

    def on_error(self, status):
        if status == 420:
            # Returning False in case a rate limit occurs.
            return False
        print status


class TweetAnalyzer:
    """
    Functionality for analyzing and categorizing content from tweets.
    """
    def clean_tweet(self, tweet):
        return ' '.join(re.sub("(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
                               ' ', tweet).split())

    def analyze_sentiment(self, tweet):
        analysis = TextBlob(self.clean_tweet(tweet))
        if analysis.sentiment.polarity > 0:
            return 1
        elif analysis.sentiment.polarity == 0:
            return 0
        else:
            return -1

    def tweets_to_data_frame(self, tweets):
        df = pd.DataFrame(data=[tweet.text for tweet in tweets],
                          columns=['tweets'])
        df['id'] = np.array([tweet.id for tweet in tweets])
        df['len'] = np.array([len(tweet.text) for tweet in tweets])
        df['date'] = np.array([tweet.created_at for tweet in tweets])
        df['source'] = np.array([tweet.source for tweet in tweets])
        df['likes'] = np.array([tweet.favorite_count for tweet in tweets])
        df['retweets'] = np.array([tweet.retweet_count for tweet in tweets])
        return df


if __name__ == '__main__':
    twitter_client = TwitterClient()
    tweet_analyzer = TweetAnalyzer()

    api = twitter_client.get_twitter_client_api()
    tweets = api.user_timeline(screen_name='realDonaldTrump', count=200)

    df = tweet_analyzer.tweets_to_data_frame(tweets)
    df['sentiment'] = np.array([tweet_analyzer.analyze_sentiment(tweet)
                                for tweet in df['tweets']])

    print df.head(10)
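One simple way to turn those console numbers into a chart (a minimal sketch, assuming the df built by tweets_to_data_frame above and standard pandas/matplotlib plotting, not a complete solution) is to plot the counts of each sentiment value:

# Count how many tweets fall into each sentiment class and draw a bar chart.
sentiment_counts = df['sentiment'].value_counts().sort_index()
sentiment_counts.plot(kind='bar')
plt.xlabel('Sentiment (-1 = negative, 0 = neutral, 1 = positive)')
plt.ylabel('Number of tweets')
plt.title('Tweet sentiment distribution')
plt.show()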
I'm new to Python programming and the Twitter API.
I tried to collect tweets with a hashtag from a specific time period (say 11/24/2016-11/27/2017); my goal is to get the coordinates from those extracted tweets and save the coordinates and the tweet text into a CSV file.
But my problem is that I don't know how to set the time filter or how to save the results into a file. What's more, only a few tweets contained coordinates; is that common?
Here is the Python script that I found online.
import json
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener

# Enter Twitter API Key information
consumer_key = ''
consumer_secret = ''
access_token = ''
access_secret = ''

file = open("C:\\Output.csv", "w")  # This script didn't work on my Mac
file.write("X,Y\n")
data_list = []
count = 0
class listener(StreamListener):
def on_data(self, data):
global count
#How many tweets you want to find, could change to time based
if count <= 2000:
json_data = json.loads(data)
coords = json_data["coordinates"]
if coords is not None:
print coords["coordinates"]
lon = coords["coordinates"][0]
lat = coords["coordinates"][1]
data_list.append(json_data)
file.write(str(lon) + ",")
file.write(str(lat) + "\n")
count += 1
return True
else:
file.close()
return False
def on_error(self, status):
print status
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
twitterStream = Stream(auth, listener())
#What you want to search for here
twitterStream.filter(track=[""])
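On saving the tweet text together with the coordinates: here is a small sketch of how the on_data body could write both into the CSV. It uses the csv module (my choice, not part of the original script) so that commas inside the tweet text are quoted and don't break the columns:

import csv
import json

# Sketch: write longitude, latitude and the tweet text as one CSV row.
def write_tweet_row(csv_writer, raw_data):
    json_data = json.loads(raw_data)
    coords = json_data.get("coordinates")
    if coords is not None:
        lon, lat = coords["coordinates"]
        csv_writer.writerow([lon, lat, json_data.get("text", "")])

# Usage: open the file once, write a header, then call write_tweet_row()
# from inside on_data for every incoming tweet, e.g.:
# out_file = open("Output.csv", "w")
# writer = csv.writer(out_file)
# writer.writerow(["X", "Y", "text"])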
With the following piece of code I open a CSV file and write the text of each tweet in the first column.
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time

ckey = ''
csecret = ''
atoken = ''
asecret = ''

class listener(StreamListener):

    def on_data(self, data):
        try:
            #print data
            text = data.split(',"text":"')[1].split('","source')[0]
            print text
            saveThis = str(time.time()) + '::' + text
            saveFile = open('tweets3.csv', 'a')
            saveFile.write(saveThis)
            saveFile.write('\n')
            saveFile.close()
        except BaseException, e:
            print 'failed on data', str(e)
            time.sleep(5)
        return True

    def on_error(self, status):
        print status

auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)

twitterStream = Stream(auth, listener())
result = twitterStream.filter(track=["zika"], languages=['en'])
My output looks like this. I want to store, for each tweet, the username of the person who posted it in column B and their follower count in column C. Can anybody help?
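Since the streaming payload is JSON, one way to get at the username and follower count (a sketch that parses the payload with json.loads instead of string splitting; the exact column layout and file name are my assumptions) is:

import csv
import json
import time

# Sketch: parse the payload and write time/text (column A), username (column B)
# and follower count (column C) as one CSV row.
def save_row(raw_data, filename='tweets3.csv'):
    tweet = json.loads(raw_data)
    row = [
        str(time.time()) + '::' + tweet.get('text', ''),  # column A
        tweet['user']['screen_name'],                     # column B
        tweet['user']['followers_count'],                 # column C
    ]
    with open(filename, 'a') as save_file:
        csv.writer(save_file).writerow(row)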
I'm using the split function and it fails on the data, giving me this error:
IndexError: list index out of range
The data looks like this:
1457410676.51::RT #robbwolf: Earthquake in Reno! Wacky. 1457410777.98::13:19 #\u5730\u9707 #earthquake https:\/\/t.co\/2X8FaipZsW 1457410814.04::1.7 magnitude #earthquake. 12 mi from Anza, #CA, United States https:\/\/t.co\/1GWexXmLom 1457410819.04::1.7 magnitude #earthquake. 12 mi from #Anza, CA, United States https:\/\/t.co\/fL5MDx7bhS
Code:
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time

ckey = ''
csecret = ''
atoken = ''
asecret = ''

class Listener(StreamListener):

    def on_data(self, data):
        try:
            tweet = data.split(',"text":"')[1].split('","source')[0]
            location = data.split(',"location":"')[1].split('","url')[0]
            saveThis = str(time.time()) + '::' + tweet
            saveFile = open('locandtext.csv', 'a')
            saveFile.write(saveThis)
            saveFile.write('\n')
            saveFile.close()
            return True
        except BaseException, e:
            print 'Failed on data', str(e)
            time.sleep(5)

    def on_error(self, status):
        print status

auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)

twitterStream = Stream(auth, Listener())
twitterStream.filter(track=["Earthquake"])
The value of data doesn't contain the token ,"text":", so the result of split is a one-item list holding the entire text, and index [1] is out of range.
To avoid the IndexError, verify that ,"text":" appears in data before splitting:
token = ',"text":"'
if token in data:
    text = data.split(token)[1]
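A more robust variant (a sketch, on the assumption that the payload is the usual streaming JSON) is to parse the message with json.loads and skip anything that is not a tweet, since the stream also delivers messages such as delete notices that have no "text" field:

import json

def extract_text_and_location(raw_data):
    # Parse the streaming payload; non-tweet messages lack a "text" field.
    message = json.loads(raw_data)
    if 'text' not in message:
        return None  # not a tweet (e.g. a delete notice); skip it
    return message['text'], message['user'].get('location')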
I am using the following Python code to get tweets for a particular topic.
import sys
from tweepy import *
import time
import csv

CONSUMER_KEY = ''
CONSUMER_SECRET = ''
OAUTH_TOKEN = ''
OAUTH_TOKEN_SECRET = ''

class listener(StreamListener):

    def on_data(self, data):
        try:
            saveFile = open('tweetDB2.csv', 'a')
            saveFile.write(data)
            saveFile.write('\n')
            saveFile.close()
            return True
        except BaseException as e:
            print('failed ondata,', str(e))
            time.sleep(60)

    def on_error(self, status):
        print(status)

auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

twitterStream = Stream(auth, listener())
twitterStream.filter(track=["IPL"])
How do I modify the code to get tweets for the same topic but for a different time period (say, the 2nd week of April 2015)? I went through the API parameters (https://dev.twitter.com/streaming/overview/request-parameters), but I could not find anything related to a time period. Thanks!
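For context, the Streaming API only delivers tweets in real time, so it cannot be pointed at a past week; fetching older tweets goes through the REST search endpoint instead, and the standard search index only covers roughly the last week, so April 2015 would require Twitter's paid historical products. As a sketch of the search route (assuming tweepy 3.x, its api.search method, and the standard until parameter; the query and date are illustrative):

import tweepy

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True)

# 'until' restricts results to tweets created before the given date,
# but only within the window the standard search index covers.
for tweet in tweepy.Cursor(api.search, q='IPL', until='2015-04-14').items(100):
    print(tweet.created_at, tweet.text)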