Save specific tweet fields in csv file

Save specific tweet fields in csv file - python

With the following part of code i open a csv file and write the text of a tweet in the first column
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time
ckey = ''
csecret = ''
atoken = ''
asecret = ''
class listener(StreamListener):
def on_data(self,data):
try:
#print data
text = data.split(',"text":"')[1].split('","source')[0]
print text
saveThis = str (time.time())+'::'+text
saveFile = open('tweets3.csv','a')
saveFile.write(saveThis)
saveFile.write('\n')
saveFile.close()
except BaseException, e:
print 'failed on data',str(e)
time.sleep(5)
return True
def on_error (self,status):
print status
auth = OAuthHandler (ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
result = twitterStream.filter(track=["zika"], languages=['en'])
my output looks like this
I want to store for each tweet the username of the one that makes it in column B and the number of his followers in column C. Can anybody help?

Related

Get tweets with hashtag from specific time period

I'm new to python programming and Twitter API.
I tired to collect tweets with a hashtag from a specific time period（say 11/24/216-11/27/2017), my goal is to get coordinates from those extracted tweets and save the coordinates and the tweet text into a csv file.
But my problem is that i don't know how to set the time filter and save them into a file. What's more, only a few tweets contained the coordinates, was that common?
Here are the python scripts that i found online.
import json
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
#Enter Twitter API Key information
consumer_key = ''
consumer_secret = ''
access_token = ''
access_secret = ''
file = open("C:\\Output.csv", "w") #This script didn't work on my Mac#
strong text
file.write("X,Y\n")
data_list = []
count = 0
class listener(StreamListener):
def on_data(self, data):
global count
#How many tweets you want to find, could change to time based
if count <= 2000:
json_data = json.loads(data)
coords = json_data["coordinates"]
if coords is not None:
print coords["coordinates"]
lon = coords["coordinates"][0]
lat = coords["coordinates"][1]
data_list.append(json_data)
file.write(str(lon) + ",")
file.write(str(lat) + "\n")
count += 1
return True
else:
file.close()
return False
def on_error(self, status):
print status
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
twitterStream = Stream(auth, listener())
#What you want to search for here
twitterStream.filter(track=[""])

Twitter streaming stop collecting data

I've this following code that retrieves Twitter Streaming data and crete a JSON file. What I'd like to get is to stop the data collecting after fo eg.1000 tweets. How can I set the code?
#Import the necessary methods from tweepy library
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
# Other libs
import json
#Variables that contains the user credentials to access Twitter API
access_token = "XXX"
access_token_secret = "XXX"
consumer_key = "XXX"
consumer_secret = "XXX"
#This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):
def on_data(self, data):
try:
tweet = json.loads(data)
with open('your_data.json', 'a') as my_file:
json.dump(tweet, my_file)
except BaseException:
print('Error')
pass
def on_error(self, status):
print ("Error " + str(status))
if status == 420:
print("Rate Limited")
return False
if __name__ == '__main__':
#This handles Twitter authetification and the connection to Twitter Streaming API
l = StdOutListener()
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, l)
stream.filter(track=['Euro2016', 'FRA', 'POR'], languages=['en'])

Here is a possible solution:
class StdOutListener(StreamListener):
tweet_number=0 # class variable
def __init__(self,max_tweets):
self.max_tweets=max_tweets # max number of tweets
def on_data(self, data):
self.tweet_number+=1
try:
tweet = json.loads(data)
with open('your_data.json', 'a') as my_file:
json.dump(tweet, my_file)
except BaseException:
print('Error')
pass
if self.tweet_number>=self.max_tweets:
sys.exit('Limit of '+str(self.max_tweets)+' tweets reached.')
def on_error(self, status):
print ("Error " + str(status))
if status == 420:
print("Rate Limited")
return False
l = StdOutListener(1000) # Here you can set your maximum number of tweets (1000 in this example)
After having defined the class variable tweet_number, I used the init() method to initialize a new StdOutListener object with the maximum number of tweets you want to collect. tweet_number is increased by 1 each time the on_data(data) method is called, causing the program to terminate when tweet_number>=max_tweets
P.S. You need to import sys for the code to work.

This is the 2.7 code I would use -- sorry, I do not know 3.0 as well... I think you want what is is on my second line. .items(1000) part...?
stackoverflow messed up my indentations in my code. I am also using tweepy.
CODE:
results = []
for tweet in tweepy.Cursor(api.search, q='%INSERT_SEARCH_VARIABLE HERE').items(1000): #THE 1000 IS WHERE YOU SAY SEARCH FOR 1000 TWEETS.
results.append(tweet)
print type(results)
print len(results)
def toDataFrame(tweets):
DataSet = pd.DataFrame()
DataSet['tweetID'] = [tweet.id for tweet in tweets]
DataSet['tweetText'] = [tweet.text for tweet in tweets]
DataSet['tweetRetweetCt'] = [tweet.retweet_count for tweet
in tweets]
DataSet['tweetFavoriteCt'] = [tweet.favorite_count for tweet
in tweets]
DataSet['tweetSource'] = [tweet.source for tweet in tweets]
DataSet['tweetCreated'] = [tweet.created_at for tweet in tweets]
DataSet['userID'] = [tweet.user.id for tweet in tweets]
DataSet['userScreen'] = [tweet.user.screen_name for tweet
in tweets]
DataSet['userName'] = [tweet.user.name for tweet in tweets]
DataSet['userCreateDt'] = [tweet.user.created_at for tweet
in tweets]
DataSet['userDesc'] = [tweet.user.description for tweet in tweets]
DataSet['userFollowerCt'] = [tweet.user.followers_count for tweet
in tweets]
DataSet['userFriendsCt'] = [tweet.user.friends_count for tweet
in tweets]
DataSet['userLocation'] = [tweet.user.location for tweet in tweets]
DataSet['userTimezone'] = [tweet.user.time_zone for tweet
in tweets]
return DataSet
#Pass the tweets list to the above function to create a DataFrame
tweet_frame = toDataFrame(results)
tweet_frame[0:999]

Using split function multiple times with tweepy result in IndexError: list index out of range

I'm using split function and it fails on data and gives me the error:
IndexError: list index out of range
Data equals to:
1457410676.51::RT #robbwolf: Earthquake in Reno! Wacky. 1457410777.98::13:19 #\u5730\u9707 #earthquake https:\/\/t.co\/2X8FaipZsW 1457410814.04::1.7 magnitude #earthquake. 12 mi from Anza, #CA, United States https:\/\/t.co\/1GWexXmLom 1457410819.04::1.7 magnitude #earthquake. 12 mi from #Anza, CA, United States https:\/\/t.co\/fL5MDx7bhS
Code:
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time
ckey = ''
csecret = ''
atoken = ''
asecret = ''
class Listener(StreamListener):
def on_data(self, data):
try:
tweet = data.split(',"text":"')[1].split('","source')[0]
location = data.split(',"location":"')[1].split('","url')[0]
saveThis=str(time.time())+ '::' + tweet
saveFile = open('locandtext.csv','a')
saveFile.write(saveThis)
saveFile.write('\n')
saveFile.close()
return True
except BaseException, e:
print 'Failed on data' , str(e)
time.sleep(5)
def on_error(self, status):
print status
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream (auth , Listener())
twitterStream.filter(track=["Earthquake"] )

The value of data doesn't contain the token ,"text":", therefore the result of split is one item list - which includes the entire text.
To avoid IndexError verify ,"text":" appears in data:
token = ',"text":"'
if token in data:
text = data.split(token)[1]

How to encode/decode Tweets in Hindi language from Twitter to display in Hindi font when written to a file?

I am trying to crawl Twitter for Hindi Tweets using Hindi emotion words(eg.खुशी, गुस्सा) to get Tweets with these words using python 2.7. I am using the Streaming API and the code for it is below
import codecs
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
access_token = "xxxxxxxxxxxxxxxx"
access_token_secret = "xxxxxxxxxxxxxxx"
consumer_key = "xxxxxxxxxxxxxxxx"
consumer_secret = "xxxxxxxxxxxxxxxxx"
class StdOutListener(StreamListener):
def on_data(self, data):
print data
saveFile = codecs.open('TweetPrjkhushh.txt', 'a', 'utf-8')
saveFile.write(data)
saveFile.write('\n')
saveFile.close()
return True
def on_error(self, status):
print status
if __name__ == '__main__':
l = StdOutListener()
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, l)
t = u"खुशी"
stream.filter(languages=["hi"],track=[t])
I get tweets text in Unicode like this:
{"text":"RT #guru9899: \u092f\u0947 \u092c\u0947\u091c\u093e\u0928 \u0928\u0947 \u092c\u094b\u0932\u093e \u092f\u093e #abpnewshindi \u0915\u0940 \u092e\u0941\u0939\u0940\u092e \u0939\u0948 ??? \u0939\u093e\u0925 \u0935\u093e\u092a\u0938 \u092d\u0940 \u0924\u094b \u0916\u0940\u0902\u091a \u0938\u0915\u0924\u0947 \u0925\u0947 ??? \u091c\u092c\u0930\u0926\u0938\u094d\u0924\u0940 \u0925\u094b\u0921\u093c\u0940 \u0939\u0948 \ud83d\ude02\ud83d\ude02\ud83d\ude02 https:\/\/t.co\/BE0gSEj\u2026"}
I want to display it in Hindi font when we open the file where I am saving the tweets, but using codecs and utf-8 encoding while saving doesn't help. What am I missing here?

data is a dict.
Change your code to specify the key entry of data:
def on_data(self, data):
print data["text"]
saveFile = codecs.open('TweetPrjkhushh.txt', 'a', 'utf-8')
saveFile.write(data["text"])
saveFile.write('\n')
saveFile.close()
return True

how to stream tweets for a particular time period in the past

I am using the following python code to get tweets for a particular topic
import sys
from tweepy import *
import time
import csv
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
OAUTH_TOKEN = ''
OAUTH_TOKEN_SECRET = ''
class listener(StreamListener):
def on_data(self,data):
try:
saveFile=open('tweetDB2.csv','a')
saveFile.write(data)
saveFile.write('\n')
saveFile.close()
return True
except BaseException as e:
print('failed ondata,',str(e))
time.sleep(60)
def on_error(self,status):
print(status)
auth = OAuthHandler(CONSUMER_KEY,CONSUMER_SECRET)
auth.set_access_token(OAUTH_TOKEN,OAUTH_TOKEN_SECRET)
twitterStream = Stream(auth,listener())
twitterStream.filter(track=["IPL"])
How do I modify the code to get tweets for the same topic but for a different time period (say 2nd week of April,2015)? I went through the API parameters(https://dev.twitter.com/streaming/overview/request-parameters).But I could not find anything with respect to time period. Thanks!

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Save specific tweet fields in csv file - python

Related

Get tweets with hashtag from specific time period

Twitter streaming stop collecting data

Using split function multiple times with tweepy result in IndexError: list index out of range

How to encode/decode Tweets in Hindi language from Twitter to display in Hindi font when written to a file?

how to stream tweets for a particular time period in the past

Categories

Resources