Maybe you can help me. The following Python code retrieves Twitter streaming data and stops after 1000 tweets have been collected. It works, but it returns the fields created_at, screen_name, and text separated by tabs. Instead, I'd like to get the data in JSON format. How can I change the code so that the output is formatted as JSON?
# Import the necessary package to process data in JSON format
try:
    import json
except ImportError:
    import simplejson as json

# Import the necessary methods from the "twitter" library
from twitter import Twitter, OAuth, TwitterHTTPError, TwitterStream

# Variables that contain the user credentials to access the Twitter API
# (placeholders -- never post real keys publicly)
CONSUMER_KEY = 'XXXX'
CONSUMER_SECRET = 'XXXX'
ACCESS_TOKEN = 'XXXX'
ACCESS_SECRET = 'XXXX'

oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)

# Initiate the connection to the Twitter Streaming API
twitter_stream = TwitterStream(auth=oauth)

# Get a sample of the public data flowing through Twitter
#iterator = twitter_stream.statuses.sample()
iterator = twitter_stream.statuses.filter(track="Euro2016", language="fr")

tweet_count = 1000
for tweet in iterator:
    tweet_count -= 1
    print(tweet['created_at'], "\t", tweet['user']['screen_name'], "\t", tweet['geo'], "\t", tweet['text'])
    if tweet_count <= 0:
        break
You can use tweepy (you need to install it first with pip) and override the listener class so that it outputs the data in JSON format. Here is an example:
import json

from tweepy import Stream, OAuthHandler
from tweepy.streaming import StreamListener

# Listener class override
class listener(StreamListener):

    def on_data(self, data):
        try:
            tweet = json.loads(data)
            with open('your_data.json', 'a') as my_file:
                json.dump(tweet, my_file)
                my_file.write('\n')  # one JSON object per line
        except BaseException:
            print('Error')
        return True

    def on_error(self, status):
        print(status)

# auth must be a tweepy OAuthHandler built from your credentials
# (the OAuth object from the "twitter" library will not work here)
auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

my_listener = listener()
twitterStream = Stream(auth, my_listener)  # Initialize Stream object
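Note that the snippet above never actually starts the stream; to begin receiving data you still need to call filter() on the Stream object. A minimal sketch, reusing the track term from the question:

# Start streaming tweets that match the question's track term
twitterStream.filter(track=["Euro2016"], languages=["fr"])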
You can read more about tweepy here: http://docs.tweepy.org/en/v3.4.0/streaming_how_to.html
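Alternatively, if you want to stay with the twitter library from your question: each item the iterator yields is already a parsed Python dict, so you can simply serialize it with json.dumps instead of printing individual fields. A minimal sketch of the modified loop:

import json

tweet_count = 1000
for tweet in iterator:
    tweet_count -= 1
    # Dump the whole tweet dict as one JSON string per line
    print(json.dumps(tweet))
    if tweet_count <= 0:
        break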
I am trying to fetch user metadata using tweepy by user screen name and save the result as a JSON file. Here is my code:
import tweepy
from tweepy import Stream
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler

CONSUMER_KEY = 'xxx'
CONSUMER_SECRET = 'xxx'
ACCESS_KEY = 'xxx'
ACCESS_SECRET = 'xxx'

auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
api = tweepy.API(auth)

class TweetListener(StreamListener):
    # A listener handles tweets that are received from the stream.
    # This is a basic listener that just prints received tweets to standard output.

    def on_data(self, data):
        print(data)
        return True

    def on_error(self, status):
        print(status)

# search
twitterStream = Stream(auth, TweetListener())

# name is a list that contains user screen names
test = api.lookup_users(screen_names=name)
for user in test:
    print(user.screen_name)
    print(user.name)
    print(user.description)
    print(user.followers_count)
    print(user.statuses_count)
My code runs without any error and, as you can see, I am printing the data, but my intention is to save it in a JSON file.
I tried different approaches but nothing worked for me. Any help would be appreciated.
JSON is just a format for saving data with a specific structure.
In order to save your data, you first need to collect it into a suitable Python object, and then simply dump it.
A natural structure here is a dict, which holds data separated by keys, where each key maps to a value.
In your case, I chose 'users' as the top-level key.
Each user in 'users' is itself a dict with keys (name, description, etc.) holding the values from your list.
You need to do something like this:
import json

data = {'users': []}
for user in test:
    data['users'].append({
        'screen_name': user.screen_name,
        'name': user.name,
        'description': user.description,
        'followers_count': user.followers_count,
        'statuses_count': user.statuses_count
    })

with open('data.txt', 'w') as outfile:
    json.dump(data, outfile)
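To verify the result, you can load the file back with json.load; a quick sketch:

import json

with open('data.txt') as infile:
    data = json.load(infile)

for user in data['users']:
    print(user['screen_name'], user['followers_count'])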
Hi, I have looked at many guides and tutorials on how to do this, but I am having trouble using tweepy to store the JSON data in a text file.
class StreamListener(tweepy.StreamListener):

    def on_status(self, status):
        print(status)

    def on_error(self, status):
        print(status)
        if status == 420:
            return False

if __name__ == '__main__':
    stream_listener = StreamListener()
    auth = tweepy.OAuthHandler(consumer_token, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = tweepy.Stream(auth, stream_listener)
I have another Python file which is supposed to read the data into a list:
import pandas
import json

json_data = 'twitter_data.txt'
data_list = []

# load file
tweets_file = open(json_data, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)  # this line causes problems
        data_list.append(tweet)
    except ValueError:
        continue

print(len(data_list))
I thought the data received from Twitter comes in JSON format, and the guides I'm following all say it does, but it's actually another kind of object.
Should I just store everything in a list and then json.dump that list into the new text file?
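A likely source of the confusion, assuming tweepy v3: on_status hands you a tweepy Status model object, while on_data hands you the raw JSON string exactly as it came off the wire. If you want one JSON object per line in a text file, on_data is the simpler hook; a minimal sketch:

import tweepy

class JsonListener(tweepy.StreamListener):

    def on_data(self, data):
        # data is the raw JSON string for one tweet
        with open('twitter_data.txt', 'a') as f:
            f.write(data.strip() + '\n')  # ensure one object per line
        return True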
It seems like you're on the right track. You can modify the stream listener to write tweets to a file directly.
Edit: this now writes out in JSON format.
# Import the necessary methods from the tweepy library
import json

from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy import API

# Variables that contain the user credentials to access the Twitter API
CONSUMER_KEY = "YOUR CONSUMER KEY"
CONSUMER_SECRET = "YOUR CONSUMER SECRET"
ACCESS_TOKEN = "YOUR ACCESS TOKEN"
ACCESS_TOKEN_SECRET = "YOUR ACCESS TOKEN SECRET"

class FileWriteListener(StreamListener):

    def __init__(self):
        super(FileWriteListener, self).__init__()
        self.save_file = open('tweets.json', 'w')
        self.tweets = []

    def on_data(self, tweet):
        self.tweets.append(json.loads(tweet))
        self.save_file.write(str(tweet))
        return True

    def on_error(self, status):
        print(status)
        return True

auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = API(auth)

twitter_stream = Stream(auth, FileWriteListener())

# Here you can filter the stream by:
# - keywords (as shown)
# - users
twitter_stream.filter(track=['hello'])
This code will run indefinitely, so you either need to exit the process after some time (Ctrl-C) or modify the code.
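One way to modify it, as a hedged sketch: count tweets in on_data and return False once a limit is reached, which tells tweepy to disconnect the stream.

from tweepy.streaming import StreamListener

class LimitedListener(StreamListener):

    def __init__(self, limit=1000):
        super(LimitedListener, self).__init__()
        self.limit = limit
        self.count = 0

    def on_data(self, tweet):
        with open('tweets.json', 'a') as f:
            f.write(tweet)
        self.count += 1
        # Returning False makes tweepy disconnect the stream
        return self.count < self.limit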
Then you can load the data:
import json

json_data = []
with open('tweets.json', 'r') as f:
    for line in f:
        json_data.append(json.loads(line))
Hope this helps!
I think something like this may be what you're looking for.
def on_status(self, tweet):
    json_dumps = json.dumps(tweet._json)
    tweet_json = json.loads(json_dumps)
    print(tweet_json['created_at'])
These are all the keys you can use with tweet_json[]:
dict_keys(['created_at', 'id', 'id_str', 'text', 'source', 'truncated', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'retweeted_status', 'is_quote_status', 'quote_count', 'reply_count', 'retweet_count', 'favorite_count', 'entities', 'favorited', 'retweeted', 'filter_level', 'lang', 'timestamp_ms'])
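Note that several of these keys hold nested objects; assuming the standard v1.1 tweet payload, 'user' is itself a dict, for example:

# 'user' is a nested dict inside the tweet payload
print(tweet_json['user']['screen_name'])
print(tweet_json['user']['followers_count'])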
This code only streams data. I want to pull data for a three-day time frame.
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener

# will pass key and token
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

class TweetsListener(StreamListener):

    def on_data(self, data):
        try:
            with open('name.json', 'a') as f:
                f.write(data)
            return True
        except BaseException as e:
            print("Error on_data: %s" % str(e))
        return True

    def on_error(self, status):
        print(status)
        return True

twitter_stream = Stream(auth, TweetsListener())
twitter_stream.filter(track=['XX'])
You are using the Twitter Streaming API, which captures future tweets matching the given criteria (on_data is called whenever a tweet containing the given search terms is posted). To search past data you need the Twitter REST API instead; see tweepy's API.search() method.
Parameters:
q – the search query string
since_id – returns only statuses with an ID greater than (that is, more recent than) the specified ID
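A rough sketch of what that could look like with tweepy v3, assuming auth is already set up as in your script; Cursor pages through the REST search results, and note that the standard search endpoint only reaches back about seven days:

import tweepy

api = tweepy.API(auth, wait_on_rate_limit=True)

# Page through past tweets matching the query instead of streaming future ones
for tweet in tweepy.Cursor(api.search, q='XX', count=100).items(1000):
    print(tweet.created_at, tweet.text)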
I've been playing with the Twitter Streaming API using the Tweepy library. I started by following my own account and streaming my own tweets as I posted them, which worked fine.
I then attempted to stream a fairly large region's tweets ([30,-85,31,-84]), from which I initially seemed to receive no data. I then started receiving 'Location Deletion Notices', or 'scrub_geo' messages, and have only ever received those since. I changed my code back to the previously working follow code, but I continue to receive 'scrub_geo' messages and not statuses from my profile.
Here's the script I'm using:
# Import the necessary methods from the tweepy library
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

# Other libs
import json

# Variables that contain the user credentials to access the Twitter API
access_token = "<my_access_token>"
access_token_secret = "<my_secret_token>"
consumer_key = "<my_consumer_key>"
consumer_secret = "<my_consumer_secret>"

# This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):

    def on_data(self, data):
        #try:
        #    json_data = json.loads(data)
        #    print(json_data['created_at'] + " " + json_data['text'])
        #except:
        print("Data " + str(data))
        return True

    def on_error(self, status):
        print("Error " + str(status))
        if status == 420:
            print("420 error.")
            return False

if __name__ == '__main__':
    # This handles Twitter authentication and the connection to the Twitter Streaming API
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, l)

    # Start streaming with the right parameters
    #tallahassee = [30, -85, 31, -84]
    #stream.filter(locations=tallahassee)  # <---- previously used
    stream.filter(follow="<my_user_id>")
Your coordinates are reversed. Since we're dealing with GeoJSON, always use (long, lat, alt) or (x, y, z) order.
So you'll need to provide tallahassee = [-85, 30, -84, 31]. Always provide longitude first, just as you would write (x, y) in math.
Some places, like Google Maps, put latitude first; you just have to be careful about which format you're dealing with.
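Applied to the script above, the corrected bounding box call would look like this (same coordinates, reordered):

# locations takes [sw_long, sw_lat, ne_long, ne_lat]
tallahassee = [-85, 30, -84, 31]
stream.filter(locations=tallahassee)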
I'm doing Twitter sentiment research at the moment. For this reason, I'm using the Twitter API to download all tweets on certain keywords. But my current code is taking a lot of time to build a large data file, so I was wondering if there's a faster method.
This is what I'm using right now:
__author__ = 'gerbuiker'
import time

# Import the necessary methods from the tweepy library
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

# Variables that contain the user credentials to access the Twitter API
access_token = "XXXXXXXXXXXXX"
access_token_secret = "XXXXXXXX"
consumer_key = "XXXXX"
consumer_secret = "XXXXXXXXXXXXXX"

# This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):

    def on_data(self, data):
        try:
            #print(data)
            tweet = data.split(',"text":"')[1].split('","source')[0]
            print(tweet)
            saveThis = str(time.time()) + '::' + tweet  # saves time + actual tweet
            saveFile = open('twitiamsterdam.txt', 'a')
            saveFile.write(saveThis)
            saveFile.write('\n')
            saveFile.close()
            return True
        except BaseException as e:
            print('failed ondata,', str(e))
            time.sleep(5)

    def on_error(self, status):
        print(status)

if __name__ == '__main__':
    # This handles Twitter authentication and the connection to the Twitter Streaming API
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, l)

    # This line filters Twitter Streams to capture data by the keyword 'Amsterdam'
    stream.filter(track=['KEYWORD which i want to check'])
This gets me about 1500 tweets in one hour for a pretty popular keyword (Amsterdam). Does anyone know a faster method in Python?
To be clear: I want to download all tweets on a certain subject for, say, the last month or year. So the newest tweets don't have to keep coming in; the most recent ones for a period would be sufficient. Thanks!
I need something similar to this for academic research.
Were you able to fix it?
Would it be possible to specify a custom range of time from which to pull the data?
Sorry for asking here, but I couldn't send you private messages.
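On the custom time range: a hedged sketch using tweepy v3's REST search rather than the stream. The standard search endpoint accepts an until date (and a since: operator inside the query string), but it only covers roughly the last seven days; anything older requires Twitter's premium/enterprise APIs or a third-party archive.

import tweepy

api = tweepy.API(auth, wait_on_rate_limit=True)

# Search a date window; 'until' is exclusive and limited to ~7 days back
for tweet in tweepy.Cursor(api.search,
                           q='Amsterdam since:2016-06-01',
                           until='2016-06-04',
                           count=100).items():
    print(tweet.created_at, tweet.text)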