How to decode a text file - python

I have this code here and it work perfectly.
# encoding=utf8
#Import the necessary methods from tweepy library
import sys
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
reload(sys)
sys.setdefaultencoding('utf8')
#Variables that contains the user credentials to access Twitter API
access_token = ""
access_token_secret = ""
consumer_key = ""
consumer_secret = ""
#This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):
def on_data(self, data):
#save data
with open('debate_data.txt', 'a') as tf:
tf.write((data).decode('unicode-escape').encode('utf-8'))
return True
def on_error(self, status):
print status
if __name__ == '__main__':
#This handles Twitter authetification and the connection to Twitter Streaming API
l = StdOutListener()
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, l)
#This line filter Twitter Streams to capture data by the keywords: 'Bernier', 'Rossello', 'Bernabe'
stream.filter(track=['Bernier', 'Rosselló', 'Rossello', 'Bernabe', 'Lúgaro', 'Lugaro', 'María de Lourdes', 'Maria de Lourdes', 'Cidre'])
But when I run this other piece of code I get the wrong answer.
import json
import io
#save the tweets to this path
tweets_data_path = 'debate_data.txt'
tweets_data = []
with io.open(tweets_data_path, 'r') as tweets_file:
for line in tweets_file:
try:
tweet = json.loads(line)
tweets_data.append(tweet)
except:
continue
print len(tweets_data)
There are 42,188 Tweets on that file, but when I run the code Im only getting 291. I think is something with the encoding/decoding but I cant figure out what. Any help would be greatly appreciate.
I ran this example without any of the encoding/decoding and it worked perfectly.
http://adilmoujahid.com/posts/2014/07/twitter-analytics/

The reason of only getting 291 is the json.loads() throw some errors and except continue it.
I suggest you print the error just like:
except Exception as err:
print err
continue
now you know the error reason, and solve it.
Are you sure the format of data inside debate_data.txt are json ?

As agnewee said, I also recommend:
try:
tweet = json.loads(line)
except Exception as err:
print err # or use log to see what happen
else:
tweets_data.append(tweet)

Related

Using Tweepy to get live tweets from single user

I'm trying to retrieve tweets from a single user_id as per the JSON data tweepy fetches.
The issue is im also retrieving all mentions of the user_id i.e. other people mentioning the user_id in any way (RT, Mentions etc)
I have quoted my script below. Please let me know if this is possible in tweepy at all.
the snippet streamer.filter(follow = ['25073877'], encoding = 'utf8') shows which user i am wanting to follow
Thank you in advance.
from __future__ import print_function
import tweepy
import json
import MySQLdb
from dateutil import parser
import Twitter_API
import DBConfig
Access_Token = ""
Access_Token_Secret = ""
Consumer_Key = ""
Consumer_Secret = ""
class StreamListener(tweepy.StreamListener):
def on_connect(self):
print("You are now connected to the streaming API.")
def on_error(self, status_code):
print('An Error has occured: ' + repr(status_code))
return False
def on_data(self, data):
try:
# Decode the JSON from Twitter
datajson = json.loads(data)
#grab the wanted data from the Tweet
text = datajson['extended_tweet']['full_text']
screen_name = datajson['user']['screen_name']
tweet_id = datajson['id']
created_at = parser.parse(datajson['created_at'])
replying_to = datajson['in_reply_to_screen_name']
#print out a message to the screen that we have collected a tweet
#print("Tweet collected at " + str(created_at))
print(text, screen_name, tweet_id, created_at, replying_to)
#insert the data into the MySQL database
#store_data(created_at, text, screen_name, tweet_id)
except Exception as e:
print(e)
auth = tweepy.OAuthHandler(Consumer_Key, Consumer_Secret)
auth.set_access_token(Access_Token, Access_Token_Secret)
#Set up the listener. The 'wait_on_rate_limit=True' is needed to help with Twitter API rate limiting.
listener = StreamListener(api=tweepy.API(wait_on_rate_limit=True))
streamer = tweepy.Stream(auth=auth, listener=listener)
#print("Tracking: " + str(WORDS))
streamer.filter(follow = ['25073877'], encoding = 'utf8')
Sorry for the bad indentation if any.

Twitter streaming formatting JSON Output

Maybe you can help me. This following python code retrieves Twitter Streaming data and stops when 1000 tweet data are got. It works but returns the fields "created_at, screen_name, and text" separated by tab. Instead I'd like to get the data in JSON format. How can I set the code in order to get the data formatted in JSON?
# Import the necessary package to process data in JSON format
try:
import json
except ImportError:
import simplejson as json
# Import the necessary methods from "twitter" library
from twitter import Twitter, OAuth, TwitterHTTPError, TwitterStream
# Variables that contains the user credentials to access Twitter API
CONSUMER_KEY = '7pWHWtYlXM9ayJfUKv2F8v84B'
CONSUMER_SECRET = 'Dfcx10Px77Ggn0qGbCHc4TZC7M2IHsXpqk9CaGiCLzcr9VMX5n'
ACCESS_TOKEN = '245080367-zuLrIbxblOnocashgku9dsmDKgy3R7uU0VCTIRDx'
ACCESS_SECRET = 'wCx5ufD9Zft46hVjieLdv0af7p9DxUTsPgge9Zm2qelR9'
oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
# Initiate the connection to Twitter Streaming API
twitter_stream = TwitterStream(auth=oauth)
# Get a sample of the public data following through Twitter
#iterator = twitter_stream.statuses.sample()
iterator = twitter_stream.statuses.filter(track="Euro2016", language="fr")
tweet_count = 1000
for tweet in iterator:
tweet_count -= 1
print (tweet['created_at'],"\t",tweet['user']['screen_name'],"\t",tweet['geo'], "\t",tweet['text'])
if tweet_count <= 0:
break
You can import tweepy (you need to install it first with pip) and override the listener class to be able to output the data in json format. Here is an example:
from tweepy import Stream
from tweepy.streaming import StreamListener
#Listener Class Override
class listener(StreamListener):
def on_data(self, data):
try:
tweet = json.loads(data)
with open('your_data.json', 'a') as my_file:
json.dump(tweet, my_file)
except BaseException:
print('Error')
pass
def on_error(self, status):
print(statuses)
my_listener=listener()
twitterStream = Stream(oauth, my_listener) #Inizialize Stream object
You can read more about tweepy here: http://docs.tweepy.org/en/v3.4.0/streaming_how_to.html

Error handling In Tweepy

I am learning python and have started out a few weeks ago. I have tried to write a code to check for tweets with a particular hashtag in the streaming API and then reply to the tweet in case the a tweet has not been posted to the handle previously. While running the code, I have tried to avoid overstepping the rate limitations so as to not get any error. But there is an issue of duplicate status that Twitter raises once in a while. I would like the code to keep running and not stop on encountering an issue. Please help in this. The following is the code:
import tweepy
from tweepy import Stream
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
import json
import time
consumer_key =
consumer_secret =
access_token =
access_secret =
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
def check(status):
datafile = file('C:\Users\User\Desktop\Growth Handles.txt', 'r')
found = False
for line in datafile:
if status.user.screen_name in line:
found = True
break
return found
class MyListener(StreamListener):
def on_status(self, status):
f=status.user.screen_name
if check(status) :
pass
else:
Append=open('Growth Handles.txt' , 'a' )
Append.write(f + "\n")
Append.close()
Reply='#' + f + ' Check out Tomorrowland 2014 Setlist . http://.... '
api = tweepy.API(auth)
api.update_status(status=Reply)
time.sleep(45)
return True
def on_error(self, status):
print(status)
return True
twitter_stream = Stream(auth, MyListener())
twitter_stream.filter(track=['#musiclovers'])
In case, update_status method throws an error
try:
api.update_status(status=Reply)
except:
pass
In case twitter_stream gets disconnected.
twitter_stream = Stream(auth, MyListener())
while True:
twitter_stream.filter(track=['#musiclovers'])
Warning - Your app may got banned if it reaches certain limits, or their system caught you spamming. Check Twitter Rules

Twitter Stream not working for Tweepy

Using the code below, I'm trying to get a hash tag. It works fine for larger searches like #StarWars, but when i ask for smaller ones it doesn't seem to return anything.
Ideas?
'code' is used instead of the actual strings for authentication
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from textwrap import TextWrapper
import json
access_token = "code"
access_token_secret = "code"
consumer_key = "code"
consumer_secret = "code"
class StdOutListener(StreamListener):
''' Handles data received from the stream. '''
status_wrapper = TextWrapper(width=60, initial_indent=' ', subsequent_indent=' ')
def on_status(self, status):
try:
print self.status_wrapper.fill(status.text)
print '\n %s %s via %s\n' % (status.author.screen_name, status.created_at, status.source)
except:
# Catch any unicode errors while printing to console
# and just ignore them to avoid breaking application.
pass
def on_error(self, status_code):
print('Got an error with status code: ' + str(status_code))
return True # To continue listening
def on_timeout(self):
print('Timeout...')
return True # To continue listening
if __name__ == '__main__':
listener = StdOutListener()
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, listener)
stream.filter(track=['#TestingPythonTweet'])
Ok, so found that the answer to this is that i was expecting it to work retro-actively. This was a fundamental error on my part. Instead what actually happens is that it gets what's currently being tweeted. Not was has been previously.

Downloading all Tweets about certain subject in Python

Im doing Twitter sentiment research at the moment. For this reason, I'm using the Twitter API to download all tweets on certain keywords. But my current code is taking a lot of time to create a large datafile, so I was wondering if there's a faster method.
This is what Im using right now:
__author__ = 'gerbuiker'
import time
#Import the necessary methods from tweepy library
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
#Variables that contains the user credentials to access Twitter API
access_token = "XXXXXXXXXXXXX"
access_token_secret = "XXXXXXXX"
consumer_key = "XXXXX"
consumer_secret = "XXXXXXXXXXXXXX"
#This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):
def on_data(self, data):
try:
#print data
tweet = data.split(',"text":"')[1].split('","source')[0]
print tweet
saveThis = str(time.time())+ '::'+ tweet #saves time+actual tweet
saveFile = open('twitiamsterdam.txt','a')
saveFile.write(saveThis)
saveFile.write('\n')
saveFile.close()
return True
except BaseException, e:
print 'failed ondata,',str(e)
time.sleep(5)
def on_error(self, status):
print status
if __name__ == '__main__':
#This handles Twitter authetification and the connection to Twitter Streaming API
l = StdOutListener()
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, l)
#This line filter Twitter Streams to capture data by the keywords: 'Amsterdam'
stream.filter(track=['KEYWORD which i want to check'])
This gets me about 1500 tweets in one hour, for a pretty popular keyword (Amsterdam). Does anyone now a faster method in Python?
To be clear: I want to download all tweets on a certain subject for last month/year for example. So the newest tweets don't have to keep coming in, the most recent ones for a period would be sufficient. Thanks!
I need something similar to this for an academic research.
We're you able to fix it?
Would it be possible to specify a custom range of time from which to pull the data?
Sorry for asking here, but couldn't send you private messages.

Categories

Resources