Twitter API - not collecting all tweets using Tweepy - python

I'm using Tweepy to collect tweets from the Twitter API by their tweet ID.
I'm trying to read in a file full of the IDs, get the previous tweet from the conversation stream, then store that tweet, its author's screen name, etc. in a text file. Some of the tweets have been deleted or the user's profile has been set to private, in which case I want to ignore that tweet and move on to the next. However, for some reason, I'm not collecting all accessible tweets. It's storing maybe 3/4 of all tweets that aren't private and haven't been deleted. Any ideas why it's not catching everything?
Thanks in advance.
def getTweet(tweetID, tweetObj, callTweetObj, i):
    tweet = callTweetObj.text.encode("utf8")
    callUserName = callTweetObj.user.screen_name
    callTweetID = tweetObj.in_reply_to_status_id_str
    with open("call_tweets.txt", "a") as calltweets:
        output = (callTweetObj.text.encode('utf-8') + "\t" + callTweetID + "\t" + tweetID)
        calltweets.write(output)
        print output
    with open("callauthors.txt", "a") as callauthors:
        cauthors = (callUserName + "\t" + "\t" + callTweetID + "\n")
        callauthors.write(cauthors)
    with open("callIDs.txt", "a") as callIDs:
        callIDs.write(callTweetID + "\n")
    with open("newResponseIDs.txt", "a") as responseIDs:
        responseIDs.write(tweetID)

count = 0
file = "Response_IDs.txt"
with open(file, 'r+') as f:
    lines = f.readlines()
    for i in range(0, len(lines)):
        tweetID = lines[i]
        sleep(5)
        try:
            tweetObj = api.get_status(tweetID)
            callTweetID = tweetObj.in_reply_to_status_id_str
            callTweetObj = api.get_status(callTweetID)
            getTweet(tweetID, tweetObj, callTweetObj, i)
            count = count + 1
            print count
        except:
            pass

You haven't shown what comes back from api.get_status when a tweet is skipped, and the bare except: pass in your loop silently swallows every failure, so it's hard to tell exactly what the error is.
However, you may well be hitting the rate limit for the statuses/show/:id request. The API limits this request to 180 calls per 15-minute window.
You can use Tweepy to call application/rate_limit_status:
response = api.rate_limit_status()
remaining = response['resources']['statuses']['/statuses/show/:id']['remaining']
assert remaining > 0
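If the quota is exhausted, the same response also tells you when the window resets. As a minimal sketch (my own addition, not part of the original answer), you could sleep until the reset time before continuing:

import time

limits = api.rate_limit_status()
show_status = limits['resources']['statuses']['/statuses/show/:id']
if show_status['remaining'] == 0:
    # 'reset' is the window reset time as a Unix timestamp
    wait = show_status['reset'] - time.time()
    time.sleep(max(wait, 0) + 1)

Alternatively, Tweepy can handle this for you: constructing the client with tweepy.API(auth, wait_on_rate_limit=True) makes calls sleep automatically until the limit window expires.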

Related

python loop for calling API

I need to get Geo data for a bunch of IPs (eventually I will need data for 3k+ IPs). I was able to successfully get Geo data for individual IPs. Now I'm trying to create a loop which iterates through IPs stored as separate lines in a text file and then calls the ipstack API to get Geo data for each one. But the code returns data only for the last IP in the file, with a 'missing_access_key' error for the other ones. I'm a Python beginner, so any help would be appreciated.
fh = open('IPs.txt')
for line in fh:
    ip = line
    api = 'http://api.ipstack.com/' + ip + '?access_key=' + access_key
    result = urllib.request.urlopen(api).read()
    result = result.decode()
    result = json.loads(result)
    print(result)
fh = open('IPs.txt', 'r')
Lines = fh.readlines()
for line in Lines:
    ip = line
    api = 'http://api.ipstack.com/' + ip + '?access_key=' + access_key
    result = urllib.request.urlopen(api).read()
    result = result.decode()
    result = json.loads(result)
    print(result)
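No answer is recorded here, but the symptom points at a likely culprit: iterating over a file keeps the trailing newline on each line, so the newline lands inside the URL right before ?access_key=..., which would explain the 'missing_access_key' error. The last line of a file often has no trailing newline, which is why only the last IP works. A minimal sketch of the loop with the line stripped (assuming the same access_key variable):

import json
import urllib.request

with open('IPs.txt') as fh:
    for line in fh:
        ip = line.strip()  # drop the trailing newline before building the URL
        if not ip:
            continue       # skip blank lines
        url = 'http://api.ipstack.com/' + ip + '?access_key=' + access_key
        result = json.loads(urllib.request.urlopen(url).read().decode())
        print(result)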

How to check for image use in tweets in Tweepy

I have written code to extract tweets from a list of users [handles]. I am writing the information to a .txt file called "results".
with open("results", "w") as fp:
for handle in handles:
print("Analyzing tweets from " + handle + "...")
user = api.get_user(id=handle)
fp.write("Handle: " + handle + "\n")
fp.write("Name: " + user.name + "\n")
fp.write("Description: " + str(user.description.encode(sys.stdout.encoding, errors='replace')) + "\n")
fp.write("Followers: " + str(user.followers_count) + "\n")
fp.write("Following: " + str(user.friends_count) + "\n")
tweet_counter = 0
prosocial_tweets_count = 0
regular_tweets_count = 0
all_tweets = []
social_tweets_len = []
regular_tweets_len = []
social_tweets_valence = []
regular_tweets_valence = []
regular_attachments = 0
social_attachments = 0
for tweet in tweepy.Cursor(api.user_timeline, id=user.id).items():
#control for timeline
dt = tweet.created_at
if dt > date_until:
continue
if dt < date_from:
break # XXX: I hope it's OK to break here
if include_retweets == "no" and tweet.text.startswith("RT"):
continue
if include_replies == "no" and tweet.in_reply_to_user_id:
continue
tweet_counter += 1
for word in vocabulary:
if word in tweet.text.lower():
#increase count of pro social tweets
prosocial_tweets_count += 1
#clean the tweet for valence analysis
clean = TextBlob(tweet.text.lower())
#calculate valence
valence = clean.sentiment.polarity
#append the valence to a list
social_tweets_valence.append(valence)
#append the length of the tweet to a list
social_tweets_len.append(len(tweet.text))
#check if there is an attachment
counting = tweet.text.lower()
counting_attachments = counting.count(" https://t.co/")
social_attachments = social_attachments + counting_attachments
#write date
fp.write(" * " + str(dt) + "\n")
#write the tweet
fp.write(" " + str(tweet.text.encode(sys.stdout.encoding, errors='replace')) + "\n")
#write the length of the tweet
fp.write(" Length of tweet " + str(len(tweet.text)) + "\n")
#write the valence of the tweet
fp.write(" Tweet valance " + str(valence) + "\n")
#write the retweets of the tweet
fp.write(" Retweets count: " + str(tweet.retweet_count) + "\n")
#write the likes of the tweet
fp.write(" Likes count: " + str(tweet.favorite_count) + "\n")
# Report each tweet only once whenever it contains more than one prosocial words
break
else:
#this code runs if the tweet is not prosocial
regular_tweets_count += 1
clean = TextBlob(tweet.text.lower())
valence = clean.sentiment.polarity
counting = tweet.text.lower()
counting_attachments = counting.count(" https://t.co/")
regular_attachments = regular_attachments + counting_attachments
regular_tweets_valence.append(valence)
regular_tweets_len.append(len(tweet.text))
attachments = regular_attachments + social_attachments
I was wondering whether anyone knows of a nice way to check if the tweets contain images or videos. I would also like to create a list of the average use of images and videos per user.
If you look at this thread, you will see that all media in a tweet are stored in tweet.entities['media'].
Therefore, if you want to know whether a given tweet (in the tweepy.models.Status format used by tweepy) contains a picture, you could try this:
try:
    print(any(medium['type'] == 'photo' for medium in tweet.entities['media']))
except KeyError:
    # the 'media' key is absent when the tweet has no attachments
    print("No picture in this tweet")
I hope it helps.
Data fetched from the Twitter API is in JSON format; it contains all the data about that ID as fields and values. So if you just want to check whether an image exists or not, you can make a conditional statement along the lines of:
if image:
    print('yes')
else:
    print('no')
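Neither answer covers the per-user averages asked about above, so here is a rough sketch of one way to tally attachments per tweet. This is my own suggestion, and it assumes the v1.1 entity format, where attachments appear under tweet.entities['media'] and videos are fully listed only under extended_entities:

media_counts = []
for tweet in tweepy.Cursor(api.user_timeline, id=user.id).items():
    # prefer extended_entities when present, since it lists videos and all photos
    entities = getattr(tweet, 'extended_entities', tweet.entities)
    media_counts.append(len(entities.get('media', [])))
# average number of attached media per tweet for this user
average_media = sum(media_counts) / len(media_counts) if media_counts else 0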

Tweepy only returning 1 tweet

I've got a Python script intended to scrape tweets from Twitter and append them to a CSV file. I'm using the tweepy module, however it is only returning 1 tweet. Is this a problem with my for loop, or with the call to the Twitter API?
for status in tweepy.Cursor(twitterapi.search, q="labour party", since="2018-05-01", until="2018-05-10").items(200):
    if 'RT' not in status.text:
        with open('C:/Users/User/Desktop/twittersentiment.csv', 'wb') as f:
            w = csv.writer(f)
            favourites = status.user.favourites_count
            location = status.user.location.encode('utf8')
            tweet_text = ' '.join(re.sub("(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", status.text.encode('utf8')).split())
            date = status.created_at.strftime('%m/%d/%Y')
            a = [location]
            b = [favourites]
            c = [tweet_text]
            d = [date]
            zip(a, b, c, d)
            w.writerow(zip(a, b, c, d))
You should open the file before you start iterating the tweepy.Cursor; otherwise each iteration of the cursor creates a new file with one entry, overwriting the previous file.
with open('C:/Users/User/Desktop/twittersentiment.csv', 'wb') as f:
    w = csv.writer(f)
    for status in tweepy.Cursor(twitterapi.search, q="labour party", since="2018-05-01", until="2018-05-10").items(200):
        if 'RT' not in status.text:
            favourites = status.user.favourites_count
            location = status.user.location.encode('utf8')
            tweet_text = ' '.join(re.sub("(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", status.text.encode('utf8')).split())
            date = status.created_at.strftime('%m/%d/%Y')
            # write one row per tweet; passing the bare list avoids the
            # zip(...) wrapper, which would put a single tuple in one cell
            w.writerow([location, favourites, tweet_text, date])
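One assumption worth flagging: the 'wb' mode suggests this code targets Python 2. Under Python 3, csv.writer expects a text-mode file opened with newline='' (to avoid blank rows on Windows), roughly:

with open('C:/Users/User/Desktop/twittersentiment.csv', 'w', newline='', encoding='utf-8') as f:
    w = csv.writer(f)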

Save Tweets to python dictionary [closed]

I want to analyse Twitter data. I have downloaded some tweets and saved them in a .txt file.
When I tried to extract useful information from the tweet data, I was not able to make any progress, because for a beginner like me it seems very difficult to extract tweets, location, etc.
While googling I found that if we convert the JSON into a dictionary, it is easy to extract the info.
Now I want to convert my JSON data to Python dictionaries. I don't know how to proceed.
Here is the code used to save the tweets:
import sys
import tweepy
import json
import jsonpickle

consumer_key = "*********"
consumer_secret = "*******"
access_token = "************"
access_token_secret = "**********"

# OAuthHandler does user authentication and provides the
# set_access_token method used below (AppAuthHandler does not)
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# Make the Tweepy API call auto-wait (sleep) when it hits the rate limit
# and continue upon expiry of the window.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
if (not api):
    print("Can't Authenticate")
    sys.exit(-1)

searchQuery = 'SomeHashtag'
maxTweets = 10000000  # Some arbitrary large number
tweetsPerQry = 100
fName = 'file.txt'
sinceId = None
max_id = "Latest tweet ID"  # placeholder: fill in a real tweet ID
tweetCount = 0
print("Downloading max {0} tweets".format(maxTweets))
with open(fName, 'a') as f:
    while tweetCount < maxTweets:
        try:
            if (max_id <= 0):
                if (not sinceId):
                    new_tweets = api.search(q=searchQuery, lang="en", count=tweetsPerQry)
                else:
                    new_tweets = api.search(q=searchQuery, lang="en", count=tweetsPerQry,
                                            since_id=sinceId)
            else:
                if (not sinceId):
                    new_tweets = api.search(q=searchQuery, lang="en", count=tweetsPerQry,
                                            max_id=str(max_id - 1))
                else:
                    new_tweets = api.search(q=searchQuery, lang="en", count=tweetsPerQry,
                                            max_id=str(max_id - 1),
                                            since_id=sinceId)
            if not new_tweets:
                print("No more tweets found")
                break
            for tweet in new_tweets:
                f.write(jsonpickle.encode(tweet._json, unpicklable=False) + '\n')
            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            max_id = new_tweets[-1].id
        except tweepy.TweepError as e:
            # Just exit on any error
            print("some error : " + str(e))
            break
print("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))
It seems you can just read your file line by line and unpickle each line using the jsonpickle.decode method:
tweets = []
with open(filename) as f:
    for line in f:
        tweets.append(jsonpickle.decode(line))
And I think you can bypass the third-party library altogether:
import json

with open(filename, 'w') as f:
    for tweet in new_tweets:
        # tweet is a tweepy Status object; its raw JSON dict is in tweet._json
        f.write(json.dumps(tweet._json) + '\n')

tweets = []
with open(filename) as f:
    for line in f:
        tweets.append(json.loads(line))
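Once loaded, each element of tweets is a plain dict, so the fields the question mentions can be read directly. A small usage sketch (the key names assume the standard v1.1 tweet JSON layout):

for tweet in tweets:
    print(tweet['id_str'])
    print(tweet['text'])
    print(tweet['user']['location'])  # may be an empty string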

Python 3: How to create a text progress bar for downloading files?

I currently have this:
def download_dropbox(url, pre_file_name):
    file = url[42:]
    file = file[:-5]
    file_name = pre_file_name + file
    print('Downloading from ' + url + ' to ' + file_name)
    print(file)
    u = urllib.request.urlopen(url)
    data = u.read()
    u.close()
    with open(file_name, "wb") as f:
        f.write(data)
    print('Download Completed from ' + url + ' and saved to ' + file_name)
This basically downloads files from Dropbox and saves them to a directory. However, I want to be able to have some sort of text progress bar like:
[==== ]50%
OR
50%
The hard part, I would think, is doing it without any external modules like a loading-bar module. Also, as the title states, I need it in Python 3. Thank you.
Edit:
Thanks to Martin Evans for the data-read while loop and progress bar; here is the end result of the code:
# Get the total number of bytes of the file before downloading
print("opening url:", url)
u = urllib.request.urlopen(url)
meta = u.info()
# Read the Content-Length header directly; indexing into
# str(meta).split() at a fixed position is fragile
fileTotalbytes = int(meta['Content-Length'])
print("Content-Length: {} bytes".format(fileTotalbytes))

data_blocks = []
total = 0
while True:
    block = u.read(1024)
    data_blocks.append(block)
    total += len(block)
    hash = ((60 * total) // fileTotalbytes)
    print("[{}{}] {}%".format('#' * hash, ' ' * (60 - hash), int(total / fileTotalbytes * 100)), end="\r")
    if not len(block):
        break

data = b''.join(data_blocks)  # join with b'' because the blocks are bytes, not strings
u.close()
with open('test.zip', "wb") as f:
    f.write(data)
To answer your main question, how to make a text progress bar, you could use something like the following to give you an idea:
import time

for n in range(1, 101):
    hash = ((60 * n) // 100)
    print("[{}{}] {}%".format('#' * hash, ' ' * (60 - hash), n), end="\r")
    time.sleep(0.05)
This would give you the following:
[########################### ] 45%
Your main problem though is that there is no obvious way to determine how many bytes will eventually be downloaded unless you already know the exact size of the item being downloaded beforehand. If you control the server end then you could arrange for the length to be obtained before starting.
You can, though, start by at least converting your read() line to something like the following:
u = urllib.request.urlopen(url)
data_blocks = []
total = 0
while True:
    block = u.read(1024)  # read from u, the response object opened above
    data_blocks.append(block)
    total += len(block)
    print("Downloaded {} bytes".format(total), end="\r")
    if not len(block):
        break
data = b"".join(data_blocks)  # the blocks are bytes, so join with b""
u.close()
By doing it this way, you read it a bit at a time and can then provide feedback.
You can use print with \r at the start to go to the start of the line and write over the previous text (so you need to write spaces if you want to clear a character). Here's a simple example:
from time import sleep

x = 0
while x < 20:
    print('\r' + '.' * x, end="")
    x += 1
    sleep(0.1)
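Putting the pieces together, here is a consolidated sketch (my own combination of the ideas above, assuming the server sends a Content-Length header) that streams the download to disk while drawing the bar:

import urllib.request

def download_with_progress(url, file_name, block_size=1024):
    with urllib.request.urlopen(url) as u, open(file_name, 'wb') as f:
        total_bytes = int(u.info()['Content-Length'])
        done = 0
        while True:
            block = u.read(block_size)
            if not block:
                break
            f.write(block)  # write as we go instead of buffering everything in memory
            done += len(block)
            filled = (60 * done) // total_bytes
            print("[{}{}] {}%".format('#' * filled, ' ' * (60 - filled),
                                      (100 * done) // total_bytes), end="\r")
    print()  # move past the progress-bar line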
