I am trying to get all 50k tweets from #realDonaldTrump. I know there is limit for twitter api requests, so I am using max_id=oldest. But I only get 995 tweets.
import tweepy as tweepy
consumerKey = "xxx"
consumerSecret = "xxx"
accessToken = "xxx"
accessTokenSecret = "xxx"
auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
api = tweepy.API(auth, wait_on_rate_limit=True)
alltweets = []
username="#realDonaldTrump"
new_tweets = api.user_timeline(username, tweet_mode = 'extended', count=200)
alltweets.extend(new_tweets)
oldest = alltweets[-1].id - 1
while len(new_tweets) > 0:
print(f"getting tweets before {oldest}")
new_tweets = api.user_timeline(username, max_id=oldest,tweet_mode = 'extended', count=200)
alltweets.extend(new_tweets)
oldest = alltweets[-1].id - 1
print(f"...{len(alltweets)} tweets downloaded so far")
outtweets = [[tweet.id_str, tweet.created_at, tweet.full_text] for tweet in alltweets]
Free the free dev account you wont get more than the last 3200 tweets.
I suggest to use cursor and pages.
..
c = tw.Cursor(api.user_timeline, id=userid, tweet_mode="extended", wait_on_rate_limit=True,count=200).pages()
while True:
try:
page = c.next()
tweets.extend(page)
..
except tw.TweepError:
print(e)
time.sleep(60)
continue
except StopIteration:
break
Related
This is simple code I wrote to scrape data from twitter using tweepy.
import tweepy
import csv
import pandas as pd
from datetime import date
####input your credentials here
consumer_key = '(Hidden)'
consumer_secret = '(Hidden)'
access_token = '(Hidden)'
access_token_secret = '(Hidden)'
today = date.today()
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True)
csvFile = open('remotejob.csv', 'a')
#Use csv Writer
csvWriter = csv.writer(csvFile)
for tweet in tweepy.Cursor(api.search,q="#jobs #remote",count=5000,
lang="en",
since=today,tweet_mode = 'extended').items():
print (tweet.created_at, tweet.id)
csvWriter.writerow([tweet.created_at, tweet.full_text.encode('utf-8'), tweet.id, tweet.user.name.encode('utf-8'), tweet.user.screen_name.encode('utf-8'), tweet.user.statuses_count, tweet.retweet_count, tweet.favorite_count])
How can I schedule it to run every hour automatically?
In Python 3 and tweepy I have this script to do hashtags searches on Twitter:
import tweepy
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
#test = api.get_user('some user')._json
#test
#The test worked
search_result = api.search('#maconhamedicinal' or '#cannabismedicinal')
search_result
[]
The result is an empty list. Please, does anyone know what the problem is?
keywords = ['#maconhamedicinal','#cannabismedicinal']
results = []
for key in keywords:
search_results = api.search(q=key, count=100)
results = results + search_results
for result in results:
# do whatever u wanna do
I am trying to get all tweets from a given account but I can get only last 20 tweets. How can I get all the tweets that user posted ?
Here is my code:
from bs4 import BeautifulSoup as bs
import urllib
#This function returns tweets from
#given username's account as a list
def get_tweets(username):
tweets = []
URL = "https://twitter.com/"+username
soup = bs(urllib.request.urlopen(URL), 'lxml')
for li in soup.find_all("li", {"data-item-type": "tweet"}):
text_p = li.find("p", class_="tweet-text")
if text_p is not None:
tweets.append(text_p.get_text())
return tweets
In Tweepy, you can get the user's timeline like this:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
for status in tweepy.Cursor(api.user_timeline, username).items():
print('status_id: {}, text: {}'.format(status.id, status.text.encode('utf-8')))
Notice that it's using Tweepy.Cursor, which iterates through the list until there aren't any more items.
I did.(Pip install json, Pip install simplejson)
However, errors occur.
simplejson.scanner.JSONDecodeError: Unterminated string starting at:
line 1 column 65922 (char 65921)
tweepy.error.TweepError: Failed to parse JSON payload: Unterminated string starting at: line 1 column 65922 (char 65921)
What should I do?
import tweepy
import time
import os
import json
import simplejson
search_term = 'word1'
search_term2= 'word2'
search_term3='word3'
lat = "xxxx"
lon = "xxxx"
radius = "xxxx"
location = "%s,%s,%s" % (lat, lon, radius)
API_key = "xxxx"
API_secret = "xxxx"
Access_token = "xxxx"
Access_token_secret = "xxxx"
auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth)
c=tweepy.Cursor(api.search,
q="{}+OR+{}".format(search_term, search_term2, search_term3),
rpp=1000,
geocode=location,
include_entities=True)
data = {}
i = 1
for tweet in c.items():
data['text'] = tweet.text
print(i, ":", data)
i += 1
time.sleep(1)
wfile = open(os.getcwd()+"/workk2.txt", mode='w')
data = {}
i = 0
for tweet in c.items():
data['text'] = tweet.text
wfile.write(data['text']+'\n')
i += 1
wfile.close()
I'm trying to exclude retweets and replies in my Twython search.
Here is my code:
from twython import Twython, TwythonError
app_key = "xxxx"
app_secret = "xxxx"
oauth_token = "xxxx"
oauth_token_secret = "xxxx"
naughty_words = [" -RT"]
good_words = ["search phrase", "another search phrase"]
filter = " OR ".join(good_words)
blacklist = " -".join(naughty_words)
keywords = filter + blacklist
twitter = Twython(app_key, app_secret, oauth_token, oauth_token_secret)
search_results = twitter.search(q=keywords, count=100)
The problem is that the -RT function isn't really working.
EDIT:
I've tried #forge suggestion, and while it does print the if tweets are not retweets or replies, when I incorporate them into the code below, the bot still finds tweets, retweets, quotes and replies.
twitter = Twython(app_key, app_secret, oauth_token, oauth_token_secret) query = 'beer OR wine AND -filter:retweets AND -filter:replies'
response = twitter.search(q=query, count=100)
statuses = response['statuses']
try:
for tweet in statuses:
try:
twitter.retweet(id = tweet["id_str"])
except TwythonError as e:
print e
except TwythonError as e:
print e
Any ideas? Is there a filter:quotes?
The correct syntax is -filter:retweets.
If you would like to search on terms "search phrase" or "another search phrase" and exclude retweets, then the query should be:
query = "search_phrase OR another_search_phrase -filter:retweets"
To exclude replies as well, add -filter:replies like this:
query = "search_phrase OR another_search_phrase -filter:retweets AND -filter:replies"
This should be working, you can verify it by checking the status fields in_reply_to_status_id and retweeted_status:
Status is not a reply if in_reply_to_status_id is empty
Status is not a retweet if it doesn't have the field retweeted_status
With Twython:
import twython
twitter = twython.Twython(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
query = 'wine OR beer -filter:retweets AND -filter:replies'
response = twitter.search(q=query, count=100)
statuses = response['statuses']
for status in statuses:
print status['in_reply_to_status_id'], status.has_key('retweeted_status')
# Output should be (None, False) to any status