I am trying to pull tweets using since_id and max_id. The problem with since_id is that it prints all the tweets correctly in my Eclipse console, but when I store them line by line in a CSV file, not all of the tweets are written; I ran it 4-5 times and got a different number of tweets each time. The problem with max_id is that it does not run at all. My code is below, followed by the traceback (for since_id I simply replace max_id with since_id).
#!/usr/bin/python
import tweepy
import csv
from datetime import *
import time
access_token = ''
access_secret = ''
consumer_key = ''
consumer_secret = ''
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth)
#data = api.get_user('abhi1868sharma')#'mishra1_P_K'
csvFile = open('a.csv','a')
csvWriter = csv.writer(csvFile, delimiter=',')#, tweet.favourited
i = 1
tweets = tweepy.Cursor(api.user_timeline, id = '', max_id = 510064587115225000).items()
while True:
    try:
        for tweet in tweets:
            csvWriter.writerow([i, tweet.retweet_count, tweet.favorite_count, str(tweet.id), tweet.created_at.hour, tweet.created_at.minute, tweet.created_at.weekday(), tweet.created_at.day, tweet.created_at.month, tweet.created_at.year, tweet.created_at, tweet.text.encode('utf8'), tweet.user.id, tweet.geo, tweet.in_reply_to_user_id, tweet.in_reply_to_status_id_str, tweet.place, tweet.retweeted, tweet.truncated, tweet.source])
            print i
            i += 1
    except tweepy.TweepError:
        time.sleep(60 * 15)
        continue
    except StopIteration:
        break
csvFile.close()
This is my traceback for max_id (since_id does not throw any error):
for tweet in tweets:
File "C:\Python27\lib\site-packages\tweepy\cursor.py", line 181, in next
self.current_page = self.page_iterator.next()
File "C:\Python27\lib\site-packages\tweepy\cursor.py", line 99, in next
data = self.method(max_id=self.max_id, parser=RawParser(), *self.args, **self.kargs)
TypeError: _call() got multiple values for keyword argument 'max_id'
One more thing: when I store the data in the CSV file, my tweet IDs get rounded off. Is there any way around this? The tweet IDs I am using for since_id and max_id are also rounded-off IDs.
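Two hedged notes, not from any answer in the thread: first, the "multiple values for keyword argument 'max_id'" error can be sidestepped by calling api.user_timeline directly with max_id instead of going through tweepy.Cursor; second, the IDs themselves are not rounded by the CSV writer — large integers only look rounded because Excel displays them in scientific notation, and writing tweet.id_str keeps the full value. A minimal sketch, assuming an already-authenticated api object and a placeholder screen name and max_id:

import csv
import tweepy

# api = tweepy.API(auth)   # assumed to be authenticated as in the question

EXACT_MAX_ID = 1234567890123456789   # placeholder; substitute an exact (not rounded) tweet ID

with open('a.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    # calling user_timeline directly avoids the Cursor max_id clash
    tweets = api.user_timeline(screen_name='some_user', count=200, max_id=EXACT_MAX_ID)
    for tweet in tweets:
        # id_str preserves the full 64-bit ID; check the raw CSV in a text editor, not Excel
        writer.writerow([tweet.id_str, tweet.created_at, tweet.text.encode('utf8')])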
Related
I am trying to get all tweets within a certain radius around given coordinates. The script runs without errors, but zero entries are returned. The strange thing is that exactly the same code worked for me a few days ago, and now it does not; I am stuck :(
import tweepy
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
import pandas as pd
#Twitter credentials for the app
consumer_key = 'xxx'
consumer_secret = 'xxx'
access_key= 'xxx'
access_secret = 'xxx'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)
#Create list for column names
COLS = ['id','created_at','lang','original text','user_name', 'place', 'place type', 'bbx', 'coordinates']
geo='48.136353, 11.575004, 25km'
def write_tweets(keyword):
    #create dataframe from defined column list
    df = pd.DataFrame(columns=COLS)
    #iterate through pages with given condition
    #using tweepy.Cursor object with pages() method
    for page in tweepy.Cursor(api.search, q=keyword,
                              include_rts=False,
                              geocode=geo).pages():
        for tweet in page:
            #creating string array
            new_entry = []
            #storing all JSON data from twitter API
            tweet = tweet._json
            #Append the JSON parsed data to the string list:
            new_entry += [tweet['id'], tweet['created_at'], tweet['lang'], tweet['text'],
                          tweet['user']['name']]
            #check if place name is available, in case not the entry is named 'no place'
            try:
                place = tweet['place']['name']
            except TypeError:
                place = 'no place'
            new_entry.append(place)
            try:
                place_type = tweet['place']['place_type']
            except TypeError:
                place_type = 'na'
            new_entry.append(place_type)
            try:
                bbx = tweet['place']['bounding_box']['coordinates']
            except TypeError:
                bbx = 'na'
            new_entry.append(bbx)
            #check if coordinates is available, in case not the entry is named 'no coordinates'
            try:
                coord = tweet['coordinates']['coordinates']
            except TypeError:
                coord = 'no coordinates'
            new_entry.append(coord)
            # wrap up all the data into a data frame
            single_tweet_df = pd.DataFrame([new_entry], columns=COLS)
            df = df.append(single_tweet_df, ignore_index=True)
    #get rid of tweets without a place
    df_cleaned = df[df.place != 'no place']
    print("tweets with place:")
    print(len(df[df.place != 'no place']))
    print("tweets with coordinates:")
    print(len(df[df.coordinates != 'no coordinates']))
    df_cleaned.to_csv('tweets_'+geo+'.csv', columns=COLS, index=False)

#declare keywords as a query
keyword='*'
#call main method passing keywords and file path
write_tweets(keyword)
The geocode should actually work like this.
Does anyone have an idea?
When you declare the variable geo, don't leave any spaces between the commas and the numbers.
It should look like this:
geo='48.136353,11.575004,25km'
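As a quick follow-up sketch (my own addition, not part of the original answer): once the spaces are removed, the same string can be passed straight to the geocode parameter of api.search, and everything else in the question's write_tweets function stays unchanged. The 'pizza' keyword here is only a placeholder:

# the geocode must be "lat,long,radius" with no spaces
geo = '48.136353,11.575004,25km'

for page in tweepy.Cursor(api.search, q='pizza',      # placeholder query
                          geocode=geo).pages():
    for tweet in page:
        print(tweet.text)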
Here is my code. I want to extract tweets from Twitter containing certain keywords. My code does not give any errors, but the output file is not being generated. Please help me.
import re
import csv
import tweepy
from tweepy import OAuthHandler
#TextBlob perform simple natural language processing tasks.
from textblob import TextBlob
def search():
    #text = e.get() **************************
    consumer_key = ''
    consumer_secret = ''
    access_token = ' '
    access_token_secret = ' '
    # create OAuthHandler object
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    # set access token and secret
    auth.set_access_token(access_token, access_token_secret)
    # create tweepy API object to fetch tweets
    api = tweepy.API(auth)

    def get_tweets(query, count = 300):
        # empty list to store parsed tweets
        tweets = []
        target = open("tweets.txt", 'w', encoding="utf-8")
        t1 = open("review.txt", 'w', encoding="utf-8")
        # call twitter api to fetch tweets
        q = str(query)
        a = str(q + " sarcasm")
        b = str(q + " sarcastic")
        c = str(q + " irony")
        fetched_tweets = api.search(a, count = count) + api.search(b, count = count) + api.search(c, count = count)
        # parsing tweets one by one
        print(len(fetched_tweets))
        for tweet in fetched_tweets:
            # empty dictionary to store required params of a tweet
            parsed_tweet = {}
            # saving text of tweet
            parsed_tweet['text'] = tweet.text
            if "http" not in tweet.text:
                line = re.sub("[^A-Za-z]", " ", tweet.text)
                target.write(line + "\n")
                t1.write(line + "\n")
        return tweets

    # creating object of TwitterClient Class
    # calling function to get tweets
    tweets = get_tweets(query = text, count = 20000)

root.mainloop()
With this code I am not getting the output file generated. Can anyone tell me what I am doing wrong?
Thanks in advance!
I just made some slight changes and it worked perfectly for me. I removed or commented out some unnecessary statements (like the review file) and changed the open function to io.open since I have Python 2.7. Here is the running code; hope it helps!
import re
import io
import csv
import tweepy
from tweepy import OAuthHandler
#TextBlob perform simple natural language processing tasks.
#from textblob import TextBlob
consumer_key = 'sz6x0nvL0ls9wacR64MZu23z4'
consumer_secret = 'ofeGnzduikcHX6iaQMqBCIJ666m6nXAQACIAXMJaFhmC6rjRmT'
access_token = '854004678127910913-PUPfQYxIjpBWjXOgE25kys8kmDJdY0G'
access_token_secret = 'BC2TxbhKXkdkZ91DXofF7GX8p2JNfbpHqhshW1bwQkgxN'
# create OAuthHandler object
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# set access token and secret
auth.set_access_token(access_token, access_token_secret)
# create tweepy API object to fetch tweets
api = tweepy.API(auth)
def get_tweets(query, count = 300):
    # empty list to store parsed tweets
    tweets = []
    target = io.open("mytweets.txt", 'w', encoding='utf-8')
    # call twitter api to fetch tweets
    q = str(query)
    a = str(q + " sarcasm")
    b = str(q + " sarcastic")
    c = str(q + " irony")
    fetched_tweets = api.search(a, count = count) + api.search(b, count = count) + api.search(c, count = count)
    # parsing tweets one by one
    print(len(fetched_tweets))
    for tweet in fetched_tweets:
        # empty dictionary to store required params of a tweet
        parsed_tweet = {}
        # saving text of tweet
        parsed_tweet['text'] = tweet.text
        if "http" not in tweet.text:
            line = re.sub("[^A-Za-z]", " ", tweet.text)
            target.write(line + "\n")
    return tweets

# creating object of TwitterClient Class
# calling function to get tweets
tweets = get_tweets(query = "", count = 20000)
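A hedged aside (my own addition, not part of the answer above): the function still returns the empty tweets list and never closes the file handle, so if the written lines are also needed back in Python it may help to collect and return them explicitly. A minimal sketch, assuming the same api object; the "pizza" keyword is only a placeholder:

def get_cleaned_tweets(query, count=300):
    # search the three sarcasm-related variants of the query, as in the answer
    queries = [query + " sarcasm", query + " sarcastic", query + " irony"]
    cleaned = []
    with io.open("mytweets.txt", 'w', encoding='utf-8') as target:
        for q in queries:
            for tweet in api.search(q, count=count):
                if "http" not in tweet.text:
                    line = re.sub("[^A-Za-z]", " ", tweet.text)
                    cleaned.append(line)
                    target.write(line + "\n")
    return cleaned

lines = get_cleaned_tweets("pizza", count=100)
print(len(lines))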
I'm trying to extract tweets using the following code, and I just realized I'm only getting the first 140 characters. I'm a bit new at this and now I need to put tweet_mode=extended and full_text somewhere, so if someone could point out exactly where I'd be very appreciative. Thank you!
#!/usr/bin/env python
# encoding: utf-8

import tweepy #https://github.com/tweepy/tweepy
import csv

#Twitter API credentials
consumer_key = "5f55VEYRnHuBvVESy11OrBayI"
consumer_secret = "r0PcvNast4FLYD1HNQiJIsIDGtk72hhVFPzR3BfrIWfuSn2SWD"
access_key = "949748064985722880-Wpc3hErpGEeDC75MBfcDoo07X9WVcAo"
access_secret = "w02RdHMg1izgaFlKUJH3C5s9cDNue2h8XJv87E3TE0Whm"

def get_all_tweets(screen_name):
    #Twitter only allows access to a users most recent 3240 tweets with this method

    #authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    #initialize a list to hold all the tweepy Tweets
    alltweets = []

    #make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    #save most recent tweets
    alltweets.extend(new_tweets)

    #save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    #keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print "getting tweets before %s" % (oldest)

        #all subsiquent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)

        #save most recent tweets
        alltweets.extend(new_tweets)

        #update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        print "...%s tweets downloaded so far" % (len(alltweets))

    #transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8"), tweet.retweet_count, tweet.favorite_count] for tweet in alltweets]

    #write the csv
    with open('%s_tweets.csv' % screen_name, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "full_text", "retweet_count", "favorite_count"])
        writer.writerows(outtweets)

    pass

if __name__ == '__main__':
    #pass in the username of the account you want to download
    get_all_tweets("realdonaldtrump")
Put "tweet_mode=extended" here:
new_tweets = api.user_timeline(screen_name = screen_name,
count=200,
tweet_mode=extended)
And here:
while len(new_tweets) > 0:
new_tweets = api.user_timeline(screen_name = screen_name,
count=200,
max_id=oldest,
tweet_mode=extended)
Put "full_tweet" here:
outtweets = [[tweet.id_str,
tweet.created_at,
tweet.full_tweet.encode("utf-8"),
tweet.retweet_count,
tweet.favorite_count] for tweet in alltweets]
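One caveat worth adding (my own note, not part of the original answer): with tweet_mode="extended", native retweets still come back truncated in full_text; the untruncated text lives on the nested retweeted_status object, so a small helper like the sketch below is a common workaround.

def tweet_full_text(tweet):
    # for native retweets, the complete text is on retweeted_status
    if hasattr(tweet, 'retweeted_status'):
        return tweet.retweeted_status.full_text
    return tweet.full_text

outtweets = [[tweet.id_str,
              tweet.created_at,
              tweet_full_text(tweet).encode("utf-8"),
              tweet.retweet_count,
              tweet.favorite_count] for tweet in alltweets]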
I am trying to stream live tweets with a given hashtag using the tweepy library. I am using the following code, taken from https://galeascience.wordpress.com/2016/03/18/collecting-twitter-data-with-python/
I am new to Python coding and APIs.
import tweepy
from tweepy import OAuthHandler
import json
import datetime as dt
import time
import os
import sys
def load_api():
    ''' Function that loads the twitter API after authorizing the user. '''
    consumer_key = 'xxx'
    consumer_secret = 'xxx'
    access_token = 'yyy'
    access_secret = 'yyy'
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    # load the twitter API via tweepy
    return tweepy.API(auth)
def tweet_search(api, query, max_tweets, max_id, since_id, geocode):
    ''' Function that takes in a search string 'query', the maximum
        number of tweets 'max_tweets', and the minimum (i.e., starting)
        tweet id. It returns a list of tweepy.models.Status objects. '''
    searched_tweets = []
    while len(searched_tweets) < max_tweets:
        remaining_tweets = max_tweets - len(searched_tweets)
        try:
            new_tweets = api.search(q=query, count=remaining_tweets,
                                    since_id=str(since_id),
                                    max_id=str(max_id-1))
                                    # geocode=geocode)
            print('found', len(new_tweets), 'tweets')
            if not new_tweets:
                print('no tweets found')
                break
            searched_tweets.extend(new_tweets)
            max_id = new_tweets[-1].id
        except tweepy.TweepError:
            print('exception raised, waiting 15 minutes')
            print('(until:', dt.datetime.now()+dt.timedelta(minutes=15), ')')
            time.sleep(15*60)
            break  # stop the loop
    return searched_tweets, max_id
def get_tweet_id(api, date='', days_ago=7, query='a'):
    ''' Function that gets the ID of a tweet. This ID can then be
        used as a 'starting point' from which to search. The query is
        required and has been set to a commonly used word by default.
        The variable 'days_ago' has been initialized to the maximum
        amount we are able to search back in time (9).'''
    if date:
        # return an ID from the start of the given day
        td = date + dt.timedelta(days=1)
        tweet_date = '{0}-{1:0>2}-{2:0>2}'.format(td.year, td.month, td.day)
        tweet = api.search(q=query, count=1, until=tweet_date)
    else:
        # return an ID from __ days ago
        td = dt.datetime.now() - dt.timedelta(days=days_ago)
        tweet_date = '{0}-{1:0>2}-{2:0>2}'.format(td.year, td.month, td.day)
        # get list of up to 10 tweets
        tweet = api.search(q=query, count=10, until=tweet_date)
    print('search limit (start/stop):', tweet[0].created_at)
    # return the id of the first tweet in the list
    return tweet[0].id


def write_tweets(tweets, filename):
    ''' Function that appends tweets to a file. '''
    with open(filename, 'a') as f:
        for tweet in tweets:
            json.dump(tweet._json, f)
            f.write('\n')
def main():
    ''' This is a script that continuously searches for tweets
        that were created over a given number of days. The search
        dates and search phrase can be changed below. '''

    ''' search variables: '''
    search_phrases = ['#Messi']
    time_limit = 1.5                    # runtime limit in hours
    max_tweets = 200                    # number of tweets per search (will be
                                        # iterated over) - maximum is 100
    min_days_old, max_days_old = 1, 5   # search limits e.g., from 7 to 8
                                        # gives current weekday from last week,
                                        # min_days_old=0 will search from right now

    # loop over search items,
    # creating a new file for each
    for search_phrase in search_phrases:
        print('Search phrase =', search_phrase)

        ''' other variables '''
        name = search_phrase.split()[0]
        json_file_root = name + '/' + name
        os.makedirs(os.path.dirname(json_file_root), exist_ok=True)
        read_IDs = False

        # open a file in which to store the tweets
        if max_days_old - min_days_old == 1:
            d = dt.datetime.now() - dt.timedelta(days=min_days_old)
            day = '{0}-{1:0>2}-{2:0>2}'.format(d.year, d.month, d.day)
        else:
            d1 = dt.datetime.now() - dt.timedelta(days=max_days_old-1)
            d2 = dt.datetime.now() - dt.timedelta(days=min_days_old)
            day = '{0}-{1:0>2}-{2:0>2}_to_{3}-{4:0>2}-{5:0>2}'.format(
                d1.year, d1.month, d1.day, d2.year, d2.month, d2.day)
        json_file = json_file_root + '_' + day + '.json'
        if os.path.isfile(json_file):
            print('Appending tweets to file named: ', json_file)
            read_IDs = True

        # authorize and load the twitter API
        api = load_api()

        # set the 'starting point' ID for tweet collection
        if read_IDs:
            # open the json file and get the latest tweet ID
            with open(json_file, 'r') as f:
                lines = f.readlines()
                max_id = json.loads(lines[-1])['id']
                print('Searching from the bottom ID in file')
        else:
            # get the ID of a tweet that is min_days_old
            if min_days_old == 0:
                max_id = -1
            else:
                max_id = get_tweet_id(api, days_ago=(min_days_old-1))

        # set the smallest ID to search for
        since_id = get_tweet_id(api, days_ago=(max_days_old-1))
        print('max id (starting point) =', max_id)
        print('since id (ending point) =', since_id)

        ''' tweet gathering loop '''
        start = dt.datetime.now()
        end = start + dt.timedelta(hours=time_limit)
        count, exitcount = 0, 0
        while dt.datetime.now() < end:
            count += 1
            print('count =', count)

            # collect tweets and update max_id
            tweets, max_id = tweet_search(api, search_phrase, max_tweets,
                                          max_id=max_id, since_id=since_id,
                                          geocode=USA)

            # write tweets to file in JSON format
            if tweets:
                write_tweets(tweets, json_file)
                exitcount = 0
            else:
                exitcount += 1
                if exitcount == 3:
                    if search_phrase == search_phrases[-1]:
                        sys.exit('Maximum number of empty tweet strings reached - exiting')
                    else:
                        print('Maximum number of empty tweet strings reached - breaking')
                        break


if __name__ == "__main__":
    main()
It throws the following error:
Traceback (most recent call last):
File "search.py", line 189, in <module>
main()
File "search.py", line 157, in main
since_id = get_tweet_id(api, days_ago=(max_days_old-1))
File "search.py", line 80, in get_tweet_id
tweet = api.search(q=query, count=10, until=tweet_date)
File "/usr/local/lib/python3.5/dist-packages/tweepy/binder.py", line 245, in _call
return method.execute()
File "/usr/local/lib/python3.5/dist-packages/tweepy/binder.py", line 229, in execute
raise TweepError(error_msg, resp, api_code=api_error_code)
tweepy.error.TweepError: [{'code': 215, 'message': 'Bad Authentication data.'}]
I entered the relevant tokens, but it still doesn't work. Any help will be appreciated.
It's rare, but it sometimes happens that the application keys need to be regenerated because of something on Twitter's back end. I don't know if that's your issue, but it's worth trying.
Also, you are not actually streaming tweets; streaming uses a separate API. What you are doing is using Twitter's REST API to search for tweets that have already occurred.
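To illustrate the difference, here is a hedged sketch of my own (not part of the answer): actual live streaming of a hashtag with Tweepy 3.x goes through a StreamListener rather than api.search. It assumes the same credentials as in the question's load_api() helper.

import tweepy

class HashtagListener(tweepy.StreamListener):
    def on_status(self, status):
        # called once for every live tweet matching the filter
        print(status.text)

    def on_error(self, status_code):
        # disconnect instead of retrying aggressively on rate-limit errors
        if status_code == 420:
            return False

api = load_api()   # the question's own helper, reused here
stream = tweepy.Stream(auth=api.auth, listener=HashtagListener())
stream.filter(track=['#Messi'])   # live tweets containing the hashtag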
I need your help. I am trying to export the data mined with tweepy to an xlsx file using xlsxwriter, but it gives an error. How can I solve this?
EDIT 1:
(One Solution, by user Eli Lopez)
It worked, but it only wrote one line... How do I solve this?
Are there other solutions?
(Many tweets are caught; I need to export them all to xlsx.)
This is my code:
import json
import csv
import xlsxwriter
import tweepy
from tweepy import OAuthHandler
consumer_key = "my_key"
consumer_secret = "my_key"
access_token = "my_key"
access_token_secret = "my_key"
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
def tweet_to_xlsx(tweet):
    tweet_list = []
    tweet_list.append([tweet.user.screen_name, tweet.text])
    # tweet_list.append(tweet.text)

    workbook = xlsxwriter.Workbook('tweet.xlsx')
    worksheet = workbook.add_worksheet()

    row = 0
    col = 0

    for user, tweet in tweet_list:
        worksheet.write(row, col, user)
        worksheet.write(row, col + 1, tweet)
        row += 1

    workbook.close()

results = api.search(q=name, lang=lang, count=tweetCount)

for tweet in results:
    print(tweet.user.screen_name, "Twittou:", tweet.text)
    tweet_to_xlsx(tweet)
Error:
Traceback (most recent call last):
File "extTwitter.py", line 113, in <module>
tweet_to_xlsx(tweet)
File "extTwitter.py", line 60, in tweet_to_xlsx
for user, tweet in tweet_list:
ValueError: too many values to unpack (expected 2)
When you are appending, you're appending individual items, not lists:
tweet_list = [USER, TWEET, USER, TWEET]
What you want is a list of lists:
tweet_list = [[USER, TWEET], [USER, TWEET]]
What your code should be:
tweet_list.append([tweet.user.screen_name, tweet.text])
# you could also use () instead of [], as tuples are faster
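A follow-up to EDIT 1 (my own sketch, not from the answer): only one line ends up in the file because tweet_to_xlsx creates a fresh tweet_list and a fresh tweet.xlsx workbook for every tweet, so each call overwrites the previous one. Collecting all the results first and writing the workbook once avoids that; name, lang and tweetCount are the same variables used in the question.

def tweets_to_xlsx(tweets, filename='tweet.xlsx'):
    workbook = xlsxwriter.Workbook(filename)
    worksheet = workbook.add_worksheet()
    # one row per tweet, written in a single pass
    for row, tweet in enumerate(tweets):
        worksheet.write(row, 0, tweet.user.screen_name)
        worksheet.write(row, 1, tweet.text)
    workbook.close()

results = api.search(q=name, lang=lang, count=tweetCount)
tweets_to_xlsx(results)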