Python Tweepy get all tweets based on Geocode

I am trying to get all tweets within a certain radius around given coordinates. The script runs without errors, but zero entries are returned. The strange thing is that exactly the same code worked for me a few days ago, and now it does not, and I am stuck :(
import json
import pandas as pd
import tweepy

# Twitter credentials for the app
consumer_key = 'xxx'
consumer_secret = 'xxx'
access_key = 'xxx'
access_secret = 'xxx'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

# Create list for column names
COLS = ['id', 'created_at', 'lang', 'original text', 'user_name',
        'place', 'place type', 'bbx', 'coordinates']

geo = '48.136353, 11.575004, 25km'

def write_tweets(keyword):
    # create dataframe from the defined column list
    df = pd.DataFrame(columns=COLS)
    # iterate through pages with the given condition,
    # using a tweepy.Cursor object with the pages() method
    for page in tweepy.Cursor(api.search, q=keyword,
                              include_rts=False,
                              geocode=geo).pages():
        for tweet in page:
            # list holding the parsed fields of one tweet
            new_entry = []
            # raw JSON data from the Twitter API
            tweet = tweet._json
            # append the parsed JSON data to the list
            new_entry += [tweet['id'], tweet['created_at'], tweet['lang'],
                          tweet['text'], tweet['user']['name']]
            # check if a place name is available; if not, the entry is 'no place'
            try:
                place = tweet['place']['name']
            except TypeError:
                place = 'no place'
            new_entry.append(place)
            try:
                place_type = tweet['place']['place_type']
            except TypeError:
                place_type = 'na'
            new_entry.append(place_type)
            try:
                bbx = tweet['place']['bounding_box']['coordinates']
            except TypeError:
                bbx = 'na'
            new_entry.append(bbx)
            # check if coordinates are available; if not, the entry is 'no coordinates'
            try:
                coord = tweet['coordinates']['coordinates']
            except TypeError:
                coord = 'no coordinates'
            new_entry.append(coord)
            # wrap up all the data into a data frame
            single_tweet_df = pd.DataFrame([new_entry], columns=COLS)
            df = df.append(single_tweet_df, ignore_index=True)
    # get rid of tweets without a place
    df_cleaned = df[df.place != 'no place']
    print("tweets with place:")
    print(len(df[df.place != 'no place']))
    print("tweets with coordinates:")
    print(len(df[df.coordinates != 'no coordinates']))
    df_cleaned.to_csv('tweets_' + geo + '.csv', columns=COLS, index=False)

# declare the keyword as a query
keyword = '*'
# call the main method, passing the keyword
write_tweets(keyword)
The geocode should actually work like this. Does anyone have an idea?

When you declare the variable geo, don't leave any spaces between the commas and the values.
It should look like this:
geo='48.136353,11.575004,25km'
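As a quick sanity check, a minimal sketch reusing the question's api and keyword variables (the pages(1) limit is only to keep the test short):
# geo now has no spaces after the commas
geo = '48.136353,11.575004,25km'
for page in tweepy.Cursor(api.search, q=keyword, geocode=geo).pages(1):
    for tweet in page:
        print(tweet.text)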

Related

Tweepy returns same tweets when scraping data repeatedly

I am scraping tweets from Twitter. Since Twitter limits this, I am scraping 2500 tweets every 15 minutes; however, I observe that each run after 15 minutes returns the same tweets. Is there any way I can skip the previously scraped tweets using some offset?
Thank you!
Here is my code:
# Import libraries
import tweepy
from tweepy import OAuthHandler
import csv
import pandas as pd
import time

# Twitter credentials
consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''

# Pass your twitter credentials to tweepy via its OAuthHandler
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

def extract_tweets(search_words, date_since, numTweets):
    return tweepy.Cursor(api.search, q=search_words, lang="en",
                         since=date_since, tweet_mode='extended').items(numTweets)

def scrapetweets(search_words, date_since, numTweets, numRuns):
    # Define a pandas dataframe to store the data:
    db_tweets = pd.DataFrame(columns=['username', 'acctdesc', 'location',
                                      'following', 'followers', 'totaltweets',
                                      'usercreatedts', 'tweetcreatedts',
                                      'retweetcount', 'text', 'hashtags'])
    for i in range(numRuns):
        tweets = extract_tweets(search_words, date_since, numTweets)
        # Store these tweets in a python list
        tweet_list = [tweet for tweet in tweets]
        print(len(tweet_list))
        noTweets = 0
        for tweet in tweet_list:
            username = tweet.user.screen_name
            acctdesc = tweet.user.description
            location = tweet.user.location
            following = tweet.user.friends_count
            followers = tweet.user.followers_count
            totaltweets = tweet.user.statuses_count
            usercreatedts = tweet.user.created_at
            tweetcreatedts = tweet.created_at
            retweetcount = tweet.retweet_count
            hashtags = tweet.entities['hashtags']
            lst = []
            for h in hashtags:
                lst.append(h['text'])
            try:
                text = tweet.retweeted_status.full_text
            except AttributeError:  # Not a retweet
                text = tweet.full_text
            itweet = [username, acctdesc, location, following, followers,
                      totaltweets, usercreatedts, tweetcreatedts,
                      retweetcount, text, lst]
            db_tweets.loc[len(db_tweets)] = itweet
            noTweets += 1
            print(noTweets, itweet)
        print('no. of tweets scraped for run {} is {}'.format(i + 1, noTweets))
        if i + 1 != numRuns:
            time.sleep(920)
    # Store the dataframe in a csv file
    filename = "tweets.csv"
    db_tweets.to_csv(filename, mode='a', index=False)

# Initialise these variables:
search_words = "#India OR #COVID-19"
date_since = "2020-04-29"
numTweets = 2500
numRuns = 10

# Call the function scrapetweets
program_start = time.time()
scrapetweets(search_words, date_since, numTweets, numRuns)
program_end = time.time()
print('Scraping has completed!')
print('Total time taken to scrape is {} minutes.'.format(
    round((program_end - program_start) / 60, 2)))
I referred to a blog on Medium for this purpose.
You can add a tweet ID as a validator and store it in a file, e.g. tweetid.txt.
Each time you run the script, open tweetid.txt, and if a tweet's ID is already in the file, skip that tweet.
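A minimal sketch of this idea, assuming a file named seen_ids.txt (the file name and helper functions are illustrative, not from the original post):
import os

SEEN_FILE = 'seen_ids.txt'  # hypothetical file of previously scraped tweet IDs

def load_seen_ids():
    # Read one tweet ID per line from previous runs.
    if not os.path.exists(SEEN_FILE):
        return set()
    with open(SEEN_FILE) as f:
        return {line.strip() for line in f if line.strip()}

def save_seen_ids(ids):
    # Append newly scraped IDs so the next run can skip them.
    with open(SEEN_FILE, 'a') as f:
        for tweet_id in ids:
            f.write(str(tweet_id) + '\n')

Inside the question's scraping loop, the check would then look like this:
seen = load_seen_ids()
new_ids = []
for tweet in tweet_list:
    if str(tweet.id) in seen:
        continue  # already scraped in a previous run
    new_ids.append(tweet.id)
    # ... process the tweet as before ...
save_seen_ids(new_ids)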

Index error list page out of range on danbooru twitter bot

I am using code that I found on GitHub. I had to modify some things, and it works, but sometimes (even when it is working) it gives an "IndexError: list index out of range" and then stops working.
File "bot.py", line 36, in module
imageSource = pageTable[arrayNum]["file_url"]
IndexError: list index out of range
Here is my code:
import time
import requests
import tweepy
import urllib
import os
import random

page = 1
url = 'https://danbooru.donmai.us/posts.json?tags=shimakaze_(kantai_collection) rating:s&limit=1000&page='

consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

while True:
    try:
        random.seed()
        jsURL = url + str(random.randint(1, 1000))
        response = requests.get(jsURL)
        pageTable = response.json()
        arrayNum = random.randint(0, 5)
        print arrayNum
        imageSource = pageTable[arrayNum]["file_url"]
        imageURL = imageSource
        print imageURL
        sourceURL = "http://danbooru.donmai.us/posts/" + str(pageTable[arrayNum]["id"])
        print sourceURL
        urllib.urlretrieve(imageURL, 'image.jpg')
        # Tweet the post. Limited to 500 requests/hour.
        tweetString = sourceURL + " "
        api.update_with_media('image.jpg', status=tweetString)
        os.remove('image.jpg')
        time.sleep(600)
    except tweepy.error.TweepError:
        print "Image too large, finding a different image.."
The arrayNum = random.randint(0,5) line gives the error. That code generates a number from 0 to 5 to use as an index into the Danbooru page results, so I don't know why it gives an IndexError.
It seems the response you get may be empty sometimes. I've tried
https://danbooru.donmai.us/posts.json?tags=shimakaze_(kantai_collection)%20rating:s&limit=1000&page=796
(which is a possible page within your random range) and got a response with an empty list. Calling an index on an empty list will give you the index error. Check that the response is not empty.
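A minimal sketch of that check, intended to sit inside the question's while True loop (variable names match the question's code):
pageTable = response.json()
if not pageTable:
    # Empty page: skip this iteration and pick a new random page.
    continue
# Clamp the random index so it always exists in the returned list.
arrayNum = random.randint(0, min(5, len(pageTable) - 1))
imageSource = pageTable[arrayNum]["file_url"]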

Tweepy error: unhashable type slice

I am trying to use the tweepy library to collect some data from twitter to conduct some sentiment analysis.
Here is a sample of the script I am running:
import time
import tweepy
import pandas as pd
import numpy as np

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())

# Set search query
searchquery = '"atiku" -filter:retweets'

data = api.search(q=searchquery, count=100, lang='en', result_type='mixed')
data_all = list(data.values())[0]

while (len(data_all) <= 20000):
    time.sleep(5)
    last = data_all[1]['id']
    data = api.search(q=searchquery, count=100, lang='en',
                      result_type='mixed', max_id=last)
    data_all += list(data.values())[1][1:]

print(data_all)
I have hit a roadblock in my code, as when I run it I get this error:
TypeError: unhashable type: 'slice'
I would appreciate any pointers on this.

extract tweets with some special keywords from twitter using tweepy in python

Here is my code. I want to extract tweets with some keywords from Twitter. My code doesn't give any errors, but the output file is not generated. Please help me.
import re
import csv
import tweepy
from tweepy import OAuthHandler
# TextBlob performs simple natural language processing tasks.
from textblob import TextBlob

def search():
    #text = e.get() **************************
    consumer_key = ''
    consumer_secret = ''
    access_token = ' '
    access_token_secret = ' '
    # create OAuthHandler object
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    # set access token and secret
    auth.set_access_token(access_token, access_token_secret)
    # create tweepy API object to fetch tweets
    api = tweepy.API(auth)

    def get_tweets(query, count=300):
        # empty list to store parsed tweets
        tweets = []
        target = open("tweets.txt", 'w', encoding="utf-8")
        t1 = open("review.txt", 'w', encoding="utf-8")
        # call the twitter api to fetch tweets
        q = str(query)
        a = str(q + " sarcasm")
        b = str(q + " sarcastic")
        c = str(q + " irony")
        fetched_tweets = (api.search(a, count=count)
                          + api.search(b, count=count)
                          + api.search(c, count=count))
        # parse the tweets one by one
        print(len(fetched_tweets))
        for tweet in fetched_tweets:
            # empty dictionary to store the required params of a tweet
            parsed_tweet = {}
            # save the text of the tweet
            parsed_tweet['text'] = tweet.text
            if "http" not in tweet.text:
                line = re.sub("[^A-Za-z]", " ", tweet.text)
                target.write(line + "\n")
                t1.write(line + "\n")
        return tweets

    # calling the function to get tweets
    tweets = get_tweets(query=text, count=20000)

root.mainloop()
With this code I am not getting the output file generated. Can anyone tell me what I am doing wrong?
Thanks in advance!
I just made some slight changes and it was working perfectly for me. I removed or commented out some unnecessary statements (like the review file) and changed the open function to io.open, since I have Python version 2.7. Here is the running code, hope it helps!
import re
import io
import csv
import tweepy
from tweepy import OAuthHandler
# TextBlob performs simple natural language processing tasks.
#from textblob import TextBlob

consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

# create OAuthHandler object
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# set access token and secret
auth.set_access_token(access_token, access_token_secret)
# create tweepy API object to fetch tweets
api = tweepy.API(auth)

def get_tweets(query, count=300):
    # empty list to store parsed tweets
    tweets = []
    target = io.open("mytweets.txt", 'w', encoding='utf-8')
    # call the twitter api to fetch tweets
    q = str(query)
    a = str(q + " sarcasm")
    b = str(q + " sarcastic")
    c = str(q + " irony")
    fetched_tweets = (api.search(a, count=count)
                      + api.search(b, count=count)
                      + api.search(c, count=count))
    # parse the tweets one by one
    print(len(fetched_tweets))
    for tweet in fetched_tweets:
        # empty dictionary to store the required params of a tweet
        parsed_tweet = {}
        # save the text of the tweet
        parsed_tweet['text'] = tweet.text
        if "http" not in tweet.text:
            line = re.sub("[^A-Za-z]", " ", tweet.text)
            target.write(line + "\n")
    return tweets

# calling the function to get tweets
tweets = get_tweets(query="", count=20000)
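For reference, since the Tkinter text variable was removed, the query argument can be any base keyword; a hypothetical call:
# "weather" is an arbitrary example keyword, not from the original post.
tweets = get_tweets(query="weather", count=300)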

How to export data from tweepy to xlsx?

I need your help. I am trying to export the data mined by tweepy to an xlsx file with xlsxwriter, but it is giving an error. How can I solve it?
EDIT 1 (one solution, by user Eli Lopez):
It worked, but wrote only one line... How do I solve this? Are there other solutions? (There are many tweets caught; I need to export them all to xlsx.)
This is my code:
import json
import csv
import xlsxwriter
import tweepy
from tweepy import OAuthHandler

consumer_key = "my_key"
consumer_secret = "my_key"
access_token = "my_key"
access_token_secret = "my_key"

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

def tweet_to_xlsx(tweet):
    tweet_list = []
    tweet_list.append([tweet.user.screen_name, tweet.text])
    # tweet_list.append(tweet.text)
    workbook = xlsxwriter.Workbook('tweet.xlsx')
    worksheet = workbook.add_worksheet()
    row = 0
    col = 0
    for user, tweet in tweet_list:
        worksheet.write(row, col, user)
        worksheet.write(row, col + 1, tweet)
        row += 1
    workbook.close()

results = api.search(q=name, lang=lang, count=tweetCount)
for tweet in results:
    print(tweet.user.screen_name, "Twittou:", tweet.text)
    tweet_to_xlsx(tweet)
Error:
Traceback (most recent call last):
  File "extTwitter.py", line 113, in <module>
    tweet_to_xlsx(tweet)
  File "extTwitter.py", line 60, in tweet_to_xlsx
    for user, tweet in tweet_list:
ValueError: too many values to unpack (expected 2)
When you are appending, you're appending items, not lists:
tweet_list = [USER, TWEET, USER, TWEET]
What you want is a list of lists:
tweet_list = [[USER, TWEET], [USER, TWEET]]
What your code should be:
tweet_list.append([tweet.user.screen_name, tweet.text])
# you could also use () instead of [], as tuples are faster
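On the follow-up in EDIT 1 (only one line written): the tweet_to_xlsx function in the question recreates tweet.xlsx on every call, so each tweet overwrites the previous file. A minimal sketch, assuming the question's api, name, lang and tweetCount variables, that collects all results first and writes the workbook once:
import xlsxwriter

def tweets_to_xlsx(tweets, filename='tweets.xlsx'):
    # Create the workbook once and write one row per tweet.
    workbook = xlsxwriter.Workbook(filename)
    worksheet = workbook.add_worksheet()
    for row, tweet in enumerate(tweets):
        worksheet.write(row, 0, tweet.user.screen_name)
        worksheet.write(row, 1, tweet.text)
    workbook.close()

results = api.search(q=name, lang=lang, count=tweetCount)
tweets_to_xlsx(results)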
