I am trying to create a function in Python to retrieve and filter tweets, but when I call the function I get a strange error: min() arg is an empty sequence. I have attached the function definition and the calling code below.
import re
import json
import xml
import numpy as np
from collections import Counter
!pip install TwitterAPI
from TwitterAPI import TwitterAPI
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
import requests
# disabling urllib3 warnings
requests.packages.urllib3.disable_warnings()
import matplotlib.pyplot as plt
%matplotlib inline
keywords = ["donkey", "condition", "motion"] # INSERT YOUR CODE
print(keywords)
CONSUMER_KEY = "xL6gbYUG6uNteXxtaVzSopZOe"
CONSUMER_SECRET = "tvH5dzMtDeZ5u8Tvw2fxq8sCnWU07KenVXcNAEplKBD4uu6AlG"
OAUTH_TOKEN = "1260044898767495170-i60LmXDg7XtEGaiBiTtBXNhSTtRHVk"
OAUTH_TOKEN_SECRET = "L7yCVjWRB9MJhJL0kmnc5DWHlgYfaslrPWLybcXZyswkv"
# Authenticating with your application credentials
api = TwitterAPI(CONSUMER_KEY, CONSUMER_SECRET, OAUTH_TOKEN,
OAUTH_TOKEN_SECRET, api_version='2')
print(api)
PLACE_LAT = 32.7767 # INSERT YOUR CODE
PLACE_LON = -96.797 # INSERT YOUR CODE
DELTA_LAT = 1.0
DELTA_LON = 1.0
def retrieve_tweets(api, keyword, batch_count, total_count, latitude,
                    longitude, delta_lat, delta_lon):
    """
    collects tweets using the Twitter search API

    api: Twitter API instance
    keyword: search keyword
    batch_count: maximum number of tweets to collect per each request
    total_count: maximum number of tweets in total
    latitude: latitude of the location centre from where tweets are retrieved
    longitude: longitude of the location centre from where tweets are retrieved
    delta_lat: latitude delta (from the location centre) in degrees
    delta_lon: longitude delta (from the location centre) in degrees
    """
    # the collection of tweets to be returned
    tweets_unfiltered = []
    tweets = []
    # the number of tweets within a single query
    batch_count = str(batch_count)
    '''
    You are required to insert your own code where instructed to
    perform the first query to Twitter API.
    Hint: revise the practical session on Twitter API on how to
    perform query to Twitter API.
    '''
    resp = api.request('tweets/search/recent',
                       {'query': keyword,            # INSERT YOUR CODE
                        'max_results': total_count,  # INSERT YOUR CODE
                        'tweet.fields': {'lang': 'en'},
                        'place.fields': {
                            'geo': {
                                "type": "Feature",
                                "bbox": [
                                    longitude - delta_lon,
                                    latitude - delta_lat,
                                    longitude + delta_lon,
                                    latitude + delta_lat
                                ],
                                "properties": {}
                            }}})
    # check first if there was an error
    if ('errors' in resp.json()):
        errors = resp.json()['title']
        if (errors == 'Invalid Request'):
            print('Too many attempts to load tweets or too many tweets to load.')
            print('You need to wait for a few minutes before accessing Twitter API again or reduce max_results.')
    if ('statuses' in resp.json()):
        tweets_unfiltered += resp.json()['statuses']
        tweets = [tweet for tweet in tweets_unfiltered
                  if ((tweet['retweeted'] != True) and ('RT #' not in tweet['text']))]
        # find the max_id_str for the next batch
        ids = [tweet['id'] for tweet in tweets_unfiltered]
        max_id_str = str(min(ids))
        # loop until as many tweets as total_count is collected
        number_of_tweets = len(tweets)
        while number_of_tweets < total_count:
            resp = api.request('search/tweets',
                               {'q': keyword,
                                'count': total_count,
                                'lang': 'en',
                                'result_type': 'recent',
                                'max_id': max_id_str,
                                'geocode': '{},{},{}mi'.format(latitude, longitude, delta_lat, delta_lon)})  # INSERT YOUR CODE
            if ('statuses' in resp.json()):
                tweets_unfiltered += resp.json()['statuses']
                tweets = [tweet for tweet in tweets_unfiltered
                          if ((tweet['retweeted'] != True) and ('RT #' not in tweet['text']))]
                ids = [tweet['id'] for tweet in tweets_unfiltered]
                max_id_str = str(min(ids))
                number_of_tweets = len(tweets)
        print("{} tweets are collected for keyword {}. Last tweet created at {}".format(
            number_of_tweets, keyword, tweets[number_of_tweets - 1]['created_at']))
    ids = [int(tweet['id']) for tweet in tweets_unfiltered]
    max_id_str = str(min(ids))
    tweets = [tweet for tweet in tweets_unfiltered
              if (('RT #' not in tweet['text']) & (tweet['lang'] == 'en'))]
    # loop until as many tweets as total_count is collected
    number_of_tweets = len(tweets)
    while number_of_tweets < total_count:
        resp = api.request('tweets/search/recent',
                           {'query': keyword,  # INSERT YOUR CODE
                            'max_results': total_count,
                            'until_id': max_id_str,
                            'tweet.fields': {'lang': 'en'},
                            'place.fields': {
                                'geo': {
                                    "type": "Feature",
                                    "bbox": [
                                        longitude - delta_lon,
                                        latitude - delta_lat,
                                        longitude + delta_lon,
                                        latitude + delta_lat
                                    ],
                                    "properties": {}
                                }}})
        tweets_unfiltered += resp
        tweets = [tweet for tweet in tweets_unfiltered
                  if (('RT #' not in tweet['text']) & (tweet['lang'] == 'en'))]
        ids = [int(tweet['id']) for tweet in tweets_unfiltered]
        max_id_str = str(min(ids))
        number_of_tweets = len(tweets)
    print("{} tweets are collected for keyword {}.".format(number_of_tweets, keyword))
    return tweets
k1_tweets = retrieve_tweets(api, 'donkey', 50, 200, PLACE_LAT, PLACE_LON, DELTA_LAT, DELTA_LON)
k2_tweets = retrieve_tweets(api, 'condition', 50, 200, PLACE_LAT, PLACE_LON, DELTA_LAT, DELTA_LON)
k3_tweets = retrieve_tweets(api, 'motion', 50, 200, PLACE_LAT, PLACE_LON, DELTA_LAT, DELTA_LON)
I have edited the question to include the rest of the code. If someone knows what is causing the error, that would be great.
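From the traceback I understand that min() was handed an empty list, i.e. ids ends up empty. A guard like the sketch below (the 'data' key is my guess at where the v2 recent-search endpoint puts tweets, versus the v1.1 'statuses' key) avoids the crash, but I still don't understand why ids is empty in the first place:

```python
payload = resp.json()
statuses = payload.get('data', [])  # [] when the key is absent, instead of a KeyError
ids = [int(tweet['id']) for tweet in statuses]
if ids:  # min() raises "min() arg is an empty sequence" on an empty list
    max_id_str = str(min(ids))
```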
Related
I am trying to scrape tweets from a specified user based on a specific keyword using Tweepy. I tried using
if api.search(q="$"):
but I am running into an error. How can I solve this problem?
#Import the libraries
import tweepy
api_key = ""
api_key_secret = ""
access_token = ""
access_token_secret = ""
auth_handler = tweepy.OAuthHandler(consumer_key=api_key, consumer_secret=api_key_secret)
auth_handler.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth_handler,wait_on_rate_limit=True)
user = api.get_user("TheShual")
print("User details:")
print(user.name)
print(user.description)
print(user.location)
userID = "TheShual"
tweets = api.user_timeline(screen_name=userID,
                           # 200 is the maximum allowed count
                           count=20,
                           include_rts=False,
                           # necessary to keep full_text;
                           # otherwise only the first 140 characters are extracted
                           tweet_mode='extended')
for info in tweets[:10]:
    if api.search(q="$"):
        print(info.created_at)
        print(info.full_text)
        print("\n")
I am trying to extract all tweets that contain a specific keyword, together with their geo locations.
For example, I want to download all tweets in English that contain the keyword 'iphone' from 'france' and 'singapore'.
My code
import tweepy
import csv
import pandas as pd
import sys
# API credentials here
consumer_key = 'INSERT CONSUMER KEY HERE'
consumer_secret = 'INSERT CONSUMER SECRET HERE'
access_token = 'INSERT ACCESS TOKEN HERE'
access_token_secret = 'INSERT ACCESS TOKEN SECRET HERE'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True)
# Search word/hashtag value
HashValue = ""
# search start date value. the search will start from this date to the current date.
StartDate = ""
# getting the search word/hashtag and date range from user
HashValue = input("Enter the hashtag you want the tweets to be downloaded for: ")
StartDate = input("Enter the start date in this format yyyy-mm-dd: ")
# Open/Create a file to append data
csvFile = open(HashValue+'.csv', 'a')
#Use csv Writer
csvWriter = csv.writer(csvFile)
for tweet in tweepy.Cursor(api.search, q=HashValue, count=20, lang="en", since=StartDate, tweet_mode='extended').items():
    print(tweet.created_at, tweet.full_text)
    csvWriter.writerow([tweet.created_at, tweet.full_text.encode('utf-8')])
print ("Scraping finished and saved to "+HashValue+".csv")
#sys.exit()
How can this be done?
As I understand it, you are looking to get geo data off searched tweets rather than filtering the search by geocode.
Here is a code sample with the relevant fields you are interested in. These may or may not be provided, depending on the tweeter's privacy settings.
Note there is no "since" parameter on the search API:
https://tweepy.readthedocs.io/en/latest/api.html#help-methods
https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets
The standard Twitter API search goes back 7 days. The premium and enterprise APIs have 30-day search as well as Full Archive search, but you will pay $$$.
Unfortunately tweepy still hasn't had its models documented:
https://github.com/tweepy/tweepy/issues/720
So if you want to look at the tweet object, you can use the pprint package and run:
pprint(tweet.__dict__)
One difference I noticed was that the "text" field in the JSON becomes "full_text" on the object.
There is also information on the original tweet in there if the one you found was a quote tweet; it has the same fields from what I could see.
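For example (the import is the only addition; tweet is any result from the cursor below):

```python
from pprint import pprint

# dump every attribute tweepy parsed onto the Status object
pprint(tweet.__dict__)
```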
Anyway, here's the code. I added a max tweet count for looping through the cursor while I was testing, to avoid blowing any API limits.
Let me know if you want CSV code, but it looks like you can handle that already (a short CSV sketch follows the sample output below, just in case).
import tweepy

# API credentials here
consumer_key = 'your-info'
consumer_secret = 'your-info'
access_token = 'your-info'
access_token_secret = 'your-info'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

searchString = "iPhone"
cursor = tweepy.Cursor(api.search, q=searchString, count=20, lang="en", tweet_mode='extended')

maxCount = 1
count = 0
for tweet in cursor.items():
    print()
    print("Tweet Information")
    print("================================")
    print("Text: ", tweet.full_text)
    print("Geo: ", tweet.geo)
    print("Coordinates: ", tweet.coordinates)
    print("Place: ", tweet.place)
    print()
    print("User Information")
    print("================================")
    print("Location: ", tweet.user.location)
    print("Geo Enabled? ", tweet.user.geo_enabled)
    count = count + 1
    if count == maxCount:
        break
Will output something like this:
Tweet Information
================================
Text: NowPlaying : Hashfinger - Leaving
https://derp.com
#iPhone free app https://derp.com
#peripouwebradio
Geo: None
Coordinates: None
Place: None
User Information
================================
Location: Greece
Geo Enabled? True
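And since CSV came up: a minimal sketch of writing the same fields out, assuming the api and searchString defined above (the filename, columns, and the 50-item cap are arbitrary choices):

```python
import csv

cursor = tweepy.Cursor(api.search, q=searchString, count=20, lang="en", tweet_mode='extended')

# newline='' avoids blank rows on Windows; utf-8 handles emoji and accents
with open('iphone_tweets.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['created_at', 'full_text', 'geo', 'coordinates', 'place', 'user_location'])
    for tweet in cursor.items(50):  # cap the total to stay inside rate limits
        writer.writerow([tweet.created_at, tweet.full_text, tweet.geo,
                         tweet.coordinates, tweet.place, tweet.user.location])
```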
Using the python-twitter library (https://github.com/bear/python-twitter), I am trying to retrieve the 1000 most recent tweets for each of the top 10 trends in a location, using the Yahoo Where On Earth ID, but I can't figure out how to retrieve the tweets for the trends in the returned list.
I don't know if I'm missing something in the documentation; I can't find any examples online of how to get trend-specific tweets from the returned list, or how to remove the retweets.
import twitter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
ACCESS_TOKEN = 'XXXX'
ACCESS_TOKEN_SECRET = 'XXXX'
CONSUMER_KEY = 'XXXX'
CONSUMER_SECRET = 'XXXX'
# Define the location id for the UK
WOEID = 23424975
# Define language of tweets
LANG = "en"
# Define type of tweets we are after
TWEETS_TYPE = "recent"
# Define max number of tweets per trend
MAX_STATUSES = 1000
# API config
# API with request rate limited
api = twitter.Api(CONSUMER_KEY,
CONSUMER_SECRET,
ACCESS_TOKEN,
ACCESS_TOKEN_SECRET,
sleep_on_rate_limit=True)
print(api.VerifyCredentials())
# Query the Twitter API for the current top 10 trends in the UK.
uk_trends = api.GetTrendsWoeid(WOEID)
print(uk_trends)
# Return the 1000 most recent tweets for each trend
# This is where I want to retrieve the tweets per trend
for trend in uk_trends:
    count = MAX_STATUSES
    trend = trend
    search_results = api.GetSearch(term=trend, count=count)
    print(search_results)
Should I be using the twitter API itself instead?
I was able to solve the problem by accessing the name attribute of the Trend() object, casting it to a string, and iterating through the results returned, as follows:
```
import os
import json
import csv
import twitter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import matplotlib.pyplot as plt
ACCESS_TOKEN = 'XXXX'
ACCESS_TOKEN_SECRET = 'XXXX'
CONSUMER_KEY = 'XXXX'
CONSUMER_SECRET = 'XXXX'
# Define the location id for the UK
# With Yahoo Where on Earth ID
WOEID = 23424975
# Define language of tweets
LANG = "en"
# Define type of tweets we are after
TWEETS_TYPE = "recent"
# Define max number of tweets per trend
MAX_STATUSES = 10
# API configuration
# API with request rate limited
api = twitter.Api(CONSUMER_KEY,
CONSUMER_SECRET,
ACCESS_TOKEN,
ACCESS_TOKEN_SECRET,
sleep_on_rate_limit=True)
# Check twitter account api details are correct
# print(api.VerifyCredentials())
# Query the Twitter API for the current top 10 trends in the UK.
uk_trends = api.GetTrendsWoeid(WOEID)
print(uk_trends)
# Return the most recent tweets (up to MAX_STATUSES) for each trend
for trend in uk_trends:
    '''
    Extract name of each trend returned by Trend model
    and search required count value of recent tweets per trend
    '''
    trend = str(trend.name)
    count = MAX_STATUSES
    search_results = api.GetSearch(term=trend, count=count)
    print(search_results)
```
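For the retweet part of my original question, two options that look workable (shown as a sketch; I have not verified the attribute name against every python-twitter version) are excluding retweets at query time with the standard -filter:retweets search operator, or dropping results that carry a retweeted_status:

```python
# option 1: exclude retweets in the query itself
search_results = api.GetSearch(term=trend + " -filter:retweets", count=count)

# option 2: drop retweets after the fact
originals = [s for s in search_results if s.retweeted_status is None]
```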
I am quite new to the Twitter API and Tweepy, and I am confused by the rate-limiting concept. I am using the streaming API and want to gather sample tweets without using any filters such as hashtags or location. Some sources state I should not get rate limited with sample tweets, since I am getting 1% of tweets, and some state otherwise. I keep getting error 420 very often; is there a way to avoid it or make it smoother?
Thank you so much for your help
My code:
import json
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from textblob import TextBlob
from elasticsearch import Elasticsearch
from datetime import datetime
# import twitter keys and tokens
from config import *
# create instance of elasticsearch
es = Elasticsearch()
indexName = "test_new_fields"
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''
class TweetStreamListener(StreamListener):

    hashtags = []

    # on success
    def on_data(self, data):
        # decode json
        dict_data = json.loads(data)  # data is a json string
        # print(data)  # to print the twitter json string
        print(dict_data)

        # pass tweet into TextBlob
        tweet = TextBlob(dict_data["text"])

        # determine if sentiment is positive, negative, or neutral
        if tweet.sentiment.polarity < 0:
            sentiment = "negative"
        elif tweet.sentiment.polarity == 0:
            sentiment = "neutral"
        else:
            sentiment = "positive"

        # output polarity sentiment and tweet text
        print(str(tweet.sentiment.polarity) + " " + sentiment + " " + dict_data["text"])

        try:
            # check if there are any hashtags
            if len(dict_data["entities"]["hashtags"]) != 0:
                hashtags = dict_data["entities"]["hashtags"]
            # if there are no hashtags, add an empty list
            else:
                hashtags = []
        except:
            pass

        es.indices.put_settings(index=indexName, body={"index.blocks.write": False})

        # add text and sentiment info to elasticsearch
        es.index(index=indexName,
                 doc_type="test-type",
                 body={"author": dict_data["user"]["screen_name"],
                       "date": dict_data["created_at"],  # unfortunately this gets stored as a string
                       "location": dict_data["user"]["location"],  # user location
                       "followers": dict_data["user"]["followers_count"],
                       "friends": dict_data["user"]["friends_count"],
                       "time_zone": dict_data["user"]["time_zone"],
                       "lang": dict_data["user"]["lang"],
                       # "timestamp": float(dict_data["timestamp_ms"]),  # double not recognised as date
                       "timestamp": dict_data["timestamp_ms"],
                       "datetime": datetime.now(),
                       "message": dict_data["text"],
                       "hashtags": hashtags,
                       "polarity": tweet.sentiment.polarity,
                       "subjectivity": tweet.sentiment.subjectivity,
                       # handle geo data
                       # "coordinates": dict_data[coordinates],
                       "sentiment": sentiment})
        return True

    # on failure
    def on_error(self, error):
        print("error: " + str(error))


if __name__ == '__main__':
    # create instance of the tweepy tweet stream listener
    listener = TweetStreamListener()

    # set twitter keys/tokens
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    while True:
        try:
            # create instance of the tweepy stream
            stream = Stream(auth, listener)
            # search twitter for sample tweets
            stream.sample()
        except KeyError:
            pass
OK, I have found the solution to this problem: changing the method from on_data to on_status removed all the issues causing the error 420.
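A minimal sketch of the on_status shape, assuming the same tweepy version (on_status receives a parsed Status object instead of the raw JSON string, so the json.loads step goes away; returning False from on_error on a 420 disconnects instead of hammering the endpoint):

```python
from tweepy.streaming import StreamListener

class SampleStreamListener(StreamListener):

    def on_status(self, status):
        # status is already parsed; attribute access replaces json.loads
        print(status.text)
        return True

    def on_error(self, status_code):
        # 420 means the connection is being rate limited
        if status_code == 420:
            return False  # disconnect rather than retry with growing backoff
```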
This is driving me crazy. As you can see below, I am trying to use a simple while loop to perform a couple of Tweepy searches and append them to a data frame. For some reason, after pulling the first set of 100 tweets, it just repeats that set instead of performing a new search. Any advice would be greatly appreciated.
import sys
import csv
import pandas as pd
import tweepy
from tweepy import OAuthHandler
consumer_key = ''
consumer_secret = ''
access_token = ''
access_secret = ''
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth)
num_results = 200
result_count = 0
last_id = None
df = pd.DataFrame(columns=['Name', 'Location', 'Followers', 'Text', 'Coordinates'])
while result_count < num_results:
    result = api.search(q='', count=100, geocode="38.996918,-104.995826,190mi", since_id=last_id)
    for tweet in result:
        user = tweet.user
        last_id = tweet.id_str
        name = user.name
        friends = user.friends_count
        followers = user.followers_count
        text = tweet.text.encode('utf-8')
        location = user.location
        coordinates = tweet.coordinates
        df.loc[result_count] = pd.Series({'Name': name, 'Location': location, 'Followers': followers, 'Text': text, 'Coordinates': coordinates})
        print(text)
        result_count += 1
# Save to Excel
print("Writing all tables to Excel...")
df.to_csv('out.csv')
print("Excel Export Complete.")
The API.search method returns tweets that match a specified query. It's not a streaming API, so it returns all the data at once.
Furthermore, in your query parameters you have added count, which specifies the number of statuses to retrieve per request.
So the problem is that your query returns the first 100 results of the complete set on each while iteration.
I suggest you change the code to something like this:
result = api.search(q='', geocode="38.996918,-104.995826,190mi", since_id=last_id)
for tweet in result:
    user = tweet.user
    last_id = tweet.id_str
    name = user.name
    friends = user.friends_count
    followers = user.followers_count
    text = tweet.text.encode('utf-8')
    location = user.location
    coordinates = tweet.coordinates
    df.loc[result_count] = pd.Series({'Name': name, 'Location': location, 'Followers': followers, 'Text': text, 'Coordinates': coordinates})
    print(text)
Let me know.
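If you do want to page through more than one batch, a hedged alternative is to let tweepy's Cursor handle the pagination instead of tracking since_id by hand (a sketch, assuming the same api object; the 200 cap is an arbitrary choice):

```python
import tweepy

# Cursor walks the result pages for you; items(n) caps the total returned
for tweet in tweepy.Cursor(api.search, q='',
                           geocode="38.996918,-104.995826,190mi").items(200):
    print(tweet.text.encode('utf-8'))
```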