I'm trying to get the follower count of companies and track it over time. I have over 200 000 companies so the code I currently have would literally take years to run with current api limit.
c = tweepy.Cursor(api.followers_ids, id = a)
ids = []
for id in c.items():
time.sleep(0.01)
ids.append(id) '
In this code its one api hit for every follower. I was wondering if there is a function that just gives a follower count as a number? Also what is the twitter api limit?
Each API requests returns at most 5000 followers IDs at a time, to retrieve all the followers of the 200 000 companies, here is a very useful script from the book Mining the social web by Matthew A. Russell to solve the twitter api limit
to make robust twitter request and to access twitter's API Matthew defined these methods :
import sys
import time
from urllib2 import URLError
from httplib import BadStatusLine
import json
import twitter
def oauth_login():
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
OAUTH_TOKEN = ''
OAUTH_TOKEN_SECRET = ''
auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.Twitter(auth=auth)
return twitter_api
def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw):
# A nested helper function that handles common HTTPErrors. Return an updated
# value for wait_period if the problem is a 500 level error. Block until the
# rate limit is reset if it's a rate limiting issue (429 error). Returns None
# for 401 and 404 errors, which requires special handling by the caller.
def handle_twitter_http_error(e, wait_period=2, sleep_when_rate_limited=True):
if wait_period > 3600: # Seconds
print >> sys.stderr, 'Too many retries. Quitting.'
raise e
# See https://dev.twitter.com/docs/error-codes-responses for common codes
if e.e.code == 401:
print >> sys.stderr, 'Encountered 401 Error (Not Authorized)'
return None
elif e.e.code == 404:
print >> sys.stderr, 'Encountered 404 Error (Not Found)'
return None
elif e.e.code == 429:
print >> sys.stderr, 'Encountered 429 Error (Rate Limit Exceeded)'
if sleep_when_rate_limited:
print >> sys.stderr, "Retrying in 15 minutes...ZzZ..."
sys.stderr.flush()
time.sleep(60*15 + 5)
print >> sys.stderr, '...ZzZ...Awake now and trying again.'
return 2
else:
raise e # Caller must handle the rate limiting issue
elif e.e.code in (500, 502, 503, 504):
print >> sys.stderr, 'Encountered %iError. Retrying in %iseconds' %\
(e.e.code, wait_period)
time.sleep(wait_period)
wait_period *= 1.5
return wait_period
else:
raise e
# End of nested helper function
wait_period = 2
error_count = 0
while True:
try:
return twitter_api_func(*args, **kw)
except twitter.api.TwitterHTTPError, e:
error_count = 0
wait_period = handle_twitter_http_error(e, wait_period)
if wait_period is None:
return
except URLError, e:
error_count += 1
print >> sys.stderr, "URLError encountered. Continuing."
if error_count > max_errors:
print >> sys.stderr, "Too many consecutive errors...bailing out."
raise
except BadStatusLine, e:
error_count += 1
print >> sys.stderr, "BadStatusLine encountered. Continuing."
if error_count > max_errors:
print >> sys.stderr, "Too many consecutive errors...bailing out."
raise
here is the methods to retrieve the friends and the followers :
from functools import partial
from sys import maxint
def get_friends_followers_ids(twitter_api, screen_name=None, user_id=None,
friends_limit=maxint, followers_limit=maxint):
# Must have either screen_name or user_id (logical xor)
assert (screen_name != None) != (user_id != None),\
"Must have screen_name or user_id, but not both"
# See https://dev.twitter.com/docs/api/1.1/get/friends/ids and
# https://dev.twitter.com/docs/api/1.1/get/followers/ids for details
# on API parameters
get_friends_ids = partial(make_twitter_request, twitter_api.friends.ids,
count=5000)
get_followers_ids = partial(make_twitter_request,twitter_api.followers.ids,
count=5000)
friends_ids, followers_ids = [], []
for twitter_api_func, limit, ids, label in [
[get_friends_ids, friends_limit, friends_ids, "friends"],
[get_followers_ids, followers_limit, followers_ids, "followers"]
]:
if limit == 0: continue
cursor = -1
while cursor != 0:
# Use make_twitter_request via the partially bound callable...
if screen_name:
response = twitter_api_func(screen_name=screen_name, cursor=cursor)
else: # user_id
response = twitter_api_func(user_id=user_id, cursor=cursor)
if response is not None:
ids += response['ids']
cursor = response['next_cursor']
print >> sys.stderr, 'Fetched {0} total {1} ids for{2}'.format(len(ids),
label, (user_id or screen_name))
# XXX: You may want to store data during each iteration to provide
# an additional layer of protection from exceptional circumstances
if len(ids) >= limit or response is None:
break
# Do something useful with the IDs, like store them to disk...
return friends_ids[:friends_limit], followers_ids[:followers_limit]
# Sample usage
twitter_api = oauth_login()
friends_ids, followers_ids =get_friends_followers_ids(twitter_api,
screen_name="SocialWebMining",
friends_limit=10,
followers_limit=10)
print friends_ids
print followers_ids
Related
I'm trying to get some data from a web page. To speed up this process (they allow me to make 1000 requests per minute), I use ThreadPool.
Since there is a huge amount of data, the process is quite vulnerable to connection fails etc. so I try to log everything I can to be able to detect each mistake I did in code.
The problem is that program sometimes just stops without any exception (it acts like it is running but with no effect - I use PyCharm). I log catched exceptions everywhere I can but I can't see any exception in any log.
I assume that if there were a timeout reached, the exception would be raised and logged.
I've found out where the problem could be. Here is the code:
As a pool, I use: from multiprocessing.pool import ThreadPool as Pool
And lock: from threading import Lock
The download_category function is being used in loop.
def download_category(url):
# some code
#
# ...
log('Create pool...')
_pool = Pool(_workers_number)
with open('database/temp_produkty.txt') as f:
log('Spracovavanie produktov... vytvaranie vlakien...') # I see this in log
for url_product in f:
x = _pool.apply_async(process_product, args=(url_product.strip('\n'), url))
_pool.close()
_pool.join()
log('Presuvanie produktov z temp export do export.csv...') # I can't see this in log
temp_export_to_export_csv()
set_spracovanie_kategorie(url)
except Exception as e:
logging.exception('Got exception on download_one_category: {}'.format(url))
And process_product function:
def process_product(url, cat):
try:
data = get_product_data(url)
except:
log('{}: {} exception while getting product data... #') # I don't see this in log
return
try:
print_to_temp_export(data, cat) # I don't see this in log
except:
log('{}: {} exception while printing to csv... #') # I don't see this in log
raise
LOG function:
def log(text):
now = datetime.now().strftime('%d.%m.%Y %H:%M:%S')
_lock.acquire()
mLib.printToFile('logging/log.log', '{} -> {}'.format(now, text))
_lock.release()
I use logging module too. In this log, I see that probably 8 (number of workers) times request was sent but no answer hasn't been recieved.
EDIT1:
def get_product_data(url):
data = defaultdict(lambda: '-')
root = load_root(url)
try:
nazov = root.xpath('//h1[#itemprop="name"]/text()')[0]
except:
nazov = root.xpath('//h1/text()')[0]
under_block = root.xpath('//h2[#id="lowest-cost"]')
if len(under_block) < 1:
under_block = root.xpath('//h2[contains(text(),"Naj")]')
if len(under_block) < 1:
return False
data['nazov'] = nazov
data['url'] = url
blocks = under_block[0].xpath('./following-sibling::div[#class="shp"]/div[contains(#class,"shp")]')
i = 0
for block in blocks:
i += 1
data['dat{}_men'.format(i)] = eblock.xpath('.//a[#class="link"]/text()')[0]
del root
return data
LOAD ROOT:
class RedirectException(Exception):
pass
def load_url(url):
r = requests.get(url, allow_redirects=False)
if r.status_code == 301:
raise RedirectException
if r.status_code == 404:
if '-q-' in url:
url = url.replace('-q-','-')
mLib.printToFileWOEncoding('logging/neexistujuce.txt','Skusanie {} kategorie...'.format(url))
return load_url(url) # THIS IS NOT LOOPING
else:
mLib.printToFileWOEncoding('logging/neexistujuce.txt','{}'.format(url))
html = r.text
return html
def load_root(url):
try:
html = load_url(url)
except Exception as e:
logging.exception('load_root_exception')
raise
return etree.fromstring(html, etree.HTMLParser())
I am getting the following error from a batch request to the GMail API on Google App Engine:
RequestTooLargeError: The request to API call datastore_v3.Put() was too large.
From other questions on Stackoverflow I understand that the problem has to do with memcache. Nevertheless I don't know how to solve this issue, since it is coming up even if I run 1 request per batch and it is before I can actually do something with the content of the email (like compressing it).
My code currently looks as follows:
count = 0 #start a new batch request after every 1000 requests
batch = BatchHttpRequest(callback=get_items)
for i in new_items:
batch.add(service.users().messages().get(userId=email, id=i), request_id=str(count))
count += 1
if count % 1000 == 0:
for n in range(0, 5):
try:
batch.execute(http=http)
break
except Exception as e:
if n < 4:
time.sleep((2 ** n) + random.randint(0, 1000) / 1000)
else:
raise
batch = BatchHttpRequest(callback=get_items)
if count % 1000 != 0:
for n in range(0, 5):
try:
batch.execute(http=http)
break
except Exception as e:
if n < 4:
time.sleep((2 ** n) + random.randint(0, 1000) / 1000)
else:
raise
What would be a possible solution?
EDIT
Adding callback function
def get_items(request_id, response, exception):
if exception is not None:
print 'An error occurred: %s' % exception
else:
save_messages = request_id.split('/', 2)[1]
email = request_id.split('/', 2)[2]
in_reply_to = ''
m_id = ''
for r in response['payload']['headers']:
if r['name'].lower() == 'message-id':
m_id = r.get('value')
elif r['name'].lower() == 'in-reply-to':
in_reply_to = r.get('value')
elif r['name'].lower() == 'from':
sender, t_t = stripEmails(r.get('value'), None, None, True, email, False, False)
if m_id:
incoming = Gmail(id=m_id) #skip if exists already
else:
logging.info(response)
logging.exception('No message ID detect')
return
incoming.email = email
incoming.response = json.dumps(response)
incoming.put()
By savings the too large text item as JsonProperty with compressed=true the problem was solved.
I'm playing around the Twitter API and am in the process of developing a script to pull all Tweets with a certain hashtag down to a local mongoDB. I have it working fine when I'm downloading tweets from users, but when downloading tweets from a hashtag I get:
return loads(fp.read(),
AttributeError: 'int' object has no attribute 'read'
Can anyone offer their infinite wisdom into how I could get this script to work?
To run, save it as a .py file, cd to the folder and run:
python twitter.py
Code:
__author__ = 'Tom Cusack'
import pymongo
import oauth2 as oauth
import urllib2, json
import sys, argparse, time
def oauth_header(url, consumer, token):
params = {'oauth_version': '1.0',
'oauth_nonce': oauth.generate_nonce(),
'oauth_timestamp': int(time.time()),
}
req = oauth.Request(method = 'GET',url = url, parameters = params)
req.sign_request(oauth.SignatureMethod_HMAC_SHA1(),consumer, token)
return req.to_header()['Authorization'].encode('utf-8')
def main():
### Twitter Settings
numtweets = '32000'
verbose = 'store_true'
retweet = 'store_false'
CONSUMER_KEY = 'M7Xu9Wte0eIZvqhb4G9HnIn3G'
CONSUMER_SECRET = 'c8hB4Qwps2aODQUx7UsyzQuCRifEp3PKu6hPQll8wnJGIhbKgZ'
ACCESS_TOKEN = '3213221313-APuXuNjVMbRbZpu6sVbETbgqkponGsZJVT53QmG'
ACCESS_SECRET = 'BJHrqWC9ed3pA5oDstSMCYcUcz2pYF3DmJ7jcuDe7yxvi'
base_url = url = 'https://api.twitter.com/1.1/search/tweets.json?include_entities=true&count=200&q=#mongodb&include_rts=%s' % (retweet)
oauth_consumer = oauth.Consumer(key = CONSUMER_KEY, secret = CONSUMER_SECRET)
oauth_token = oauth.Token(key = ACCESS_TOKEN, secret = ACCESS_SECRET)
### Mongodb Settings
uri = 'mongodb://127.0.0.1:27017/SARKY'
if uri != None:
try:
conn = pymongo.MongoClient(uri)
print 'Pulling Tweets..'
except:
print 'Error: Unable to connect to DB. Check uri variable.'
return
uri_parts = pymongo.uri_parser.parse_uri(uri)
db = conn[uri_parts['database']]
db['twitter-harvest'].ensure_index('id_str')
### Helper Variables for Harvest
max_id = -1
tweet_count = 0
stream = 0
### Begin Harvesting
while True:
auth = oauth_header(url, oauth_consumer, oauth_token)
headers = {"Authorization": auth}
request = urllib2.Request(url, headers = headers)
try:
stream = urllib2.urlopen(request)
except urllib2.HTTPError, err:
if err.code == 404:
print 'Error: Unknown user. Check --user arg'
return
if err.code == 401:
print 'Error: Unauthorized. Check Twitter credentials'
return
tweet_list = json.load(stream)
if len(tweet_list) == 0:
print 'No tweets to harvest!'
return
if 'errors' in tweet_list:
print 'Hit rate limit, code: %s, message: %s' % (tweets['errors']['code'], tweets['errors']['message'])
return
if max_id == -1:
tweets = tweet_list
else:
tweets = tweet_list[1:]
if len(tweets) == 0:
print 'Finished Harvest!'
return
for tweet in tweets:
max_id = id_str = tweet['id_str']
try:
if tweet_count == numtweets:
print 'Finished Harvest- hit numtweets!'
return
if uri != None:
db[user].update({'id_str':id_str},tweet,upsert = True)
else:
print tweet['text']
tweet_count+=1
if verbose == True and uri != None:
print tweet['text']
except Exception, err:
print 'Unexpected error encountered: %s' %(err)
return
url = base_url + '&max_id=' + max_id
if __name__ == '__main__':
try:
main()
except SystemExit as e:
if e.code == 0:
pass
You initially set stream = 0. When your try...except block catches a HTTP response with a code that isn't 404 or 401, stream is still equal to 0, but your except block doesn't break out of the function.
I'd look more closely at what this response says.
I am a Python newbie and am trying to print error messages when using Tweepy to stream tweets. I used an endless loop in my streaming code because it generates InComplete Read errors otherwise. My aim is to print all the error messages I get while continuing to stream tweets, so that I am aware of errors other than the InComplete Read errors.
My streamListerner is:
# Code from http://badhessian.org/2012/10/collecting-real-time-twitter-data- with-the-streaming-api/ with minor modifications
import json, time, sys
from tweepy import StreamListener
# create an instance of a tweepy StreamListener to handle the incoming data.
class SListener(StreamListener):
def __init__(self, fprefix = 'streamer'):
# self.api = api or API()
self.counter = 0
self.fprefix = fprefix
self.output = open('../Dissertation/stream_3_data/' + fprefix + '.' + time.strftime('%Y%m%d-%H%M%S') + '.json', 'w')
self.delout = open('delete.txt', 'a')
def on_data(self, data):
if 'in_reply_to_status' in data:
self.on_status(data)
elif 'delete' in data:
delete = json.loads(data)['delete']['status']
if self.on_delete(delete['id'], delete['user_id']) is False:
return False
elif 'limit' in data:
if self.on_limit(json.loads(data)['limit']['track']) is False:
return False
elif 'warning' in data:
warning = json.loads(data)['warnings']
print warning['message']
return False
def on_status(self, status):
self.output.write(status)
self.counter += 1
if self.counter >= 5000: # New file is started every 5,000 tweets, tagged with prefix and a timestamp.
self.output.close()
self.output = open('../Dissertation/stream_3_data/' + self.fprefix + '.'
+ time.strftime('%Y%m%d-%H%M%S') + '.json', 'w')
self.counter = 0
return
def on_delete(self, status_id, user_id):
self.delout.write( str(status_id) + "\n")
return
def on_limit(self, track):
sys.stderr.write(track + "\n")
return
def on_error(self, status_code):
sys.stderr.write('Error: ' + str(status_code) + "\n")
return True # Don't kill the stream
def on_timeout(self):
sys.stderr.write("Timeout, sleeping for 60 seconds...\n")
time.sleep(60)
return True # Don't kill the stream
The part that seems to generate problems is when I try to use the streamlistener:
twitter_api = tweepy_oauth()
Q = "twitter.com"
locations = [101.615161,3.08115,101.753663,3.167507,
115.421372,39.43277,117.501099,41.05999,
120.858322,30.69094,121.9733,31.86889]
# Create a streaming API and set a timeout value of 60 seconds.
streaming_api = tweepy.streaming.Stream(twitter_api, SListener(), timeout=60)
# Used infinite loop from https://github.com/ryanmcgrath/twython/issues/288 cause
# I kept getting InComplete Read Error. Probably due to high volumes of tweets being sent to me at once
#Endless loop
while True:
try:
streaming_api.filter(follow=None, track=None, locations=locations, stall_warnings=True)
except:
e = sys.exc_info()[0] #Get exception info
print 'ERROR:',e #Print exception info
continue
My code does run and works, but I encounter the following error occasionally, which stops my entire stream:
---------------------------------------------------------------------------
IOError Traceback (most recent call last)
<ipython-input-4-fb45fa5d8307> in <module>()
34 streaming_api.filter(follow=None, track=None, locations=locations, stall_warnings=True)
35 except:
36 e = sys.exc_info()[0] #Get exception info
---> 37 print 'ERROR:',e #Print exception info
38 continue
IOError: [Errno 22] Invalid argument
The timing when the error appears is inconsistent - it ranges from 1h into the stream to an entire day into the stream.
I concluded that the issue is with the print statement because I replaced line 37 with
print 'Error'
and the same error message appears. I am not sure how to proceed when even the basic print statement does not work - any help would be great.
I am trying to get tweets related to the keyword in the code But at the python shell there is nothing its just curson only No traceback nothing.The code is here
import time
import pycurl
import urllib
import json
import oauth2 as oauth
API_ENDPOINT_URL = 'https://stream.twitter.com/1.1/statuses/filter.json'
USER_AGENT = 'TwitterStream 1.0' # This can be anything really
# You need to replace these with your own values
OAUTH_KEYS = {'consumer_key': 'ABC',
'consumer_secret': 'ABC',
'access_token_key': 'ABC',
'access_token_secret': 'ABC'}
# These values are posted when setting up the connection
POST_PARAMS = {'include_entities': 0,
'stall_warning': 'true',
'track': 'iphone,ipad,ipod'}
class TwitterStream:
def __init__(self, timeout=False):
self.oauth_token = oauth.Token(key=OAUTH_KEYS['access_token_key'], secret=OAUTH_KEYS['access_token_secret'])
self.oauth_consumer = oauth.Consumer(key=OAUTH_KEYS['consumer_key'], secret=OAUTH_KEYS['consumer_secret'])
self.conn = None
self.buffer = ''
self.timeout = timeout
self.setup_connection()
def setup_connection(self):
""" Create persistant HTTP connection to Streaming API endpoint using cURL.
"""
if self.conn:
self.conn.close()
self.buffer = ''
self.conn = pycurl.Curl()
# Restart connection if less than 1 byte/s is received during "timeout" seconds
if isinstance(self.timeout, int):
self.conn.setopt(pycurl.LOW_SPEED_LIMIT, 1)
self.conn.setopt(pycurl.LOW_SPEED_TIME, self.timeout)
self.conn.setopt(pycurl.URL, API_ENDPOINT_URL)
self.conn.setopt(pycurl.USERAGENT, USER_AGENT)
# Using gzip is optional but saves us bandwidth.
self.conn.setopt(pycurl.ENCODING, 'deflate, gzip')
self.conn.setopt(pycurl.POST, 1)
self.conn.setopt(pycurl.POSTFIELDS, urllib.urlencode(POST_PARAMS))
self.conn.setopt(pycurl.HTTPHEADER, ['Host: stream.twitter.com',
'Authorization: %s' % self.get_oauth_header()])
# self.handle_tweet is the method that are called when new tweets arrive
self.conn.setopt(pycurl.WRITEFUNCTION, self.handle_tweet)
def get_oauth_header(self):
""" Create and return OAuth header.
"""
params = {'oauth_version': '1.0',
'oauth_nonce': oauth.generate_nonce(),
'oauth_timestamp': int(time.time())}
req = oauth.Request(method='POST', parameters=params, url='%s?%s' % (API_ENDPOINT_URL,
urllib.urlencode(POST_PARAMS)))
req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), self.oauth_consumer, self.oauth_token)
return req.to_header()['Authorization'].encode('utf-8')
def start(self):
""" Start listening to Streaming endpoint.
Handle exceptions according to Twitter's recommendations.
"""
backoff_network_error = 0.25
backoff_http_error = 5
backoff_rate_limit = 60
while True:
self.setup_connection()
try:
self.conn.perform()
except:
# Network error, use linear back off up to 16 seconds
print 'Network error: %s' % self.conn.errstr()
print 'Waiting %s seconds before trying again' % backoff_network_error
time.sleep(backoff_network_error)
backoff_network_error = min(backoff_network_error + 1, 16)
continue
# HTTP Error
sc = self.conn.getinfo(pycurl.HTTP_CODE)
if sc == 420:
# Rate limit, use exponential back off starting with 1 minute and double each attempt
print 'Rate limit, waiting %s seconds' % backoff_rate_limit
time.sleep(backoff_rate_limit)
backoff_rate_limit *= 2
else:
# HTTP error, use exponential back off up to 320 seconds
print 'HTTP error %s, %s' % (sc, self.conn.errstr())
print 'Waiting %s seconds' % backoff_http_error
time.sleep(backoff_http_error)
backoff_http_error = min(backoff_http_error * 2, 320)
def handle_tweet(self, data):
""" This method is called when data is received through Streaming endpoint.
"""
self.buffer += data
if data.endswith('\r\n') and self.buffer.strip():
# complete message received
message = json.loads(self.buffer)
self.buffer = ''
msg = ''
if message.get('limit'):
print 'Rate limiting caused us to miss %s tweets' % (message['limit'].get('track'))
elif message.get('disconnect'):
raise Exception('Got disconnect: %s' % message['disconnect'].get('reason'))
elif message.get('warning'):
print 'Got warning: %s' % message['warning'].get('message')
else:
print 'Got tweet with text: %s' % message.get('text')
if __name__ == '__main__':
ts = TwitterStream()
ts.setup_connection()
ts.start()
please help me to resolve the issue with code