I have a python script to get all the followers and friends on Twitter. I use supervisor to manage the process of the script. One thing I notice is that supervisor will restart the script as it sleeps to wait for the rate limit of Twitter to clear up. How do I stop that?
This is my script.
#!/usr/bin/env python
import pymongo
import tweepy
from pymongo import MongoClient
from sweepy.get_config import get_config
config = get_config()
consumer_key = config.get('PROCESS_TWITTER_CONSUMER_KEY')
consumer_secret = config.get('PROCESS_TWITTER_CONSUMER_SECRET')
access_token = config.get('PROCESS_TWITTER_ACCESS_TOKEN')
access_token_secret = config.get('PROCESS_TWITTER_ACCESS_TOKEN_SECRET')
MONGO_URL = config.get('MONGO_URL')
MONGO_PORT = config.get('MONGO_PORT')
MONGO_USERNAME = config.get('MONGO_USERNAME')
MONGO_PASSWORD = config.get('MONGO_PASSWORD')
client = MongoClient(MONGO_URL, int(MONGO_PORT))
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, retry_count=3)
db = client.tweets
db.authenticate(MONGO_USERNAME, MONGO_PASSWORD)
raw_tweets = db.raw_tweets
users = db.users
def is_user_in_db(user_id):
return get_user_from_db(user_id) is None
def get_user_from_db(user_id):
return users.find_one({'user.id' : user_id})
def get_user_from_twitter(user_id):
return api.get_user(user_id)
def get_followers(user_id):
users = []
for i, page in enumerate(tweepy.Cursor(api.followers, id=user_id, count=200).pages()):
print 'Getting page {} for followers'.format(i)
users += page
return users
def get_friends(user_id):
users = []
for i, page in enumerate(tweepy.Cursor(api.friends, id=user_id, count=200).pages()):
print 'Getting page {} for friends'.format(i)
users += page
return users
def get_followers_ids(user_id):
ids = []
for i, page in enumerate(tweepy.Cursor(api.followers_ids, id=user_id, count=5000).pages()):
print 'Getting page {} for followers ids'.format(i)
ids += page
return ids
def get_friends_ids(user_id):
ids = []
for i, page in enumerate(tweepy.Cursor(api.friends_ids, id=user_id, count=5000).pages()):
print 'Getting page {} for friends ids'.format(i)
ids += page
return ids
def process_user(user):
user_id = user['id']
screen_name = user['screen_name']
print 'Processing user : {}'.format(screen_name)
the_user = get_user_from_db(user_id)
if the_user is None:
user['followers_ids'] = get_followers_ids(screen_name)
user['friends_ids'] = get_friends_ids(screen_name)
users.insert_one(user)
if __name__ == "__main__":
for doc in raw_tweets.find({'processed' : {'$exists': False}}):
print 'Start processing'
try:
process_user(doc['user'])
except KeyError:
pass
try:
process_user(doc['retweeted_status']['user'])
except KeyError:
pass
raw_tweets.update_one({'_id': doc['_id']}, {'$set':{'processed':True}})
When rate limit is hit Tweepy will sleep in and wait for it. I would get this message.
Rate limit reached. Sleeping for: 896
However, supervisor somehow restart the script and run it again so the script will never finish. How do I stop that?
This is my supervisor configuration.
[program:twitter_processer]
command=/usr/bin/python -u /home/ubuntu/processer.py
directory=/home/ubuntu
autostart=true
autorestart=true
startretries=3
stderr_logfile=/home/ubuntu/processer.err.log
stdout_logfile=/home/ubuntu/processer.out.log
user=ubuntu
Related
I am working on the finishing touches on my tweepy twitter bot for my LED. I have been testing it with different accounts, and something weird is going on. With my personal account I tweet and it does nothing. My filter doesn't detect the hashtags and it just ignores everything. My friends have been able to use it, so I decided to log onto a separate twitter account. I can tweet there and my bot sees it and recognizes it.
I really have no clue what could cause this. Is it a problem with my bot, or is it with the accounts?
import tweepy
import requests
import json
consumer_key = 'nein'
consumer_secret = 'das'
access_token = 'ist'
access_token_secret = 'böse'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)
user = api.me()
print(user.name)
counter = 0
class LEDStreamListener(tweepy.StreamListener):
def on_data(self, raw_data):
with open("tweets.json", "w") as write_file:
write_file.write(raw_data)
data = json.loads(raw_data)
variable_checker(data)
def on_error(self, status_code):
if status_code == 420:
print('Stream disconnect, because of rate limit error')
return False
else:
print('Unknown Error ' + status_code)
return False
def retweet_tweet(tweet_id):
api.retweet(tweet_id)
api.create_favorite(tweet_id)
def tag_checker(tag_list):
for i in range(0, len(tag_list)):
iterated_tag = tag_list[i]['text']
if iterated_tag == 'HUNTER_LED_ON':
return iterated_tag
elif iterated_tag == 'HUNTER_LED_OFF':
return iterated_tag
elif iterated_tag == 'led_test':
return iterated_tag
return ' '
def variable_checker(json_file):
if 'delete' in json_file:
# If the tweet was deleted do nothing
print('Delete')
else:
usr = json_file['user']['screen_name']
tweet_id = json_file['id_str']
print(tweet_id)
text = json_file['text']
tag_list = json_file['entities']['hashtags']
tag = tag_checker(tag_list)
data_check(usr, tweet_id, tag, text)
def data_check(twitter_user, tweet, tag, text):
if tag == 'HUNTER_LED_OFF' and not text.startswith('RT'):
requests.get('http://192.168.1.175/off')
retweet_tweet(tweet)
api.update_status('I turned the led off for you', tweet)
print('off')
return
elif tag == 'HUNTER_LED_ON' and not text.startswith('RT'):
requests.get('http://192.168.1.175/on')
retweet_tweet(tweet)
api.update_status('I turned the led on for you', tweet)
print('on')
return
elif tag == 'led_test' and not text.startswith('RT'):
retweet_tweet(tweet)
reply = 'Nice test bro *highfives* keep up the good work'
api.update_status('#%s %s' % (twitter_user, reply), in_reply_to_status_id=tweet)
print('tested')
return
elif twitter_user == 'realDonaldTrump':
api.create_favorite(tweet)
requests.get('http://192.168.1.175/trump')
print('Make America Great Again!')
return
else:
return
ledStreamListener = LEDStreamListener()
ledStream = tweepy.Stream(auth=api.auth, listener=ledStreamListener)
ledStream.filter(track=['#HUNTER_LED_OFF', '#HUNTER_LED_ON', '#led_test'])
It could have something to do with Twitter only making available about 1% of tweets for streaming to most users. Twitter does offer a "Firehose" account that gives you everything but it's rather expensive.
You could poll your users timeline every few seconds rather than using the streaming API.
new_tweets = api.user_timeline(user_id = user_id,count= 1)
I am a beginner writing a small twitter tool for scheduled tweets and automatic retweets in python/flask.
I got stuck with issues of processes running in the background.
I want scheduled tweets and retweets to work simultaneously in the background for a given user.
I want to be able to terminate these background processes running retweets/scheduled tweets separately from each other.
How would you change the code below to achieve this?
If you look at the code below now, it works, but user can not run scheduled tweets and retweets simultaneously. Also if user decides to terminate one of the processes, let us say retweets the other process terminates as well (scheduled tweets) and vice versa.
I thought about putting the identification data for a given process into a database and recalling this identification data from the database when there is a need to terminate it, instead of using cookies session, but I do not know how to implement this idea in code.
import ........
mysql = MySQL()
app = Flask(__name__)
app.secret_key = 'xxx'
app.config['MYSQL_DATABASE_USER'] = 'xxx'
app.config['MYSQL_DATABASE_PASSWORD'] = 'xxx'
app.config['MYSQL_DATABASE_DB'] = 'xxx'
app.config['MYSQL_DATABASE_HOST'] = '0.0.0.0'
mysql.init_app(app)
#app.route('/showSignin')
def showSignin():
if session.get('user'):
return redirect('/userHome')
else:
return render_template('signin.html')
#app.route('/showscheduletweets')
def showscheduletweets():
if session.get('user'):
return render_template('scheduletweets.html')
else:
return render_template('signin.html')
#app.route('/validateLogin',methods=['POST'])
def validateLogin():
try:
_username = request.form['inputEmail']
_password = request.form['inputPassword']
# connect to mysql
con = mysql.connect()
cursor = con.cursor()
cursor.callproc('sp_validateLogin',(_username,))
data = cursor.fetchall()
if len(data) > 0:
if check_password_hash(str(data[0][3]),_password):
session['user'] = data[0][0]
consumerkey = data [0][4]
consumersecret = data [0][5]
accesstoken = data [0][6]
tokensecret = data [0][7]
twitter = Twython(consumerkey, consumersecret, accesstoken, tokensecret)
twitter.update_status(status="xxx says hello.")
return render_template('userHome.html')
else:
return render_template('error.html',error = 'Wrong Email address or Password.')
else:
return render_template('error.html',error = 'Wrong Email address or Password.')
except Exception as e:
return render_template('error.html',error = str(e))
finally:
cursor.close()
con.close()
#schedule tweets
#app.route('/scheduletweets',methods=['POST'])
def scheduletweets():
if session.get('user'):
_username = request.form['inputEmail']
con = mysql.connect()
cursor = con.cursor()
cursor.callproc('sp_GetTwitter', (_username,))
data = cursor.fetchall()
session['user'] = data[0][0]
consumerkey = data [0][4]
consumersecret = data [0][5]
accesstoken = data [0][6]
tokensecret = data [0][7]
twitter = Twython(consumerkey, consumersecret, accesstoken, tokensecret)
tweet1 = request.form['inputTweet1']
tweet2 = request.form['inputTweet2']
tweet3 = request.form['inputTweet3']
tweet4 = request.form['inputTweet4']
tweet5 = request.form['inputTweet5']
tweet6 = request.form['inputTweet6']
Hash1 = request.form['inputHash1']
Hash2 = request.form['inputHash2']
Hash3 = request.form['inputHash3']
Hash4 = request.form['inputHash4']
fruits = [Hash1, Hash2, Hash3, Hash4]
list = [tweet1, tweet2, tweet3, tweet4, tweet5, tweet6]
def workit():
while True:
try:
if len(list) > 0:
z = random.randint(1, len(fruits))
a = random.sample(fruits, z)
b=" ".join(str(x) for x in a)
toTweet = list[random.randint(0,len(list))-1] + " " + b
twitter.update_status(status=toTweet)
time.sleep(10)
else:
twitter.update_status(status="Oh dear... I'm afraid I'm rather empty =(")
break
except TwythonError as e:
print (e)
if 'work_process' not in session:
process = Process(target=workit)
process.start()
pid = process.pid
parent_pid = psutil.Process(process.pid).parent().pid
session['work_process'] = (parent_pid, pid)
return redirect('/showscheduletweets')
#retweets
#app.route('/retweet',methods=['POST'])
def retweet():
if session.get('user'):
_username = request.form['inputEmail']
con = mysql.connect()
cursor = con.cursor()
cursor.callproc('sp_GetTwitter', (_username,))
data = cursor.fetchall()
session['user'] = data[0][0]
consumerkey = data [0][4]
consumersecret = data [0][5]
accesstoken = data [0][6]
tokensecret = data [0][7]
Retweet1 = request.form['inputRetweet1']
Retweet2 = request.form['inputRetweet2']
Retweet3 = request.form['inputRetweet3']
Retweet4 = request.form['inputRetweet4']
Exclude1 = request.form['inputExclude1']
Exclude2 = request.form['inputExclude2']
def work():
twitter = Twython(consumerkey, consumersecret, accesstoken, tokensecret)
naughty_words = [Exclude1, Exclude2]
good_words = [Retweet1, Retweet2, Retweet3, Retweet4]
filter = " OR ".join(good_words)
blacklist = " -".join(naughty_words)
keywords = filter +" -"+ blacklist
print(keywords)
while True:
search_results = twitter.search(q=keywords, count=10)
try:
for tweet in search_results["statuses"]:
try:
twitter.retweet(id = tweet["id_str"])
time.sleep(60)
except TwythonError as e:
print (e)
except TwythonError as e:
print (e)
if 'work_process' not in session:
process = Process(target=work)
process.start()
pid = process.pid
parent_pid = psutil.Process(process.pid).parent().pid
session['work_process'] = (parent_pid, pid)
return redirect('/showretweet')
#terminating scheduled tweets and retweets
#app.route('/stoptweet', methods=['POST'])
def stoptweet():
if 'work_process' in session:
parent_pid, pid = session['work_process']
try:
process = psutil.Process(pid)
if process.parent().pid == parent_pid:
process.terminate()
except psutil.NoSuchProcess:
pass
session.pop('work_process')
return render_template('index.html')
else:
return render_template('index.html')
if __name__ == '__main__':
app.run(host=os.getenv('IP', '0.0.0.0'),port=int(os.getenv('PORT', xxx)))
You might want to use celery python module, and move schedule tweet and retweet as background works.
For further info, see doc: http://flask.pocoo.org/docs/0.11/patterns/celery/
You will decorate those functions related to celery, rather than flask.
As example:
In your script:
import my_schedule_module
and then in my_schedule_module.py:
from celery import Celery, Task
from celery.result import AsyncResult
from celery.task.base import periodic_task
import sqlite3 # Here I use sqlite, can be sql
import redis # Here I am using redis, you can use another db as well > check documentation
from datetime import timedelta # used to schedule your background jobs, see in configuration below
app_schedule = Celery('my_schedule_module')
'''
Celery Configuration
'''
# a mockup configuration of your background jobs, as example use retweet each 60s
app_schedule.conf.update(
CELERY_ACCEPT_CONTENT = ['application/json'],
CELERY_TASK_SERIALIZER='json',
# CELERY_ACCEPT_CONTENT=['json'], # Ignore other content
CELERY_RESULT_SERIALIZER='json',
# CELERY_TIMEZONE='Europe/Oslo',
# CELERY_ENABLE_UTC=True,
CELERYD_TASK_TIME_LIMIT = 600,
CELERYD_TASK_SOFT_TIME_LIMIT = 600,
CELERYD_MAX_TASKS_PER_CHILD = 1000,
CELERYD_OPTS="--time-limit=600 --concurrency=4",
BROKER_URL = 'redis://localhost:6379/0',
CELERY_RESULT_BACKEND = 'redis://localhost',
CELERYBEAT_SCHEDULE = {
'add-every-60-seconds': {
'task': 'my_schedule_module.retweet',
'schedule': timedelta(seconds=60)
},
}
)
#app_schedule.task()
def retweet(tweet):
# your tweet function
#app_schedule.task()
def scheduletweets():
# your background job
# pseudo code
tweets = get_tweets()
process_tweet_list = []
for tweet in tweets:
process_tweet_list.append( retweet.s(tweet) )
job = group(process_tweet_list) #group is celery.group, see documentation
result = job.apply_async() # process job list async
print 'result', result.ready(), result.successful()
You can also use callback functions - as example, you might want to update datetime in your db of when your tweet was retweeted.
In this case, you would have a syntax like:
result = my_schedule_module.retweet.apply_async( (tweet,) , link=my_schedule_module.callback_to_store_results_of_retweet.s())
The below code has worked perfectly for the past 6 months for retrieving Followers of some Twitter Accounts. Suddenly this morning, the code has stopped to work returning 'Error 401 : Unauthorized'.
I checked my App on dev.twitter.com, it is still valid. I updated Tweepy. No idea why this is not working anymore.
Code Fails on 'Cursor.next' line.
import tweepy
import mysql.connector
import time
consumer_key = 'secretkey'
consumer_secret = 'secretsecret'
access_token = 'secrettoken'
access_token_secret = 'secrettokensecret'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
for twit_name in twit_name_array:
api = tweepy.API(auth)
t0= time.clock()
data = api.rate_limit_status()
user_followers_remaining = data['resources']['followers']['/followers/ids']['remaining']
print(user_followers_remaining)
id_i = twit_name[1]
countpage = 0
countx = 0
def limit_handled(cursor):
while True:
data = api.rate_limit_status()
user_followers_remaining = data['resources']['followers']['/followers/ids']['remaining']
if user_followers_remaining>0:
try:
yield cursor.next()
except BaseException as e:
print('failed_on_CURSOR_NEXT', str(e))
global api
api = tweepy.API(auth)
try:
yield cursor.next()
except BaseException as e:
print('failed_on_CURSOR_NEXT_2', str(e))
break
else:
for min_remain in range(-3, 0):
print('##### TIMEOUT ##### ----- Out of queries, waiting ' + str(min_remain*5) + 'min')
time.sleep(5*60)
As mentioned by #advance512, I had to login again to solve this issue. The following piece of code did the trick :
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
def limit_handled(cursor):
while True:
try:
yield cursor.next()
except BaseException as e:
print('failed_on_CURSOR_NEXT', str(e))
time.sleep(5)
global api
api = tweepy.API(auth)
yield cursor.next()
for followers in limit_handled(tweepy.Cursor(api.followers_ids, id = id_i).pages()):
for fll in followers:
process_follower(fll)
I'm playing around the Twitter API and am in the process of developing a script to pull all Tweets with a certain hashtag down to a local mongoDB. I have it working fine when I'm downloading tweets from users, but when downloading tweets from a hashtag I get:
return loads(fp.read(),
AttributeError: 'int' object has no attribute 'read'
Can anyone offer their infinite wisdom into how I could get this script to work?
To run, save it as a .py file, cd to the folder and run:
python twitter.py
Code:
__author__ = 'Tom Cusack'
import pymongo
import oauth2 as oauth
import urllib2, json
import sys, argparse, time
def oauth_header(url, consumer, token):
params = {'oauth_version': '1.0',
'oauth_nonce': oauth.generate_nonce(),
'oauth_timestamp': int(time.time()),
}
req = oauth.Request(method = 'GET',url = url, parameters = params)
req.sign_request(oauth.SignatureMethod_HMAC_SHA1(),consumer, token)
return req.to_header()['Authorization'].encode('utf-8')
def main():
### Twitter Settings
numtweets = '32000'
verbose = 'store_true'
retweet = 'store_false'
CONSUMER_KEY = 'M7Xu9Wte0eIZvqhb4G9HnIn3G'
CONSUMER_SECRET = 'c8hB4Qwps2aODQUx7UsyzQuCRifEp3PKu6hPQll8wnJGIhbKgZ'
ACCESS_TOKEN = '3213221313-APuXuNjVMbRbZpu6sVbETbgqkponGsZJVT53QmG'
ACCESS_SECRET = 'BJHrqWC9ed3pA5oDstSMCYcUcz2pYF3DmJ7jcuDe7yxvi'
base_url = url = 'https://api.twitter.com/1.1/search/tweets.json?include_entities=true&count=200&q=#mongodb&include_rts=%s' % (retweet)
oauth_consumer = oauth.Consumer(key = CONSUMER_KEY, secret = CONSUMER_SECRET)
oauth_token = oauth.Token(key = ACCESS_TOKEN, secret = ACCESS_SECRET)
### Mongodb Settings
uri = 'mongodb://127.0.0.1:27017/SARKY'
if uri != None:
try:
conn = pymongo.MongoClient(uri)
print 'Pulling Tweets..'
except:
print 'Error: Unable to connect to DB. Check uri variable.'
return
uri_parts = pymongo.uri_parser.parse_uri(uri)
db = conn[uri_parts['database']]
db['twitter-harvest'].ensure_index('id_str')
### Helper Variables for Harvest
max_id = -1
tweet_count = 0
stream = 0
### Begin Harvesting
while True:
auth = oauth_header(url, oauth_consumer, oauth_token)
headers = {"Authorization": auth}
request = urllib2.Request(url, headers = headers)
try:
stream = urllib2.urlopen(request)
except urllib2.HTTPError, err:
if err.code == 404:
print 'Error: Unknown user. Check --user arg'
return
if err.code == 401:
print 'Error: Unauthorized. Check Twitter credentials'
return
tweet_list = json.load(stream)
if len(tweet_list) == 0:
print 'No tweets to harvest!'
return
if 'errors' in tweet_list:
print 'Hit rate limit, code: %s, message: %s' % (tweets['errors']['code'], tweets['errors']['message'])
return
if max_id == -1:
tweets = tweet_list
else:
tweets = tweet_list[1:]
if len(tweets) == 0:
print 'Finished Harvest!'
return
for tweet in tweets:
max_id = id_str = tweet['id_str']
try:
if tweet_count == numtweets:
print 'Finished Harvest- hit numtweets!'
return
if uri != None:
db[user].update({'id_str':id_str},tweet,upsert = True)
else:
print tweet['text']
tweet_count+=1
if verbose == True and uri != None:
print tweet['text']
except Exception, err:
print 'Unexpected error encountered: %s' %(err)
return
url = base_url + '&max_id=' + max_id
if __name__ == '__main__':
try:
main()
except SystemExit as e:
if e.code == 0:
pass
You initially set stream = 0. When your try...except block catches a HTTP response with a code that isn't 404 or 401, stream is still equal to 0, but your except block doesn't break out of the function.
I'd look more closely at what this response says.
I have made an API on a remote server using bottle. When I start hosting, and then try and access the API using browser, the request gets no response. In fact, the request is not reaching the server itself. However, on using the same with wget, I am getting the required response. Here is the code:
from pymongo import MongoClient
import json
from twython import Twython
from bottle import route, request, response, run
client = MongoClient()
db = client.PWSocial
tweets = db.tweets
follower_count = db.follower_count
APP_KEY = 'XXXX'
APP_SECRET = 'XXXX'
OAUTH_TOKEN = 'XXXX'
OAUTH_TOKEN_SECRET = 'XXXX'
twitter = Twython(APP_KEY, APP_SECRET,OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
id_list = [57947109, 183093247, 89443197, 431336956]
#route('/')
def hello():
dict1 = {'me': 'hello'}
result = json.dumps(dict1)
return result
#route('/api/follower')
def disp_follower():
ac_id = request.query.id
fc = list(follower_count.find({'id': ac_id}))
mydict = fc[0]
del mydict['_id']
result = json.dumps(mydict)
return result
#route('/api/retweet')
def rt():
ac_id = request.query.id
retweets = db.retweets
rt = list(retweets.find({'usr_id': ac_id}))
result = json.dumps(rt)
return result
#route('/api/favorite')
def fav():
ac_id = request.query.id
retweets = db.retweets
rt = list(retweets.find({'id': ac_id}, sort=[('rt_count',-1)], limit= 100))
mydict = {}
for i in rt:
a = i.get('id')
status = twitter.show_status(id = a)
b = status.get('favorite_count')
mydict[a] = b
result = json.dumps(mydict)
return result
#route('/api/max_rt')
def most_rt():
ac_id = request.query.id
retweets = db.retweets
rt = list(retweets.find({'usr_id': ac_id}, {'usr_id':57947109},sort=[('rt_co
result = json.dumps(rt)
return result
run(host= '180.179.212.200', debug=True)
Can anyone suggest a reason and a solution for this?
Found the answer, its actually a simpple firewall issue. A hole has to be created in the firewall for any such API to run.