Twitter, multiple processes and database - python

I am a beginner writing a small Twitter tool for scheduled tweets and automatic retweets in Python/Flask.
I am stuck on managing the processes that run in the background.
I want scheduled tweets and retweets to run simultaneously in the background for a given user.
I also want to be able to terminate the background process running retweets separately from the one running scheduled tweets.
How would you change the code below to achieve this?
The code below works, but a user cannot run scheduled tweets and retweets simultaneously. Also, if the user decides to terminate one of the processes, say retweets, the other process (scheduled tweets) terminates as well, and vice versa.
I thought about putting the identification data for a given process into a database and recalling it from the database when the process needs to be terminated, instead of using the cookie session, but I do not know how to implement this idea in code.
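Something like this rough, untested sketch is what I have in mind (the user_processes table and its columns are placeholders I made up, keyed by user and job type so each job can be stopped on its own):
def save_process(user_id, job_type, pid):
    # job_type would be 'tweets' or 'retweets', so each kind is tracked separately
    con = mysql.connect()
    cursor = con.cursor()
    cursor.execute("REPLACE INTO user_processes (user_id, job_type, pid) VALUES (%s, %s, %s)",
                   (user_id, job_type, pid))
    con.commit()
    con.close()

def stop_process(user_id, job_type):
    # look up and terminate only the requested job for this user
    con = mysql.connect()
    cursor = con.cursor()
    cursor.execute("SELECT pid FROM user_processes WHERE user_id = %s AND job_type = %s",
                   (user_id, job_type))
    row = cursor.fetchone()
    if row:
        try:
            psutil.Process(row[0]).terminate()
        except psutil.NoSuchProcess:
            pass
        cursor.execute("DELETE FROM user_processes WHERE user_id = %s AND job_type = %s",
                       (user_id, job_type))
        con.commit()
    con.close()
Here is the code as it currently stands: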
import ........
mysql = MySQL()
app = Flask(__name__)
app.secret_key = 'xxx'
app.config['MYSQL_DATABASE_USER'] = 'xxx'
app.config['MYSQL_DATABASE_PASSWORD'] = 'xxx'
app.config['MYSQL_DATABASE_DB'] = 'xxx'
app.config['MYSQL_DATABASE_HOST'] = '0.0.0.0'
mysql.init_app(app)
@app.route('/showSignin')
def showSignin():
    if session.get('user'):
        return redirect('/userHome')
    else:
        return render_template('signin.html')

@app.route('/showscheduletweets')
def showscheduletweets():
    if session.get('user'):
        return render_template('scheduletweets.html')
    else:
        return render_template('signin.html')
@app.route('/validateLogin', methods=['POST'])
def validateLogin():
    try:
        _username = request.form['inputEmail']
        _password = request.form['inputPassword']
        # connect to mysql
        con = mysql.connect()
        cursor = con.cursor()
        cursor.callproc('sp_validateLogin', (_username,))
        data = cursor.fetchall()
        if len(data) > 0:
            if check_password_hash(str(data[0][3]), _password):
                session['user'] = data[0][0]
                consumerkey = data[0][4]
                consumersecret = data[0][5]
                accesstoken = data[0][6]
                tokensecret = data[0][7]
                twitter = Twython(consumerkey, consumersecret, accesstoken, tokensecret)
                twitter.update_status(status="xxx says hello.")
                return render_template('userHome.html')
            else:
                return render_template('error.html', error='Wrong Email address or Password.')
        else:
            return render_template('error.html', error='Wrong Email address or Password.')
    except Exception as e:
        return render_template('error.html', error=str(e))
    finally:
        cursor.close()
        con.close()
# schedule tweets
@app.route('/scheduletweets', methods=['POST'])
def scheduletweets():
    if session.get('user'):
        _username = request.form['inputEmail']
        con = mysql.connect()
        cursor = con.cursor()
        cursor.callproc('sp_GetTwitter', (_username,))
        data = cursor.fetchall()
        session['user'] = data[0][0]
        consumerkey = data[0][4]
        consumersecret = data[0][5]
        accesstoken = data[0][6]
        tokensecret = data[0][7]
        twitter = Twython(consumerkey, consumersecret, accesstoken, tokensecret)
        tweet1 = request.form['inputTweet1']
        tweet2 = request.form['inputTweet2']
        tweet3 = request.form['inputTweet3']
        tweet4 = request.form['inputTweet4']
        tweet5 = request.form['inputTweet5']
        tweet6 = request.form['inputTweet6']
        Hash1 = request.form['inputHash1']
        Hash2 = request.form['inputHash2']
        Hash3 = request.form['inputHash3']
        Hash4 = request.form['inputHash4']
        fruits = [Hash1, Hash2, Hash3, Hash4]
        list = [tweet1, tweet2, tweet3, tweet4, tweet5, tweet6]

        def workit():
            while True:
                try:
                    if len(list) > 0:
                        z = random.randint(1, len(fruits))
                        a = random.sample(fruits, z)
                        b = " ".join(str(x) for x in a)
                        toTweet = list[random.randint(0, len(list)) - 1] + " " + b
                        twitter.update_status(status=toTweet)
                        time.sleep(10)
                    else:
                        twitter.update_status(status="Oh dear... I'm afraid I'm rather empty =(")
                        break
                except TwythonError as e:
                    print(e)

        if 'work_process' not in session:
            process = Process(target=workit)
            process.start()
            pid = process.pid
            parent_pid = psutil.Process(process.pid).parent().pid
            session['work_process'] = (parent_pid, pid)
        return redirect('/showscheduletweets')
# retweets
@app.route('/retweet', methods=['POST'])
def retweet():
    if session.get('user'):
        _username = request.form['inputEmail']
        con = mysql.connect()
        cursor = con.cursor()
        cursor.callproc('sp_GetTwitter', (_username,))
        data = cursor.fetchall()
        session['user'] = data[0][0]
        consumerkey = data[0][4]
        consumersecret = data[0][5]
        accesstoken = data[0][6]
        tokensecret = data[0][7]
        Retweet1 = request.form['inputRetweet1']
        Retweet2 = request.form['inputRetweet2']
        Retweet3 = request.form['inputRetweet3']
        Retweet4 = request.form['inputRetweet4']
        Exclude1 = request.form['inputExclude1']
        Exclude2 = request.form['inputExclude2']

        def work():
            twitter = Twython(consumerkey, consumersecret, accesstoken, tokensecret)
            naughty_words = [Exclude1, Exclude2]
            good_words = [Retweet1, Retweet2, Retweet3, Retweet4]
            filter = " OR ".join(good_words)
            blacklist = " -".join(naughty_words)
            keywords = filter + " -" + blacklist
            print(keywords)
            while True:
                search_results = twitter.search(q=keywords, count=10)
                try:
                    for tweet in search_results["statuses"]:
                        try:
                            twitter.retweet(id=tweet["id_str"])
                            time.sleep(60)
                        except TwythonError as e:
                            print(e)
                except TwythonError as e:
                    print(e)

        if 'work_process' not in session:
            process = Process(target=work)
            process.start()
            pid = process.pid
            parent_pid = psutil.Process(process.pid).parent().pid
            session['work_process'] = (parent_pid, pid)
        return redirect('/showretweet')
# terminating scheduled tweets and retweets
@app.route('/stoptweet', methods=['POST'])
def stoptweet():
    if 'work_process' in session:
        parent_pid, pid = session['work_process']
        try:
            process = psutil.Process(pid)
            if process.parent().pid == parent_pid:
                process.terminate()
        except psutil.NoSuchProcess:
            pass
        session.pop('work_process')
        return render_template('index.html')
    else:
        return render_template('index.html')

if __name__ == '__main__':
    app.run(host=os.getenv('IP', '0.0.0.0'), port=int(os.getenv('PORT', xxx)))

You might want to use the celery Python module and move the scheduled tweets and retweets into background jobs.
For further info, see the doc: http://flask.pocoo.org/docs/0.11/patterns/celery/
You will decorate those functions with celery, rather than Flask.
As an example, in your script:
import my_schedule_module
and then in my_schedule_module.py:
from celery import Celery, Task, group
from celery.result import AsyncResult
from celery.task.base import periodic_task
import sqlite3  # Here I use sqlite, can be sql
import redis  # Here I am using redis, you can use another db as well > check documentation
from datetime import timedelta  # used to schedule your background jobs, see the configuration below

app_schedule = Celery('my_schedule_module')

'''
Celery Configuration
'''
# a mockup configuration of your background jobs, as an example retweet every 60s
app_schedule.conf.update(
    CELERY_ACCEPT_CONTENT=['application/json'],
    CELERY_TASK_SERIALIZER='json',
    # CELERY_ACCEPT_CONTENT=['json'],  # Ignore other content
    CELERY_RESULT_SERIALIZER='json',
    # CELERY_TIMEZONE='Europe/Oslo',
    # CELERY_ENABLE_UTC=True,
    CELERYD_TASK_TIME_LIMIT=600,
    CELERYD_TASK_SOFT_TIME_LIMIT=600,
    CELERYD_MAX_TASKS_PER_CHILD=1000,
    CELERYD_OPTS="--time-limit=600 --concurrency=4",
    BROKER_URL='redis://localhost:6379/0',
    CELERY_RESULT_BACKEND='redis://localhost',
    CELERYBEAT_SCHEDULE={
        'add-every-60-seconds': {
            'task': 'my_schedule_module.retweet',
            'schedule': timedelta(seconds=60)
        },
    }
)

@app_schedule.task()
def retweet(tweet):
    # your tweet function
    pass

@app_schedule.task()
def scheduletweets():
    # your background job
    # pseudo code
    tweets = get_tweets()
    process_tweet_list = []
    for tweet in tweets:
        process_tweet_list.append(retweet.s(tweet))
    job = group(process_tweet_list)  # group is celery.group, see documentation
    result = job.apply_async()  # process the job list asynchronously
    print('result', result.ready(), result.successful())
You can also use callback functions. As an example, you might want to update the datetime in your db for when your tweet was retweeted.
In this case, you would have syntax like:
result = my_schedule_module.retweet.apply_async( (tweet,) , link=my_schedule_module.callback_to_store_results_of_retweet.s())
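A hypothetical sketch of such a callback task (the sqlite table, its columns, and the assumption that retweet() returns the tweet id are all illustrative, not from the original answer):
from datetime import datetime

@app_schedule.task()
def callback_to_store_results_of_retweet(result):
    # 'result' is the return value of retweet(tweet), assumed here to be the tweet id
    conn = sqlite3.connect('tweets.db')  # sqlite3 is imported above; the db file is an assumption
    conn.execute("UPDATE tweets SET retweeted_at = ? WHERE tweet_id = ?",
                 (datetime.utcnow().isoformat(), result))
    conn.commit()
    conn.close()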

Related

Python simple DNS resolver: memory leak

This code is a DNS resolver that checks a DB for an entry not older than 5 minutes.
#!/usr/bin/python3
from MySQLdb import _mysql as MySQL
from dnslib import RR, QTYPE, RCODE, A
from dnslib.label import DNSLabel
from dnslib.server import DNSServer, BaseResolver
from time import sleep, time

class MariaResolver(BaseResolver):
    DELTA = 300

    def __init__(self):
        self.password = "********************"

    def resolve(self, request, handler):
        reply = request.reply()
        qname = request.q.qname
        fqdn = str(request.q.qname)
        try:
            if fqdn.find("iut-") == -1:
                reply.header.rcode = RCODE.REFUSED
            else:
                hostname = fqdn.split(".")[0]
                timestamp = int(time()) - self.DELTA
                query = "SELECT ip FROM dns WHERE record='{}' AND timestamp>{}"
                db = MySQL.connect("localhost", "dns", self.password, "salles")
                db.query(query.format(hostname, timestamp))
                result = db.store_result()
                row = result.fetch_row(how=1)
                if row:
                    ip = row[0]["ip"].decode("utf-8")
                    reply.add_answer(RR(qname, QTYPE.A, ttl=0,
                                        rdata=A(ip)))
                else:
                    reply.header.rcode = RCODE.REFUSED
                db.close()
        except Exception as e:
            print(e)
            reply.header.rcode = RCODE.REFUSED
        return reply

if __name__ == '__main__':
    resolver = MariaResolver()
    udp_server = DNSServer(resolver, port=53)
    udp_server.start_thread()
    while udp_server.isAlive():
        sleep(0.1)
This code leaks over time and I do not understand why.
In the Proxmox screenshot, you can see the service restarted at the end.
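One thing visible in the code itself (an observation, not a confirmed diagnosis of the leak): resolve() opens a fresh MySQL connection on every query, and db.close() is skipped whenever an exception is raised after the connect. A try/finally sketch that guarantees the close:
db = MySQL.connect("localhost", "dns", self.password, "salles")
try:
    db.query(query.format(hostname, timestamp))
    result = db.store_result()
    row = result.fetch_row(how=1)
    # ... build the reply as above ...
finally:
    db.close()  # runs even if query/store_result raises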

How to start a background Thread within a Flask app?

I wrote a Flask app that is working fine, and I wanted a separate background thread to run in parallel with it, doing some stuff, while the app is running. The problem is that doing this doesn't spawn the thread at all. I know that my code is right, because using the exact same thread code in a simple Python script works as intended.
app.py
weatherCollectorThread = WeatherDataCollectorThread()
...
if __name__ == '__main__':
    try:
        print("Starting Weather Collector Thread...")
        weatherCollectorThread.start()
        print("Starting the WebApp...")
        app.run(debug=True)
    except KeyboardInterrupt:
        try:
            weatherCollectorThread.stop()
        except:
            pass
WeatherDataCollectorThread Class
class WeatherDataCollectorThread:
    def __init__(self):
        self.weatherStations = DBHelper.get_weather_stations()
        self.weatherApiKey = "REDACTED"
        self.baseURL = "SOME URL"
        self.isThreadRunning = False
        self.result_log = open('results.log', 'a+')

    def storeWeatherData(self, weather):
        conn = DBHelper.get_connection()
        cur = conn.cursor()
        cur.execute("INSERT INTO weather_data(city,country,now_unixtime,last_updated_unixtime,temperature,isDay,condition_text,condition_icon,windspeed,winddir,pressure,precipitation,cloud,humidity) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)", [weather['city'], weather['country'], weather['now_unixtime'], weather['last_updated_unixtime'], weather['temperature'], weather['isDay'], weather['condition_text'], weather['condition_icon'], weather['windspeed'], weather['winddir'], weather['pressure'], weather['precipitation'], weather['cloud'], weather['humidity']])
        conn.commit()
        conn.close()

    def collectWeatherData(self):
        self.isThreadRunning = True
        while self.isThreadRunning:
            for each_station in self.weatherStations:
                if each_station['isWorking'] != 1:
                    continue
                print("Sending request")
                params = {'q': each_station['location'], 'key': self.weatherApiKey}
                resp = requests.get(url=self.baseURL, params=params)
                print("Request received")
                weatherData = json.loads(resp.text)
                location = weatherData['location']
                current = weatherData['current']
                weather = {}
                weather['city'] = location['name']
                weather['country'] = location['country']
                weather['now_unixtime'] = location['localtime_epoch']
                weather['last_updated_unixtime'] = current['last_updated_epoch']
                weather['temperature'] = current['temp_c']
                weather['isDay'] = current['is_day']
                weather['condition_text'] = current['condition']['text']
                weather['condition_icon'] = current['condition']['icon']
                weather['windspeed'] = current['wind_kph']
                weather['winddir'] = current['wind_dir']
                weather['pressure'] = current['pressure_mb']
                weather['precipitation'] = current['precip_mm']
                weather['cloud'] = current['cloud']
                weather['humidity'] = current['humidity']
                self.storeWeatherData(weather)
                print("Data stored\n" + '-'*24)
                self.result_log.write(resp.text + '\n')
            sleep(60)

    def start(self):
        self.thread = Thread(target=self.collectWeatherData)
        self.thread.start()

    def join_instrument(self, session):
        conn = DBHelper.get_connection()
        cur = conn.cursor()
        cur.execute("UPDATE weather_stations SET isWorking=1 WHERE weatherStationID=?", [session['weatherStationID']])
        conn.commit()
        conn.close()

    def detach_instrument(self, session):
        conn = DBHelper.get_connection()
        cur = conn.cursor()
        cur.execute("UPDATE weather_stations SET isWorking=0 WHERE weatherStationID=?", [session['weatherStationID']])
        conn.commit()
        conn.close()

    def stop(self):
        self.result_log.close()
        self.isThreadRunning = False
So I figured out the solution.
You see, when you use flask run to run your web app, it ignores every function call in the script, only parses the decorators, and starts the app on its own. So if you do something like:
if __name__ == '__main__':
    app.run()
    someOtherFunction()
neither the app.run() nor the someOtherFunction() would run.
So the solution?
Simply use python3 app.py to run the script.
... yes, it's that simple :|
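If you do want it to work under flask run as well, one common pattern (an addition of mine, not part of the original answer) is to start the thread at import time instead of inside the __main__ block:
# module level in app.py: runs on import, so it also works under `flask run`
weatherCollectorThread = WeatherDataCollectorThread()
weatherCollectorThread.start()
Note that in debug mode the reloader imports the module twice, so the thread may be started twice; guard against that if it matters.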

How to mock functionality of boto3 module using pytest

I have a custom module written called sqs.py. The script will do the following:
Get a message from AWS SQS
Get the AWS S3 path to delete
Delete the path
Send a confirmation email to the user
I'm trying to write unit tests for this module that verify the code executes as expected and that it raises exceptions when they occur.
This means I will need to mock the responses from the Boto3 calls that I make. My problem is that the code first establishes the SQS client to obtain the message, and then makes a second call to establish the S3 client. I'm not sure how to mock these two independent calls and fake a response for each so I can test my script's functionality. Perhaps my approach is incorrect. In any case, any advice on how to do this properly is appreciated.
Here's what the code looks like:
import boto3
import json
import os
import pprint
import time
import asyncio
import logging
from send_email import send_email

queue_url = 'https://xxxx.queue.amazonaws.com/1234567890/queue'

def shutdown(message):
    """ Sends shutdown command to OS """
    os.system(f'shutdown +5 "{message}"')

def send_failure_email(email_config: dict, error_message: str):
    """ Sends email notification to user with error message attached. """
    recipient_name = email_config['recipient_name']
    email_config['subject'] = 'Subject: Restore Failed'
    email_config['message'] = f'Hello {recipient_name},\n\n' \
        + 'We regret that an error has occurred during the restore process. ' \
        + 'Please try again in a few minutes.\n\n' \
        + f'Error: {error_message}.\n\n'
    try:
        send_email(email_config)
    except RuntimeError as error_message:
        logging.error(f'ERROR: cannot send email to user. {error_message}')
async def restore_s3_objects(s3_client: object, p_bucket_name: str, p_prefix: str):
    """Attempts to restore objects specified by p_bucket_name and p_prefix.
    Returns True if restore took place, False otherwise.
    """
    is_truncated = True
    key_marker = None
    key = ''
    number_of_items_restored = 0
    has_restore_occured = False
    logging.info(f'performing restore for {p_bucket_name}/{p_prefix}')
    try:
        while is_truncated == True:
            if not key_marker:
                version_list = s3_client.list_object_versions(
                    Bucket=p_bucket_name,
                    Prefix=p_prefix)
            else:
                version_list = s3_client.list_object_versions(
                    Bucket=p_bucket_name,
                    Prefix=p_prefix,
                    KeyMarker=key_marker)
            if 'DeleteMarkers' in version_list:
                logging.info('found delete markers')
                delete_markers = version_list['DeleteMarkers']
                for d in delete_markers:
                    if d['IsLatest'] == True:
                        key = d['Key']
                        version_id = d['VersionId']
                        s3_client.delete_object(
                            Bucket=p_bucket_name,
                            Key=key,
                            VersionId=version_id
                        )
                        number_of_items_restored = number_of_items_restored + 1
            is_truncated = version_list['IsTruncated']
            logging.info(f'is_truncated: {is_truncated}')
            if 'NextKeyMarker' in version_list:
                key_marker = version_list['NextKeyMarker']
        if number_of_items_restored > 0:
            has_restore_occured = True
        return has_restore_occured
    except Exception as error_message:
        raise RuntimeError(error_message)
async def main():
    if 'AWS_ACCESS_KEY_ID' in os.environ \
            and 'AWS_SECRET_ACCESS_KEY' in os.environ \
            and os.environ['AWS_ACCESS_KEY_ID'] != '' \
            and os.environ['AWS_SECRET_ACCESS_KEY'] != '':
        sqs_client = boto3.client(
            'sqs',
            aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
            aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
            verify=False
        )
        s3_client = boto3.client(
            's3',
            aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
            aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
            verify=False
        )
    else:
        sqs_client = boto3.client(
            'sqs',
            verify=False,
        )
        s3_client = boto3.client(
            's3',
            verify=False,
        )
    received_message = sqs_client.receive_message(
        QueueUrl=queue_url,
        AttributeNames=['All'],
        VisibilityTimeout=10,
        WaitTimeSeconds=20,  # Wait up to 20 seconds for a message to arrive
    )
    if 'Messages' in received_message \
            and len(received_message['Messages']) > 0:
        # NOTE: Initialize email configuration
        receipient_email = 'support@example.com'
        username = receipient_email.split('@')[0]
        fullname_length = len(username.split('.'))
        fullname = f"{username.split('.')[0]}"  # Group name / First name only
        if (fullname_length == 2):  # First name and last name available
            fullname = f"{username.split('.')[0]} {username.split('.')[1]}"
        fullname = fullname.title()
        email_config = {
            'destination': receipient_email,
            'recipient_name': fullname,
            'subject': 'Subject: Restore Complete',
            'message': ''
        }
        try:
            receipt_handle = received_message['Messages'][0]['ReceiptHandle']
        except Exception as error_message:
            logging.error(error_message)
            send_failure_email(email_config, error_message)
            shutdown(f'{error_message}')
        try:
            data = received_message['Messages'][0]['Body']
            data = json.loads(data)
            logging.info('A SQS message for a restore has been received.')
        except Exception as error_message:
            message = f'Unable to obtain and parse message body. {error_message}'
            logging.error(message)
            send_failure_email(email_config, message)
            shutdown(f'{error_message}')
        try:
            bucket = data['bucket']
            prefix = data['prefix']
        except Exception as error_message:
            message = f'Retrieving bucket name and prefix failed. {error_message}'
            logging.error(message)
            send_failure_email(email_config, message)
            shutdown(f'{error_message}')
        try:
            logging.info(f'Initiating restore for path: {bucket}/{prefix}')
            restore_was_performed = await asyncio.create_task(restore_s3_objects(s3_client, bucket, prefix))
            if restore_was_performed is True:
                email_config['message'] = f'Hello {fullname},\n\n' \
                    + f'The files in the path \'{bucket}/{prefix}\' have been restored. '
                send_email(email_config)
                logging.info('Restore complete. Shutting down.')
            else:
                logging.info('Path does not require restore. Shutting down.')
            shutdown('Restore successful! System will shutdown in 5 mins')
        except Exception as error_message:
            message = f'File restoration failed. {error_message}'
            logging.error(message)
            send_failure_email(email_config, message)
            shutdown(f'{error_message}')
        try:
            sqs_client.delete_message(
                QueueUrl=queue_url,
                ReceiptHandle=receipt_handle,
            )
        except Exception as error_message:
            message = f'Deleting restore session from SQS failed. {error_message}'
            logging.error(message)
            send_failure_email(email_config, message)
            shutdown(f'{error_message}')

if __name__ == '__main__':
    logging.basicConfig(filename='restore.log', level=logging.INFO)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    loop.close()
The only way I was able to mock Boto3 is by rebuilding a small class that represents the actual method structure. This is because Boto3 uses dynamic methods, and all the resource-level methods are created at runtime.
This might not be industry standard, but I wasn't able to get any of the methods I found on the internet to work most of the time, and this worked pretty well for me and requires minimal effort (compared to some of the solutions I found).
import asyncio
import os
import unittest
from unittest import mock

import boto3

from sqs import restore_s3_objects  # the module under test


class MockClient:
    def __init__(self, region_name, aws_access_key_id, aws_secret_access_key):
        self.region_name = region_name
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.MockS3 = MockS3()

    def client(self, service_name, **kwargs):
        return self.MockS3


class MockS3:
    def __init__(self):
        self.response = {'IsTruncated': False}  # Set your mock data from S3 here


    def list_object_versions(self, **kwargs):
        return self.response


class S3TestCase(unittest.TestCase):
    def test_restore_s3_objects(self):
        # Given
        bucket = "testBucket"  # Set this to something somewhat realistic
        prefix = "some/prefix"  # Set this to something somewhat realistic
        env_vars = mock.patch.dict(os.environ, {"AWS_ACCESS_KEY_ID": "abc",
                                                "AWS_SECRET_ACCESS_KEY": "def"})
        env_vars.start()
        # Initialising the Session can be tricky, since it has to be patched in
        # the module/file that creates the session in the actual code, rather
        # than where the Session code lives. In this case you might have to
        # import it from main rather than boto3.
        boto3.session.Session = mock.Mock(side_effect=[
            MockClient(region_name='eu-west-1',
                       aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                       aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'])])
        s3_client = boto3.client('s3', verify=False)
        # When
        # restore_s3_objects is async, so drive it to completion
        has_restore_occured = asyncio.run(restore_s3_objects(s3_client, bucket, prefix))
        # Then
        self.assertEqual(has_restore_occured, False)  # your expected result
        env_vars.stop()
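As an aside (a suggestion of mine, not part of the original answer): botocore ships a built-in Stubber that queues canned responses on a real client, which avoids hand-rolling mock classes. A minimal sketch, reusing restore_s3_objects from the question's sqs.py; the response shape here is only the minimum the function reads:
import asyncio
import boto3
from botocore.stub import Stubber
from sqs import restore_s3_objects

s3_client = boto3.client('s3', region_name='eu-west-1')
stubber = Stubber(s3_client)
# Queue one canned response; the expected params are validated on the call.
stubber.add_response('list_object_versions',
                     {'IsTruncated': False},
                     {'Bucket': 'testBucket', 'Prefix': 'some/prefix'})
with stubber:
    result = asyncio.run(restore_s3_objects(s3_client, 'testBucket', 'some/prefix'))
assert result is False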

Why supervisor restarts my script when it sleeps?

I have a Python script to get all the followers and friends on Twitter. I use supervisor to manage the process of the script. One thing I notice is that supervisor will restart the script while it sleeps to wait for the Twitter rate limit to clear. How do I stop that?
This is my script.
#!/usr/bin/env python
import pymongo
import tweepy
from pymongo import MongoClient
from sweepy.get_config import get_config

config = get_config()

consumer_key = config.get('PROCESS_TWITTER_CONSUMER_KEY')
consumer_secret = config.get('PROCESS_TWITTER_CONSUMER_SECRET')
access_token = config.get('PROCESS_TWITTER_ACCESS_TOKEN')
access_token_secret = config.get('PROCESS_TWITTER_ACCESS_TOKEN_SECRET')

MONGO_URL = config.get('MONGO_URL')
MONGO_PORT = config.get('MONGO_PORT')
MONGO_USERNAME = config.get('MONGO_USERNAME')
MONGO_PASSWORD = config.get('MONGO_PASSWORD')

client = MongoClient(MONGO_URL, int(MONGO_PORT))

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, retry_count=3)

db = client.tweets
db.authenticate(MONGO_USERNAME, MONGO_PASSWORD)

raw_tweets = db.raw_tweets
users = db.users

def is_user_in_db(user_id):
    return get_user_from_db(user_id) is None

def get_user_from_db(user_id):
    return users.find_one({'user.id': user_id})

def get_user_from_twitter(user_id):
    return api.get_user(user_id)

def get_followers(user_id):
    users = []
    for i, page in enumerate(tweepy.Cursor(api.followers, id=user_id, count=200).pages()):
        print 'Getting page {} for followers'.format(i)
        users += page
    return users

def get_friends(user_id):
    users = []
    for i, page in enumerate(tweepy.Cursor(api.friends, id=user_id, count=200).pages()):
        print 'Getting page {} for friends'.format(i)
        users += page
    return users

def get_followers_ids(user_id):
    ids = []
    for i, page in enumerate(tweepy.Cursor(api.followers_ids, id=user_id, count=5000).pages()):
        print 'Getting page {} for followers ids'.format(i)
        ids += page
    return ids

def get_friends_ids(user_id):
    ids = []
    for i, page in enumerate(tweepy.Cursor(api.friends_ids, id=user_id, count=5000).pages()):
        print 'Getting page {} for friends ids'.format(i)
        ids += page
    return ids

def process_user(user):
    user_id = user['id']
    screen_name = user['screen_name']
    print 'Processing user : {}'.format(screen_name)
    the_user = get_user_from_db(user_id)
    if the_user is None:
        user['followers_ids'] = get_followers_ids(screen_name)
        user['friends_ids'] = get_friends_ids(screen_name)
        users.insert_one(user)

if __name__ == "__main__":
    for doc in raw_tweets.find({'processed': {'$exists': False}}):
        print 'Start processing'
        try:
            process_user(doc['user'])
        except KeyError:
            pass
        try:
            process_user(doc['retweeted_status']['user'])
        except KeyError:
            pass
        raw_tweets.update_one({'_id': doc['_id']}, {'$set': {'processed': True}})
When the rate limit is hit, Tweepy will sleep and wait for it to clear. I get this message:
Rate limit reached. Sleeping for: 896
However, supervisor somehow restarts the script and runs it again, so the script never finishes. How do I stop that?
This is my supervisor configuration.
[program:twitter_processer]
command=/usr/bin/python -u /home/ubuntu/processer.py
directory=/home/ubuntu
autostart=true
autorestart=true
startretries=3
stderr_logfile=/home/ubuntu/processer.err.log
stdout_logfile=/home/ubuntu/processer.out.log
user=ubuntu
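A hedged observation (no answer was captured for this question): supervisord only restarts a program after its process has exited, so if restarts line up with the long sleep, the process is most likely dying there rather than being interrupted mid-sleep; the supervisord log and the stderr logfile should show the exit. Separately, autorestart=true restarts the program on any exit, including a clean one, so a script that finishes its loop will be started over forever. If that is unwanted, one option is to restrict restarts to unexpected exit codes:
[program:twitter_processer]
command=/usr/bin/python -u /home/ubuntu/processer.py
directory=/home/ubuntu
autostart=true
autorestart=unexpected  ; only restart when the exit code is not in exitcodes
exitcodes=0
startretries=3
stderr_logfile=/home/ubuntu/processer.err.log
stdout_logfile=/home/ubuntu/processer.out.log
user=ubuntu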

Python script to harvest tweets to a MongoDb works with users but not hashtags. Any ideas why not?

I'm playing around with the Twitter API and am in the process of developing a script to pull all tweets with a certain hashtag down to a local MongoDB. It works fine when I'm downloading tweets from users, but when downloading tweets for a hashtag I get:
return loads(fp.read(),
AttributeError: 'int' object has no attribute 'read'
Can anyone offer their infinite wisdom on how I could get this script to work?
To run, save it as a .py file, cd to the folder and run:
python twitter.py
Code:
__author__ = 'Tom Cusack'
import pymongo
import oauth2 as oauth
import urllib2, json
import sys, argparse, time

def oauth_header(url, consumer, token):
    params = {'oauth_version': '1.0',
              'oauth_nonce': oauth.generate_nonce(),
              'oauth_timestamp': int(time.time()),
              }
    req = oauth.Request(method='GET', url=url, parameters=params)
    req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, token)
    return req.to_header()['Authorization'].encode('utf-8')
def main():
    ### Twitter Settings
    numtweets = '32000'
    verbose = 'store_true'
    retweet = 'store_false'
    CONSUMER_KEY = 'M7Xu9Wte0eIZvqhb4G9HnIn3G'
    CONSUMER_SECRET = 'c8hB4Qwps2aODQUx7UsyzQuCRifEp3PKu6hPQll8wnJGIhbKgZ'
    ACCESS_TOKEN = '3213221313-APuXuNjVMbRbZpu6sVbETbgqkponGsZJVT53QmG'
    ACCESS_SECRET = 'BJHrqWC9ed3pA5oDstSMCYcUcz2pYF3DmJ7jcuDe7yxvi'
    base_url = url = 'https://api.twitter.com/1.1/search/tweets.json?include_entities=true&count=200&q=#mongodb&include_rts=%s' % (retweet)
    oauth_consumer = oauth.Consumer(key=CONSUMER_KEY, secret=CONSUMER_SECRET)
    oauth_token = oauth.Token(key=ACCESS_TOKEN, secret=ACCESS_SECRET)

    ### Mongodb Settings
    uri = 'mongodb://127.0.0.1:27017/SARKY'
    if uri != None:
        try:
            conn = pymongo.MongoClient(uri)
            print 'Pulling Tweets..'
        except:
            print 'Error: Unable to connect to DB. Check uri variable.'
            return
    uri_parts = pymongo.uri_parser.parse_uri(uri)
    db = conn[uri_parts['database']]
    db['twitter-harvest'].ensure_index('id_str')

    ### Helper Variables for Harvest
    max_id = -1
    tweet_count = 0
    stream = 0

    ### Begin Harvesting
    while True:
        auth = oauth_header(url, oauth_consumer, oauth_token)
        headers = {"Authorization": auth}
        request = urllib2.Request(url, headers=headers)
        try:
            stream = urllib2.urlopen(request)
        except urllib2.HTTPError, err:
            if err.code == 404:
                print 'Error: Unknown user. Check --user arg'
                return
            if err.code == 401:
                print 'Error: Unauthorized. Check Twitter credentials'
                return
        tweet_list = json.load(stream)
        if len(tweet_list) == 0:
            print 'No tweets to harvest!'
            return
        if 'errors' in tweet_list:
            print 'Hit rate limit, code: %s, message: %s' % (tweets['errors']['code'], tweets['errors']['message'])
            return
        if max_id == -1:
            tweets = tweet_list
        else:
            tweets = tweet_list[1:]
        if len(tweets) == 0:
            print 'Finished Harvest!'
            return
        for tweet in tweets:
            max_id = id_str = tweet['id_str']
            try:
                if tweet_count == numtweets:
                    print 'Finished Harvest- hit numtweets!'
                    return
                if uri != None:
                    db[user].update({'id_str': id_str}, tweet, upsert=True)
                else:
                    print tweet['text']
                tweet_count += 1
                if verbose == True and uri != None:
                    print tweet['text']
            except Exception, err:
                print 'Unexpected error encountered: %s' % (err)
                return
        url = base_url + '&max_id=' + max_id

if __name__ == '__main__':
    try:
        main()
    except SystemExit as e:
        if e.code == 0:
            pass
You initially set stream = 0. When your try...except block catches an HTTP response with a code that isn't 404 or 401, the except block doesn't break out of the function, so execution falls through to json.load(stream) with stream still equal to the integer 0, and json.load calls fp.read(), which is exactly the AttributeError you're seeing.
I'd look more closely at what this response says.
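A minimal sketch of one way to close that hole (my addition, matching the Python 2 style of the question): handle any other HTTP error explicitly instead of falling through:
        try:
            stream = urllib2.urlopen(request)
        except urllib2.HTTPError, err:
            if err.code == 404:
                print 'Error: Unknown user. Check --user arg'
                return
            if err.code == 401:
                print 'Error: Unauthorized. Check Twitter credentials'
                return
            # any other HTTP error: report it and bail out instead of
            # falling through to json.load(stream) with stream == 0
            print 'Error: HTTP %s: %s' % (err.code, err.read())
            return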
