Passing a psycopg2 cursor to tweepy's on_status() method - Python

I'm trying to pass a psycopg2 cursor to a tweepy stream.
The connection pool and cursors are configured in a separate file; only the cursor is passed as an argument to the main pipeline function in another file called get_tweet_topic.py. I need the cursor in the on_status() method because I have a query there that needs it for execution.
I cannot figure out how to pass the cursor into the on_status() method of the MyStreamListener() class.
The error I get is:
2020-03-05T22:16:24.856945+00:00 app[worker.1]: self._target(*self._args, **self._kwargs)
2020-03-05T22:16:24.856945+00:00 app[worker.1]: File "/app/get_tweet_topic.py", line 81, in guess_topic_pipeline
2020-03-05T22:16:24.856946+00:00 app[worker.1]: status_streams.streaming_pipeline(api, cursor)
2020-03-05T22:16:24.856947+00:00 app[worker.1]: File "/app/status_streams.py", line 100, in streaming_pipeline
2020-03-05T22:16:24.856947+00:00 app[worker.1]: general_stream(api, cursor)
2020-03-05T22:16:24.856948+00:00 app[worker.1]: File "/app/status_streams.py", line 86, in general_stream
2020-03-05T22:16:24.856948+00:00 app[worker.1]: myStreamListener = MyStreamListener()
2020-03-05T22:16:24.856948+00:00 app[worker.1]: TypeError: __init__() missing 1 required positional argument: 'cursor'
Code:
status_streams.py:
import tweepy
import os
import db_queries
import follow

# define class for the stream listener
class MyStreamListener(tweepy.StreamListener):
    def __init__(self, cursor):
        super().__init__()
        self.cursor = cursor
        # set counter to only get 1200 tweets
        self.counter = 0
        self.max = 1200

    # get tweets
    def on_status(self, status):
        if not status.retweeted:
            status_dict = {'created_at': status.created_at.strftime('%y-%m-%d %H:%M'),
                           'source_stream': 'general stream',
                           'status_id': status.id_str,
                           'user_id': status.user.id_str,
                           'screen_name': status.user.name,
                           'tweet_text': status.text,
                           'num_likes': status.favorite_count,
                           'num_retweets': status.retweet_count}
            created_at = status_dict['created_at']
            source_stream = status_dict['source_stream']
            status_id = status_dict['status_id']
            user_id = status_dict['user_id']
            screen_name = status_dict['screen_name']
            tweet_text = status_dict['tweet_text']
            num_likes = status_dict['num_likes']
            num_retweets = status_dict['num_retweets']
            db_queries.insert_raw_tweets_table(cursor, created_at, source_stream, status_id, user_id, screen_name, tweet_text, num_likes, num_retweets)
            self.counter += 1
            if self.counter == self.max:
                return False
# get tweets from list of followers
def following_stream(api, cursor, user_name):
    try:
        for status in tweepy.Cursor(api.user_timeline, tweet_mode='extended', include_rts=False, screen_name=user_name).items(1):
            # ignore retweets
            if not status.retweeted:
                status_dict = {'created_at': status.created_at.strftime('%y-%m-%d %H:%M'),
                               'source_stream': 'following stream',
                               'status_id': status.id_str,
                               'user_id': status.user.id_str,
                               'screen_name': status.user.name,
                               'tweet_text': status.full_text,
                               'num_likes': status.favorite_count,
                               'num_retweets': status.retweet_count}
                created_at = status_dict['created_at']
                source_stream = status_dict['source_stream']
                status_id = status_dict['status_id']
                user_id = status_dict['user_id']
                screen_name = status_dict['screen_name']
                tweet_text = status_dict['tweet_text']
                num_likes = status_dict['num_likes']
                num_retweets = status_dict['num_retweets']
                db_queries.insert_raw_tweets_table(cursor, created_at, source_stream, status_id, user_id, screen_name, tweet_text, num_likes, num_retweets)
    except tweepy.TweepError as e:
        # log API errors and keep the pipeline alive
        print(e)
# function that controls both streams
def streaming_pipeline(api, cursor):
    # get list of all users that are currently followed,
    # iterate through the following_list and grab the single latest tweet
    following_list = follow.get_following(api)
    for user in following_list:
        f_stream = following_stream(api, cursor, user)
    # stream class is used here
    myStreamListener = MyStreamListener()
    stream = tweepy.Stream(auth=api.auth, listener=myStreamListener(cursor=self.cursor))
    stream.filter(languages=['en'], track=['the'])
    cursor.close()
relevant section of get_tweet_topic.py:
def guess_topic_pipeline(api, conn, model, corpus, classifier):
    while True:
        cursor = conn.cursor()
        db_queries.create_temp_tweets_table(cursor)
        conn.commit()
        # use pipeline to grab tweets off twitter
        print('Retrieving statuses from streams...')
        status_streams.streaming_pipeline(api, cursor)
        print('Done retrieving...')
relevant portion of connection pooling code:
# get connection from pool, pass cursor as an argument, start topic extraction thread
topic_conn = conn_pool.getconn()
topic_extraction_thread = Thread(target=get_tweet_topic.guess_topic_pipeline, kwargs={'api': api, 'conn': topic_conn, 'model': lda_model, 'corpus': lda_id2word, 'classifier': lda_huber_classifier})
topic_extraction_thread.start()
# return connection when done
conn_pool.putconn(topic_conn)
insert_raw_tweets_table() function with actual query:
def insert_raw_tweets_table(cursor, createdAt, sourceStream, statusID, userID, screenName, tweetText, numLikes, numRetweets):
    cursor.execute('INSERT INTO tempTweets(createdAt, sourceStream, statusID, userID, screenName, tweetText, numLikes, numRetweets) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)', (createdAt, sourceStream, statusID, userID, screenName, tweetText, numLikes, numRetweets))

@MauriceMeyer answered the question in the comments, but here is the working code for clarity.
I forgot to reference the cursor as self.cursor within the class, and I forgot to pass the cursor as an argument when creating an instance of the class. I was passing the cursor as an argument after creating the instance, which is not correct.
Correct code:
class MyStreamListener(tweepy.StreamListener):
    def __init__(self, cursor):
        super().__init__()
        self.cursor = cursor
        # set counter to only get 1200 tweets
        self.counter = 0
        self.max = 1200

    # get tweets
    def on_status(self, status):
        if not status.retweeted:
            status_dict = {'created_at': status.created_at.strftime('%y-%m-%d %H:%M'),
                           'source_stream': 'general stream',
                           'status_id': status.id_str,
                           'user_id': status.user.id_str,
                           'screen_name': status.user.name,
                           'tweet_text': status.text,
                           'num_likes': status.favorite_count,
                           'num_retweets': status.retweet_count}
            created_at = status_dict['created_at']
            source_stream = status_dict['source_stream']
            status_id = status_dict['status_id']
            user_id = status_dict['user_id']
            screen_name = status_dict['screen_name']
            tweet_text = status_dict['tweet_text']
            num_likes = status_dict['num_likes']
            num_retweets = status_dict['num_retweets']
            # ▼ reference self.cursor here
            db_queries.insert_raw_tweets_table(self.cursor, created_at, source_stream, status_id, user_id, screen_name, tweet_text, num_likes, num_retweets)
            self.counter += 1
            if self.counter == self.max:
                return False

# stream class is used here ▼ pass cursor here
myStreamListener = MyStreamListener(cursor)
# ▼ removed reference to cursor here
stream = tweepy.Stream(auth=api.auth, listener=myStreamListener)
stream.filter(languages=['en'], track=['the'])

Related

Post Excel data into MySQLdb in Python

While inserting data into a MySQL database, it throws "MySQLdb._exceptions.OperationalError: (1046, 'No database selected')". I have checked this code snippet step by step; the database connection is established, but it still shows the above error.
Here is my table creation query:
create table database_conn(id int, name varchar(20), age int(100), address varchar(50), state varchar(10), zipcode int(20));
Here is my Python code snippet for inserting data into the database:
class Database_conn:
    def __init__(self, host, user, password, database, file_name):
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.file_name = file_name
        self.conn()
        self.excel_sheet()
        self.db_query()

    def conn(self):
        self.mydb = MySQLdb.connect(host=self.host,
                                    user=self.user, password=self.password)

    def excel_sheet(self):
        self.book = xlrd.open_workbook(self.file_name)
        self.sheet = self.book.sheet_by_name("Sheet1")
        try:
            self.cursor = self.mydb.cursor()
            print('established cursor connection')
        except:
            print('something goes wrong')

    def db_query(self):
        global values, conn
        self.query = """INSERT INTO database_conn(id,name,age,address,state,zipcode) VALUES(%s,%s,%s,%s,%s,%s)"""
        for i in range(1, self.sheet.nrows):
            id = self.sheet.cell(i, 0).value
            name = self.sheet.cell(i, 1).value
            age = self.sheet.cell(i, 2).value
            address = self.sheet.cell(i, 3).value
            state = self.sheet.cell(i, 4).value
            zipcode = self.sheet.cell(i, 5).value
            values = (id, name, age, address, state, zipcode)
            conn = self.cursor
            conn.execute(self.query, values)
        conn.commit()
        conn.close()

file_name = input('Enter a file_name : ')
d = Database_conn('localhost', '****', '****', '****', file_name)
In the above code I am getting the error on this line:
conn.execute(self.query, values)
Finally, I did manage to insert the data into my db.
Here is my answer:
class Database_conn:
    def __init__(self, host, user, password, database, file_name):
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.file_name = file_name
        self.conn()
        self.excel_sheet()
        self.db_query()
        self.db_close()

    def conn(self):
        # pass db= so MySQL knows which database to use;
        # this is what fixes the "No database selected" error
        self.mydb = MySQLdb.connect(host=self.host,
                                    user=self.user, password=self.password, db=self.database)

    def excel_sheet(self):
        self.book = xlrd.open_workbook(self.file_name)
        self.sheet = self.book.sheet_by_name("Sheet1")
        try:
            self.cursor = self.mydb.cursor()
            print('established cursor connection')
        except:
            print('something goes wrong')

    def db_query(self):
        global values, conn
        self.query = """INSERT INTO database_conn(id,name,age,address,state,zipcode) VALUES(%s,%s,%s,%s,%s,%s)"""
        for i in range(1, self.sheet.nrows):
            id = self.sheet.cell(i, 0).value
            name = self.sheet.cell(i, 1).value
            age = self.sheet.cell(i, 2).value
            address = self.sheet.cell(i, 3).value
            state = self.sheet.cell(i, 4).value
            zipcode = self.sheet.cell(i, 5).value
            values = (id, name, age, address, state, zipcode)
            conn = self.cursor
            conn.execute(self.query, values)
        conn.close()

    def db_close(self):
        # commit and close on the connection object, not the cursor
        self.mydb.commit()
        self.mydb.close()

file_name = input('Enter a file_name : ')
d = Database_conn('localhost', '****', '****', '*****', file_name)
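As a side note, row-by-row execute() calls can be batched into a single executemany() call, which avoids a round trip per row. A minimal sketch, assuming the same sheet layout (row 0 is the header) and an already-open MySQLdb connection; the names insert_rows, mydb and sheet are placeholders:
def insert_rows(mydb, sheet):
    query = ("INSERT INTO database_conn(id,name,age,address,state,zipcode) "
             "VALUES(%s,%s,%s,%s,%s,%s)")
    # collect one tuple per data row
    rows = [tuple(sheet.cell(i, j).value for j in range(6))
            for i in range(1, sheet.nrows)]
    cursor = mydb.cursor()
    try:
        cursor.executemany(query, rows)  # single batched call
        mydb.commit()                    # commit on the connection, not the cursor
    finally:
        cursor.close()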

Do I need to refresh the SQLAlchemy connection object to be able to insert data into a newly created table?

I am a beginner with SQLAlchemy.
My connection function in _core.py:
from sqlalchemy import create_engine
from methodtools import lru_cache

@lru_cache(maxsize=16)
def get_engine(db="homelan"):
    qs = 'mysql+pymysql://user:pwd@localhost/{db}'.format(db=db)
    engine = create_engine(qs)
    connection = engine.connect()
    return engine, connection
In my code, if the table does not exist for a specific host machine, I create it as shown below:
server_status.py
class HostStatusManager(object):
    """This class contains methods to manage the status of the host
    registered in the database for supervision or monitoring purposes.
    """
    keep_record = 10  # days

    def __init__(self, ip_address):
        super(HostStatusManager, self).__init__()
        self._ip = ip_address
        engine, connection = _core.get_engine()
        self._engine = engine
        self._connection = connection
        self._host_table = None
        self._host_table_name = None
        if not self.host_status_table_exists():
            self._host_table = self._create_table()

    def get_status(self):
        """Gets the latest status of the host, whether online or offline."""
        columns = self._host_table.columns
        print("Cols: {0}".format(columns))
        select_field = getattr(columns, "status")
        query = db.select(
            [select_field]
        ).order_by(
            db.desc(
                getattr(columns, "id")
            )
        ).limit(1)
        _log.debug(query)
        ResultProxy = self._connection.execute(query)
        ResultSet = ResultProxy.fetchall()
        if ResultSet:
            return ResultSet[0][0]
        _log.warning("No existing status found from {0}.".format(
            self._host_table
        ))

    def set_status(self, data):
        query = db.insert(self._host_table).values(**data)
        results = self._connection.execute(query)
If I directly call set_status it works fine, but get_status throws an error saying:
pymysql.err.InternalError: (1412, 'Table definition has changed, please retry transaction')
You shouldn't be using an LRU cache to store connections; use the engine's built-in connection pool instead. Every time you need to talk to the database, ask the engine for a connection, and close that connection when you're done with it. The engine will by default keep a pool of size 5.
from sqlalchemy import create_engine

def get_engine(db="homelan"):
    qs = 'mysql+pymysql://user:pwd@localhost/{db}'.format(db=db)
    engine = create_engine(qs)
    return engine

class HostStatusManager(object):
    """This class contains methods to manage the status of the host
    registered in the database for supervision or monitoring purposes.
    """
    keep_record = 10  # days

    def __init__(self, ip_address):
        super(HostStatusManager, self).__init__()
        self._ip = ip_address
        # get_engine() now returns just the engine
        self._engine = _core.get_engine()
        self._host_table = None
        self._host_table_name = None
        if not self.host_status_table_exists():
            self._host_table = self._create_table()

    def get_status(self):
        """Gets the latest status of the host, whether online or offline."""
        columns = self._host_table.columns
        print("Cols: {0}".format(columns))
        select_field = getattr(columns, "status")
        query = db.select(
            [select_field]
        ).order_by(
            db.desc(
                getattr(columns, "id")
            )
        ).limit(1)
        _log.debug(query)
        connection = self._engine.connect()
        try:
            ResultProxy = connection.execute(query)
            ResultSet = ResultProxy.fetchall()
            if ResultSet:
                return ResultSet[0][0]
            _log.warning("No existing status found from {0}.".format(
                self._host_table
            ))
        finally:
            connection.close()

    def set_status(self, data):
        query = db.insert(self._host_table).values(**data)
        connection = self._engine.connect()
        try:
            results = connection.execute(query)
        finally:
            connection.close()
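For what it's worth, the same check-out/close dance can also be written with the connection as a context manager, which hands the connection back to the pool even if execute() raises. A sketch of get_status() under the same table and engine assumptions as above:
def get_status(self):
    """Gets the latest status of the host, whether online or offline."""
    columns = self._host_table.columns
    query = db.select(
        [columns.status]
    ).order_by(
        db.desc(columns.id)
    ).limit(1)
    # engine.connect() yields a Connection usable as a context manager;
    # leaving the block closes it and returns it to the engine's pool
    with self._engine.connect() as connection:
        rows = connection.execute(query).fetchall()
    return rows[0][0] if rows else None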

Python uncaught exception when inserting data into MySQL database

I have a Python AWS Lambda function that takes JSON records, checks them to see if they have the required keys, and then inserts them into a MySQL db (AWS RDS Aurora). The function (def handler) gets invoked whenever a new record comes into the stream.
At the moment, Lambda is reporting some errors, but when I look at the CloudWatch logs I don't see any, which leads me to believe that maybe I'm not handling or catching the exception. Can anyone tell me where the issue might be?
from __future__ import print_function

import base64
import json
import pymysql

RDS_HOST = 'host'
DB_USER = 'dummy_user'
DB_PASSWORD = 'password1234'
DB_NAME = 'crazy_name'
DB_TABLE = 'wow_table'

class MYSQL(object):
    '''
    This is a wrapper class for PyMySQL
    '''
    CONNECTION_TIMEOUT = 30

    def __init__(self, host, user, password, database, table):
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.table = table
        self.connection = self.connect()

    def connect(self):
        '''
        Connects to MySQL instance
        '''
        try:
            connection = pymysql.connect(
                host=self.host,
                user=self.user,
                password=self.password,
                db=self.database,
                connect_timeout=self.CONNECTION_TIMEOUT
            )
            return connection
        except Exception as ex:
            print(ex)
            print("ERROR: Unexpected error: Could not connect to AuroraDB instance")

    def execute(self, account_id, external_ref_id, timestamp):
        '''
        Executes command given a MySQL connection
        '''
        with self.connection.cursor() as cursor:
            sql = ('INSERT INTO ' +
                   self.database +
                   '.' +
                   self.table +
                   '(`account_id`, `external_reference_id`, `registration`, `c_name`, `c_id`, `create_date`)' +
                   ' VALUES (%s, %s, DATE_FORMAT(STR_TO_DATE(%s,"%%Y-%%M-%%d %%H:%%i:%%s"),"%%Y-%%m-%%d %%H:%%i:%%s"), %s, %s, current_timestamp())' +
                   ' ON DUPLICATE KEY UPDATE create_date = VALUES(create_date)')
            cursor.execute(sql, (
                account_id,
                external_ref_id,
                timestamp,
                'bingo',
                300)
            )
        self.connection.commit()

    def close_connection(self):
        '''
        Closes connection to MySQL
        '''
        self.connection.close()

def get_data_from_kinesis_object(obj):
    '''
    Retrieves data from kinesis event
    '''
    return obj['kinesis']['data']

def decode_data(data):
    '''
    Decodes record via base64
    '''
    return base64.b64decode(data)

def split_records_into_record(records):
    '''
    Splits a record of records into an array of records
    '''
    return records.split('\n')

def parse_record(record):
    '''
    Parses record into JSON
    '''
    if record:
        return json.loads(record)

def is_record_valid(record):
    '''
    Checks for keys in the event;
    returns True if they all exist
    and False if they don't
    '''
    return all(key in record for key in (
        'eventName',
        'sourceType',
        'AccountId',
        'Timestamp',
        'ExternalReferenceId'
    ))

def handler(event, context):
    """
    This function inserts data into the Aurora RDS instance
    """
    mysql = MYSQL(RDS_HOST, DB_USER, DB_PASSWORD, DB_NAME, DB_TABLE)
    for obj in event['Records']:
        records = decode_data(get_data_from_kinesis_object(obj))
        split_records = split_records_into_record(records)
        for record in split_records:
            parsed_record = parse_record(record)
            if is_record_valid(parsed_record):
                mysql.execute(
                    parsed_record['AccountId'],
                    parsed_record['ExternalReferenceId'],
                    str(parsed_record['Timestamp'])
                )
    mysql.close_connection()
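One possible culprit is visible in the code itself: connect() only prints on failure and returns None, so self.connection.cursor() would then raise an AttributeError that nothing catches. A hedged sketch of a handler that logs the full traceback to CloudWatch before re-raising, so that Lambda's error count and the logs agree (the guard for parse_record() returning None on blank lines is also an addition):
import traceback

def handler(event, context):
    """Inserts data into the Aurora RDS instance, logging any failure."""
    mysql = MYSQL(RDS_HOST, DB_USER, DB_PASSWORD, DB_NAME, DB_TABLE)
    try:
        for obj in event['Records']:
            records = decode_data(get_data_from_kinesis_object(obj))
            for record in split_records_into_record(records):
                parsed_record = parse_record(record)
                # parse_record() returns None for blank lines, so guard it
                if parsed_record and is_record_valid(parsed_record):
                    mysql.execute(
                        parsed_record['AccountId'],
                        parsed_record['ExternalReferenceId'],
                        str(parsed_record['Timestamp'])
                    )
    except Exception:
        # print() output lands in the invocation's CloudWatch log stream
        print(traceback.format_exc())
        raise
    finally:
        if mysql.connection:
            mysql.close_connection()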

Logging into the server AND MySQL from Python

I'm trying to log in to my MySQL server that I'm running on DigitalOcean, but unfortunately I have no clue how to perform the login through Python. I've got the MySQL part implemented, but I don't know how to log in to the actual server itself (the computer). What other code do I need to add to accomplish this? I've already added the variables mySqlUser and mySqlPassword to the top of the file.
Here is the code I have so far:
import MySQLdb

class Database:
    host = 'some ip address'
    user = 'root'
    password = '123'
    mySqlUser = 'root'
    mySqlPassword = 'someotherpassword'
    db = 'test'

    def __init__(self):
        self.connection = MySQLdb.connect(self.host, self.user, self.password, self.db)
        self.cursor = self.connection.cursor()

    def insert(self, query):
        try:
            self.cursor.execute(query)
            self.connection.commit()
        except:
            self.connection.rollback()

    def query(self, query):
        cursor = self.connection.cursor(MySQLdb.cursors.DictCursor)
        cursor.execute(query)
        return cursor.fetchall()

    def __del__(self):
        self.connection.close()

if __name__ == "__main__":
    db = Database()

    # CleanUp Operation
    del_query = "DELETE FROM basic_python_database"
    db.insert(del_query)

    # Data Insert into the table
    query = """
        INSERT INTO basic_python_database
        (`name`, `age`)
        VALUES
        ('Mike', 21),
        ('Michael', 21),
        ('Imran', 21)
        """
    # db.query(query)
    db.insert(query)

    # Data retrieved from the table
    select_query = """
        SELECT * FROM basic_python_database
        WHERE age = 21
        """
    people = db.query(select_query)
    for person in people:
        print "Found %s " % person['name']
You can try this:
def __init__(self):
    self.host = 'some ip address'
    self.user = 'root'
    self.password = '123'
    self.mySqlUser = 'root'
    self.mySqlPassword = 'someotherpassword'
    self.connection = MySQLdb.connect(self.host, self.user, self.password, self.db)
    self.cursor = self.connection.cursor()
or
def __init__(self):
    self.connection = MySQLdb.connect(host, user, password, db)
    self.cursor = self.connection.cursor()
You had better pass these values in as parameters when instantiating the class, instead of hard-coding them in it. Just a suggestion.
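If the goal is to reach a MySQL server that only listens on the droplet itself, one common approach (not covered by the answer above) is an SSH tunnel. A sketch using the third-party sshtunnel package; every hostname and credential below is a placeholder:
import MySQLdb
from sshtunnel import SSHTunnelForwarder  # pip install sshtunnel

# placeholder credentials; substitute your droplet and MySQL values
with SSHTunnelForwarder(
    ('some ip address', 22),                 # the DigitalOcean droplet
    ssh_username='root',
    ssh_password='123',
    remote_bind_address=('127.0.0.1', 3306)  # MySQL as seen from the droplet
) as tunnel:
    connection = MySQLdb.connect(
        host='127.0.0.1',
        port=tunnel.local_bind_port,          # forwarded local port
        user='root',
        passwd='someotherpassword',
        db='test'
    )
    cursor = connection.cursor()
    cursor.execute('SELECT VERSION()')
    print(cursor.fetchone())
    connection.close()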

Tweepy not working

So I was just trying to run a tweepy script to collect tweets.
I've set up the database, but now I'm running into an error.
Starting...
Started user: user1
Exception in thread Thread-1:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 810, in __bootstrap_inner
self.run()
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 763, in run
self.__target(*self.__args, **self.__kwargs)
File "build/bdist.macosx-10.6-intel/egg/tweepy/streaming.py", line 414, in filter
self.body['follow'] = u','.join(follow).encode(encoding)
TypeError: sequence item 0: expected string or Unicode, int found
EDIT: The script was actually using from urllib import urlencode_noplus, but _noplus does not exist in urllib, which is why I simply deleted it from the code, although I suspect this is causing the error.
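If the deleted helper turns out to matter, a guess at a drop-in shim (assuming the old tweepy utility simply URL-encoded without '+' for spaces) might look like:
from urllib import urlencode

def urlencode_noplus(query):
    # encode the query string, then swap '+' (form-style space) for '%20'
    return urlencode(query).replace('+', '%20')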
import tweepy
import threading
import logging
from tweepy.models import Status
from tweepy.utils import import_simplejson
from urllib import urlencode
import json
import re

json = import_simplejson()

class Stream:
    def __init__(self, consumer_key, consumer_secret,
                 key, secret, name):
        self.auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        self.auth.set_access_token(key, secret)
        self.tweetsBuffer = TweetsBuffer()
        self.name = name
        self.logger = logging.getLogger('TwitterCollector')
        # check credentials
        if not tweepy.API(self.auth).verify_credentials():
            print "Invalid credentials for user: ", self.name, ".\nExiting..."
            logging.error("Invalid credentials for user: " + self.name + ".\nExiting...")
            exit(0)

    def run(self, users_list=None):
        sl = StreamListener()
        sl.init(self.tweetsBuffer)
        try:
            streamer = tweepy.Stream(auth=self.auth,
                                     listener=sl,
                                     timeout=3000000000,
                                     include_entities=1,
                                     include_rts=1)
            # load friends
            filter = []
            if users_list is None:
                filter = tweepy.API(self.auth).friends_ids()
            else:
                for subList in users_list:
                    for user in subList['users']:
                        filter.append(user.id)
            # remove duplicates
            filter = list(set(filter))
            sThread = threading.Thread(target=streamer.filter, args=(filter,))
            sThread.start()
            return sThread
        except Exception, e:
            print e

    def getTweetsBuffer(self):
        return self.tweetsBuffer

    def getUserList(self, lists):
        if lists is None:
            return None
        api = tweepy.API(self.auth)
        users_list = []
        for list in lists:
            users = []
            members = tweepy.Cursor(
                api.list_members,
                list['owner'],
                list['slug']
            ).items()
            for member in members:
                users.append(member)
            users_list.append(
                {
                    'owner': list['owner'],
                    'slug': list['slug'],
                    'users': users
                })
        return users_list

class StreamListener(tweepy.StreamListener):
    def init(self, tweetsBuffer):
        # set buffer
        self.tweetsBuffer = tweetsBuffer

    def parse_status(self, status, retweet=False):
        tweet = {
            'tweet_id': status.id,
            'tweet_text': status.text,
            'created_at': status.created_at,
            'geo_lat': status.coordinates['coordinates'][0]
                if not status.coordinates is None
                else 0,
            'geo_long': status.coordinates['coordinates'][1]
                if not status.coordinates is None
                else 0,
            'user_id': status.user.id,
            'tweet_url': "http://twitter.com/" + status.user.id_str + "/status/" + status.id_str,
            'retweet_count': status.retweet_count,
            'original_tweet_id': status.retweeted_status.id
                if not retweet and (status.retweet_count > 0)
                else 0,
            'urls': status.entities['urls'],
            'hashtags': status.entities['hashtags'],
            'mentions': status.entities['user_mentions']
        }
        # parse user object
        user = {
            'user_id': status.user.id,
            'screen_name': status.user.screen_name,
            'name': status.user.name,
            'followers_count': status.user.followers_count,
            'friends_count': status.user.friends_count,
            'description': status.user.description
                if not status.user.description is None
                else "N/A",
            'image_url': status.user.profile_image_url,
            'location': status.user.location
                if not status.user.location is None
                else "N/A",
            'created_at': status.user.created_at
        }
        return {'tweet': tweet, 'user': user}

    def on_data(self, data):
        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, json.loads(data))
            if self.on_status(status, data) is False:
                return False
        elif 'delete' in data:
            delete = json.loads(data)['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False
        elif 'limit' in data:
            if self.on_limit(json.loads(data)['limit']['track']) is False:
                return False

    def on_status(self, status, rawJsonData):
        try:
            # parse tweet
            tweet = self.parse_status(status)
            tweet['raw_json'] = rawJsonData
            self.tweetsBuffer.insert(tweet)
            # parse retweet
            if tweet['tweet']['retweet_count'] > 0:
                retweet = self.parse_status(status.retweeted_status, True)
                retweet['raw_json'] = None
                self.tweetsBuffer.insert(retweet)
        except Exception:
            # Catch any unicode errors while printing to console
            # and just ignore them to avoid breaking the application.
            pass

class TweetsBuffer():
    tweetsBuffer = []

    def __init__(self):
        self.lock = threading.Lock()

    def insert(self, tweet):
        self.lock.acquire()
        self.tweetsBuffer.append(tweet)
        self.lock.release()

    def pop(self):
        self.lock.acquire()
        tweet = self.tweetsBuffer.pop() if len(self.tweetsBuffer) > 0 else None
        self.lock.release()
        return tweet
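For what it's worth, the traceback points at streaming.py joining the follow list with u','.join(follow), which requires strings, while friends_ids() and user.id return ints. A hedged guess at a fix is to stringify the IDs inside Stream.run() before starting the stream thread:
# convert the user IDs to strings, since tweepy joins them with
# u','.join(...) and integer items raise the TypeError shown above
filter = [str(user_id) for user_id in set(filter)]
sThread = threading.Thread(target=streamer.filter, args=(filter,))
sThread.start()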
