I'm streaming Twitter data from the API into a Postgres database by modeling this script. Using those exact methods, I'm able to stream the data successfully into the two tables (one containing user_id/user_name, and the other containing data). I've been able to make minor changes to extract a few other bits of information, but with these methods I'm only collecting retweets for a given keyword list, and I would like to collect all tweets for the list. Based on the way the original script collects and stores retweet user_ids and user_names, I changed the code and tried to stream into a new table without making any references to retweets. Unfortunately, the result was two empty tables. The code otherwise ran fine and was printing statements to the terminal; there was just no data. Why would this be? Below is my code:
import psycopg2
import tweepy
import json
import numpy as np

# Importing postgres credentials
import postgres_credentials

# Importing twitter credentials
import twitter_credentials

# Accessing twitter from the App created in my account
def autorize_twitter_api():
    """
    This function gets the consumer key, consumer secret key, access token
    and access token secret given by the app created in your Twitter account
    and authenticates them with Tweepy.
    """
    # Get access and consumer keys and tokens
    auth = tweepy.OAuthHandler(twitter_credentials.CONSUMER_KEY, twitter_credentials.CONSUMER_SECRET)
    auth.set_access_token(twitter_credentials.ACCESS_TOKEN, twitter_credentials.ACCESS_TOKEN_SECRET)
    return auth

def create_tweets_table(term_to_search):
    """
    This function opens a connection with an already created database and creates a new table to
    store tweets related to a subject specified by the user
    """
    # Connect to Twitter Database created in Postgres
    conn_twitter = psycopg2.connect(dbname=postgres_credentials.dbname, user=postgres_credentials.user,
                                    password=postgres_credentials.password, host=postgres_credentials.host,
                                    port=postgres_credentials.port)

    # Create a cursor to perform database operations
    cursor_twitter = conn_twitter.cursor()

    # With the cursor, create two tables: the twitter users table and the corresponding
    # table for the selected topic
    cursor_twitter.execute("CREATE TABLE IF NOT EXISTS test_twitter_users (user_id VARCHAR PRIMARY KEY, user_name VARCHAR);")

    query_create = "CREATE TABLE IF NOT EXISTS %s (id SERIAL, created_at_utc timestamp, tweet text NOT NULL, user_id VARCHAR, user_name VARCHAR, PRIMARY KEY(id), FOREIGN KEY(user_id) REFERENCES twitter_users(user_id));" % ("test_tweet_text")
    cursor_twitter.execute(query_create)

    # Commit changes
    conn_twitter.commit()

    # Close cursor and the connection
    cursor_twitter.close()
    conn_twitter.close()
    return

def store_tweets_in_table(term_to_search, created_at_utc, tweet, user_id, user_name):
    """
    This function opens a connection with an already created database and inserts into the
    corresponding table tweets related to the selected topic
    """
    # Connect to Twitter Database created in Postgres
    conn_twitter = psycopg2.connect(dbname=postgres_credentials.dbname, user=postgres_credentials.user,
                                    password=postgres_credentials.password, host=postgres_credentials.host,
                                    port=postgres_credentials.port)

    # Create a cursor to perform database operations
    cursor_twitter = conn_twitter.cursor()

    # With the cursor, insert the tweet into the table
    cursor_twitter.execute(
        "INSERT INTO test_twitter_users (user_id, user_name) VALUES (%s, %s) ON CONFLICT(user_id) DO NOTHING;",
        (user_id, user_name))
    cursor_twitter.execute(
        "INSERT INTO %s (created_at_utc, tweet, user_id, user_name) VALUES (%%s, %%s, %%s, %%s);" % ('test_tweet_text'),
        (created_at_utc, tweet, user_id, user_name))

    # Commit changes
    conn_twitter.commit()

    # Close cursor and the connection
    cursor_twitter.close()
    conn_twitter.close()
    return

class MyStreamListener(tweepy.StreamListener):
    '''
    def on_status(self, status):
        print(status.text)
    '''

    def on_data(self, raw_data):
        try:
            global term_to_search
            data = json.loads(raw_data)

            # Obtain all the variables to store in each column
            user_id = data['user']['id']
            user_name = data['user']['name']
            created_at_utc = data['created_at']
            tweet = data['text']

            # Store them in the corresponding table in the database
            store_tweets_in_table(term_to_search, created_at_utc, tweet, user_id, user_name)
        except Exception as e:
            print(e)

    def on_error(self, status_code):
        if status_code == 420:
            # returning False in on_error disconnects the stream
            return False

########################################################################

while True:
    if __name__ == "__main__":
        # Creates the table for storing the tweets
        term_to_search = ["donald trump", "trump"]
        create_tweets_table(term_to_search)

        # Connect to the streaming twitter API
        api = tweepy.API(wait_on_rate_limit_notify=True)

        # Stream the tweets
        try:
            streamer = tweepy.Stream(auth=autorize_twitter_api(), listener=MyStreamListener(api=api), tweet_mode='extended')
            streamer.filter(track=term_to_search)
        except:
            continue
What happens if you print the values in this function? Do you have values there?
def on_data(self, raw_data):
    try:
        global term_to_search
        data = json.loads(raw_data)

        # Obtain all the variables to store in each column
        user_id = data['user']['id']
        user_name = data['user']['name']
        created_at_utc = data['created_at']
        tweet = data['text']

        # Store them in the corresponding table in the database
        store_tweets_in_table(term_to_search, created_at_utc, tweet, user_id, user_name)
    except Exception as e:
        print(e)
When you print the SQL statements, can you see whether the INSERTs are missing data?
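One way to check this with psycopg2 is cursor.mogrify, which renders a query with its parameters bound without executing it. A minimal debugging sketch, reusing the cursor and statement from the question:

# Render the INSERT with its bound parameters, without executing it,
# to confirm the values are actually present
sql = cursor_twitter.mogrify(
    "INSERT INTO test_twitter_users (user_id, user_name) VALUES (%s, %s) ON CONFLICT(user_id) DO NOTHING;",
    (user_id, user_name))
print(sql)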
I discovered the issue: I was creating two new tables, but inserting the data into two different tables.
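For anyone who hits the same symptom: in the create_tweets_table function above, the FOREIGN KEY clause still references the original twitter_users table rather than the newly created test_twitter_users, which would make the inserts fail (the exception is swallowed and printed by on_data, so the stream keeps running while the tables stay empty). A sketch of the matching CREATE TABLE, assuming the new table names from the question:

# Corrected DDL: the foreign key now points at the new users table
query_create = ("CREATE TABLE IF NOT EXISTS test_tweet_text ("
                "id SERIAL, created_at_utc timestamp, tweet text NOT NULL, "
                "user_id VARCHAR, user_name VARCHAR, PRIMARY KEY(id), "
                "FOREIGN KEY(user_id) REFERENCES test_twitter_users(user_id));")
cursor_twitter.execute(query_create)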
Related
Requirement: I want to create a Python API that inserts data into a BigQuery table. The API will be hosted in Swagger/Postman, and from there a user can provide input data so that it gets reflected in the BigQuery table.
Can anyone help me find a suitable solution with code?
import sqlite3 as sql
from google.cloud import bigquery
from google.oauth2 import service_account

credentials = service_account.Credentials.from_service_account_file('path/to/file.json')
project_id = 'project_id'
client = bigquery.Client(credentials=credentials, project=project_id)

def add_data(group_name, user_name):
    try:
        # Connecting to database
        con = sql.connect('shot_database.db')

        # Getting cursor
        c = con.cursor()

        # Adding data
        job_config.use_legacy_sql = True
        query_job = client.query("""
            INSERT INTO `table_name` (group, user)
            VALUES (%s, %s)""", job_config=job_config)
        results = query_job.result()  # Wait for the job to complete.

        # Applying changes
        con.commit()
    except:
        print("An error has occured")
The code you provided is a mix of SQLite and BigQuery, but it looks like you're trying to use BigQuery to insert data into a table. To insert data into a BigQuery table using Python, you can use the client's streaming insert method, insert_rows_json() (the insert_data() call you may see in older examples is no longer part of the current client API). Here's an example of how you can use this method to insert data into a table called "mytable" in a dataset called "mydataset":
import json  # for pretty-printing any insert errors

# Define the data you want to insert
data = [
    {
        "group": group_name,
        "user": user_name
    }
]

# Insert the data (a streaming insert; returns a list of per-row errors)
table_id = "mydataset.mytable"
errors = client.insert_rows_json(table_id, data)

if errors == []:
    print("Data inserted successfully")
else:
    print("Errors occurred while inserting data:")
    print(json.dumps(errors, indent=2))
Then you can create an API using Flask or Django and call the add_data method you defined to insert data into the BigQuery table.
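A minimal Flask sketch of that idea, reusing the client from above; the endpoint name and JSON field names are assumptions for illustration, not part of the original question:

from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/add", methods=["POST"])  # hypothetical endpoint name
def add():
    payload = request.get_json()
    # Reuse the streaming insert from the answer above; field names are assumed
    errors = client.insert_rows_json("mydataset.mytable", [{
        "group": payload["group_name"],
        "user": payload["user_name"],
    }])
    if errors:
        return jsonify({"status": "error", "errors": errors}), 500
    return jsonify({"status": "ok"})

if __name__ == "__main__":
    app.run(port=5000)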
class StdOutListener(StreamListener):
    def on_data(self, data):
        all_data = json.loads(data)
        Username = all_data["user"]["screen_name"]
        Tweet = all_data["text"]
        Location = all_data["place"]["name"]
        c.execute("INSERT INTO Tweets (Username, Tweet, Location) VALUES (?,?,?)",
                  (Username, Tweet, Location))
I have set up a StreamListener to follow a specified Twitter account. Whenever a tweet is posted, the username, tweet and location are retrieved and stored into an SQL database. The problem I'm encountering is that if no location is present, the program fails. Is there any way to check whether a location value is present?
Just use a try/except:
try:
    Location = all_data["place"]["name"]
except (KeyError, TypeError):  # "place" can be missing, or present but null
    Location = "anywhere"
I am making a Telegram bot that can access a database to reply to users' queries. The bot needs to respond to specific requests for certain data in the database. I was able to solve the case where users request all data, but I am stuck with individual data. I am using telegram.ext from the telegram package in Python. Here is what I have done so far.
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters
import MySQLdb

currr = []  # global list var ~don't bash me for using global in python please, I'm a newbie

# request for all data in database
def request2(bot, update):
    db = MySQLdb.connect(host="local", user="root", passwd="pwd", db="mydb")
    cur = db.cursor()
    cur.execute("select ID from table")
    ID = cur.fetchall()
    cur.execute("SELECT ID, temp FROM table2 order by indexs desc")
    each_rows = cur.fetchall()
    for IDs in ID:
        for each_row in each_rows:
            if str(each_row[0])[0:4] == str(ID)[2:6]:
                update.message.reply_text('reply all related data here')
                break

# request for single data
def individualreq(bot, update):
    db = pymysql.connect(host="localhost", user="root", passwd="pwd", db="mydb")
    update.message.reply_text('reply individual data to users here')

def main():
    updater = Updater("TOKEN")
    dp = updater.dispatcher
    global currr

    # get all ID from database
    db = MySQLdb.connect(host="localhost", user="root", passwd="pwd", db="mydb")
    cur = db.cursor()
    cur.execute("select ID from table")
    curr_ID = cur.fetchall()

    # example ID = 'F01', 'F02', 'F03'
    for curr_IDs in curr_ID:
        currr.append(curr_IDs[0])

    # request all data
    dp.add_handler(CommandHandler("all", request2))

    # request individual data
    dp.add_handler(CommandHandler(currr, individualreq))  # list of commands in currr[]

if __name__ == '__main__':
    main()
I am looking for a way to pass the current command, which is also the ID in the database that the user requested, from the currr[] list to the individualreq(bot, update) function, so that only the data for the called ID is replied. Users will select from a list of IDs in Telegram, and the command handler can pass the selected ID to the function. I have not found a way to pass the ID to the function. Could someone help me solve this, please? Thanks.
I found a solution to my question from the answer provided by Oluwafemi Sule. CommandHandler can pass the arguments of the command to the function by adding pass_args=True to the CommandHandler.
dp.add_handler(CommandHandler(currr, individualreq, pass_args=True))
To print out the args in the function, the function needs to receive them.
def individualreq(bot, update, args):
    # update.message.text holds the command the user sent, e.g. '/F01'
    id = update.message.text
    print(id[1:])  # [1:] gets rid of the / in the command
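To actually reply with only the requested record, the command text can feed a parameterized query. A sketch assuming the table and column names from the question (MySQLdb's %s placeholders handle the escaping):

def individualreq(bot, update, args):
    # The command itself, without the leading /, is the requested ID
    requested_id = update.message.text[1:]
    db = MySQLdb.connect(host="localhost", user="root", passwd="pwd", db="mydb")
    cur = db.cursor()
    # Parameterized lookup of just the requested ID (column names assumed)
    cur.execute("SELECT ID, temp FROM table2 WHERE ID = %s", (requested_id,))
    row = cur.fetchone()
    if row:
        update.message.reply_text("ID {}: {}".format(row[0], row[1]))
    else:
        update.message.reply_text("No data found for {}".format(requested_id))
    db.close()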
You can outright make individualreq a closure.
CommandHandler takes a command or a list of commands to listen to, plus a number of other options.
There is a pass_user_data option that allows for user data to be passed to the callback.
dp.add_handler(CommandHandler(currr, individualreq, pass_user_data=True))
The signature of the individualreq callback is then updated to take the user_data:
def individualreq(bot, update, user_data=None):
    # user_data is a dict
    print(user_data)
I am trying to retrieve the text, longitude, and latitude from a tweepy StreamListener and then store the data in my SQL database. I am able to store the coordinates fine, but for some reason the unicode tweet text is not being stored.
For the SQL I have:
mysql> CREATE TABLE tweets (tweet nvarchar(140), lat float(10,6) not null, lng float(10,6) not null) engine=myisam;
For my python script I have (not including the main()):
import mysql.connector
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
from authenticator import Authenticator
import json

# connecting to mysql database
conn = mysql.connector.connect(user='root', password='arlenyu123', host='localhost', database='twitter')
mycursor = conn.cursor()

class MyStreamListener(StreamListener):
    def on_status(self, status):
        if status.coordinates is not None:
            # for some reason the status text isn't being fed into the database properly... not sure why.
            mycursor.execute('INSERT INTO tweets (tweet, lat, lng) VALUES ({},{},{})'.
                             format(status.text, status.coordinates['coordinates'][0], status.coordinates['coordinates'][1]))
        return True

    def on_error(self, status_code):
        if status_code == 403:
            print("The request is understood, but it has been refused or access is not allowed. Limit is maybe reached")
            return False
Please note that I am a beginner so any advice is appreciated.
You should never use string interpolation to build SQL commands: the unquoted, unescaped tweet text produces an invalid statement (which is why the text isn't being stored), and it also exposes you to SQL injection. Use the parameter substitution provided by the SQL connector:
mycursor.execute('INSERT INTO tweets (tweet, lat, lng) VALUES (%s,%s,%s)',
                 (status.text, status.coordinates['coordinates'][0], status.coordinates['coordinates'][1]))
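One more thing worth checking, since the snippet never commits: mysql.connector runs with autocommit disabled by default, so the inserts won't persist until the connection commits (a one-line addition, assuming the conn object from your script):

conn.commit()  # autocommit is off by default in mysql.connector; without this the inserts are discarded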
I've been working on trying to populate a table in a PostgreSQL database using Tweepy and Twitter's Streaming API. I'm extremely close; I believe I'm just one line away from getting it. I've looked at many examples, including:
http://andrewbrobinson.com/2011/07/15/using-tweepy-to-access-the-twitter-stream/
http://blog.creapptives.com/post/14062057061/the-key-value-store-everyone-ignored-postgresql
Python tweepy writing to sqlite3 db
tweepy stream to sqlite database - invalid synatx
Using tweepy to access Twitter's Streaming API
etc, etc
I'm at the point where I can stream tweets quite easily using Tweepy, so I know my consumer key, consumer secret, access key and access secret are correct. I also have Postgres set up, and am successfully connecting to the database I created. I tested inserting hard-coded values into the table in my database using psycopg2 from a .py file, and that is also working. I am getting tweets streamed in based on keywords I select, and am successfully connected to a table in a database. Now I just need the tweets to stream into the table in my Postgres database. Like I said, I am so close and any help would be greatly appreciated.
This stripped down script inserts data into my desired table:
import psycopg2

try:
    conn = psycopg2.connect("dbname=teststreamtweets user=postgres password=x host=localhost")
    print "connected"
except:
    print "unable to connect"

namedict = (
    {"first_name": "Joshua", "last_name": "Drake"},
    {"first_name": "Steven", "last_name": "Foo"},
    {"first_name": "David", "last_name": "Bar"}
)

cur = conn.cursor()
cur.executemany("""INSERT INTO testdata(first_name, last_name) VALUES (%(first_name)s, %(last_name)s)""", namedict)
conn.commit()
Below is the script I have been editing for a while now, trying to get it to work:
import psycopg2
import time
import json
from getpass import getpass
import tweepy

consumer_key = 'x'
consumer_secret = 'x'
access_key = 'x'
access_secret = 'x'

connection = psycopg2.connect("dbname=teststreamtweets user=postgres password=x host=localhost")
cursor = connection.cursor()

# always use this step to begin clean
def reset_cursor():
    cursor = connection.cursor()

class StreamWatcherListener(tweepy.StreamListener):
    def on_data(self, data):
        try:
            print 'before cursor' + data
            connection = psycopg2.connect("dbname=teststreamtweets user=postgres password=x host=localhost")
            cur = connection.cursor()
            print 'status is: ' + str(connection.status)
            #cur.execute("INSERT INTO tweet_list VALUES (%s)" % (data.text))
            cur.executemany("""INSERT INTO tweets(tweet) VALUES (%(text)s)""", data)
            connection.commit()
            print '---------'
            print type(data)
            #print data
        except Exception as e:
            connection.rollback()
            reset_cursor()
            print "not saving"
            return
        if cursor.lastrowid == None:
            print "Unable to save"

    def on_error(self, status_code):
        print 'Error code = %s' % status_code
        return True

    def on_timeout(self):
        print 'timed out.....'

print 'welcome'
auth1 = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth1.set_access_token(access_key, access_secret)
api = tweepy.API(auth1)
l = StreamWatcherListener()
print 'about to stream'
stream = tweepy.Stream(auth=auth1, listener=l)
setTerms = ['microsoft']
#stream.sample()
stream.filter(track=setTerms)
Sorry if the code is a bit messy; I have been trying many options. Like I said, any suggestions, links to helpful examples, etc. would be greatly appreciated, as I've tried everything I can think of and am now resorting to a long walk. Thanks a ton.
Well, I'm not sure why you are using classes for this, and then why you don't have __init__ defined in your class. Seems complicated.
Here is a basic version of the functions I use to do this stuff. I've only ever used sqlite for it, but the syntax looks basically the same. Maybe you can get something from this.
def retrieve_tweets(numtweets=10, *args):
    """
    This function optionally takes one or more arguments as keywords to filter tweets.
    It iterates through tweets from the stream that meet the given criteria and sends them
    to the database population function on a per-instance basis, so as to avoid disaster
    if the stream is disconnected.

    Both SampleStream and FilterStream methods access Twitter's stream of status elements.
    """
    filters = []
    for key in args:
        filters.append(str(key))
    if len(filters) == 0:
        stream = tweetstream.SampleStream(username, password)
    else:
        stream = tweetstream.FilterStream(username, password, track=filters)
    try:
        count = 0
        while count < numtweets:
            for tweet in stream:
                # a check is needed on text as some "tweets" are actually just API operations
                # the language selection doesn't really work but it's better than nothing(?)
                if tweet.get('text') and tweet['user']['lang'] == 'en':
                    if tweet['retweet_count'] == 0:
                        # bundle up the features I want and send them to the db population function
                        bundle = (tweet['id'], tweet['user']['screen_name'], tweet['retweet_count'], tweet['text'])
                        db_initpop(bundle)
                        break
                    else:
                        # a RT has a different structure. This bundles the original tweet. Getting the
                        # retweets comes later, after the stream is de-accessed.
                        bundle = (tweet['retweeted_status']['id'], tweet['retweeted_status']['user']['screen_name'], \
                                  tweet['retweet_count'], tweet['retweeted_status']['text'])
                        db_initpop(bundle)
                        break
            count += 1
    except tweetstream.ConnectionError, e:
        print 'Disconnected from Twitter at '+time.strftime("%d %b %Y %H:%M:%S", time.localtime()) \
            +'. Reason: ', e.reason

def db_initpop(bundle):
    """
    This function places basic tweet features in the database. Note the placeholder values:
    these can act as a check to verify that no further expansion was available for that method.
    """
    # unpack the bundle
    tweet_id, user_sn, retweet_count, tweet_text = bundle
    curs.execute("""INSERT INTO tblTweets VALUES (null,?,?,?,?,?,?)""", \
                 (tweet_id, user_sn, retweet_count, tweet_text, 'cleaned text', 'cleaned retweet text'))
    conn.commit()
    print 'Database populated with tweet '+str(tweet_id)+' at '+time.strftime("%d %b %Y %H:%M:%S", time.localtime())
Good luck!