When a single hash is found in the tweet URL, the script inserts the values into the MySQL DB properly. When two or more hashes are found in the tweet URLs, the records are inserted twice in the MySQL DB.
For example, if a tweet has 2 URLs with hashes mentioned, 4 records are created in the MySQL DB.
DB State:
"https://www.virustotal.com/en/file/2819e520dea611c4dd1c3b1fd54adbd0c50963ff75d67cc7facbe2090574afc0/analysis/","2017-09-20 01:00:35","2819e520dea611c4dd1c3b1fd54adbd0c50963ff75d67cc7facbe2090574afc0"
"https://www.virustotal.com/en/file/8084880e875b4dc97ccd9f97249d4c7184f6be092679d2b272ece2890306ca89/analysis/","2017-09-20 01:03:35","8084880e875b4dc97ccd9f97249d4c7184f6be092679d2b272ece2890306ca89"
"https://www.virustotal.com/en/file/b5034183d4d2aca1e586b4a4bf22f32e4204c4b6d288c171d5252636c11248a0/analysis/","2017-09-20 01:03:35","8084880e875b4dc97ccd9f97249d4c7184f6be092679d2b272ece2890306ca89"
"https://www.virustotal.com/en/file/8084880e875b4dc97ccd9f97249d4c7184f6be092679d2b272ece2890306ca89/analysis/","2017-09-20 01:03:35","b5034183d4d2aca1e586b4a4bf22f32e4204c4b6d288c171d5252636c11248a0"
"https://www.virustotal.com/en/file/b5034183d4d2aca1e586b4a4bf22f32e4204c4b6d288c171d5252636c11248a0/analysis/","2017-09-20 01:03:35","b5034183d4d2aca1e586b4a4bf22f32e4204c4b6d288c171d5252636c11248a0"
Any suggestions on how to insert only single entries into the DB?
#! /usr/bin/python
from __future__ import print_function
import tweepy
import json
import MySQLdb
import time
import json, urllib, urllib2, argparse, hashlib, re, sys
from dateutil import parser

WORDS = ['virustotal']

CONSUMER_KEY = "XXXX"
CONSUMER_SECRET = "YYY"
ACCESS_TOKEN = "AAAA"
ACCESS_TOKEN_SECRET = "DDDDD"

HOST = "192.168.150.1"
USER = "admin"
PASSWD = "admin"
DATABASE = "twitter"

def store_data(values, insert_time, insert_hash):
    db = MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, db=DATABASE, charset="utf8")
    cursor = db.cursor()
    data = []
    #print(hashes)
    for value in values:
        data.append((value, insert_time, insert_hash))
    cursor.executemany("""INSERT INTO tweet_url VALUES (%s,%s,%s)""", data)
    db.commit()
    cursor.close()
    db.close()
    return

class StreamListener(tweepy.StreamListener):
    def on_connect(self):
        print("We are now connected to the streaming API.")

    def on_error(self, status_code):
        print('An Error has occured: ' + repr(status_code))
        return False

    def on_data(self, data):
        try:
            datajson = json.loads(data)
            web_url = datajson['entities']['urls']
            #print(web_url)
            urls = []
            for i in web_url:
                urls.append((i['expanded_url']))
            values = [list([item]) for item in urls]
            list_url = ','.join([str(i) for i in values])
            extract_url = str(list_url)
            formatted_url = ''.join(extract_url)
            sha256_hash = re.findall(r"([a-fA-F\d]{64})", formatted_url)
            hashes = ''.join(sha256_hash)
            insert_time = time.strftime('%Y-%m-%d %H:%M:%S')
            hash_list = re.findall(r"([a-fA-F\d]{64})", hashes)
            for insert_hash in hash_list:
                store_data(values, insert_time, insert_hash)
            print(store_data)
            print(hashes)
            print(type(hashes))
        except Exception as e:
            print(e)

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
listener = StreamListener(api=tweepy.API(wait_on_rate_limit=True))
streamer = tweepy.Stream(auth=auth, listener=listener)
print("Tracking: " + str(WORDS))
streamer.filter(track=WORDS)
You have a first loop:
for insert_hash in hash_list:
    store_data(values, insert_time, insert_hash)
And then you loop again over the values to build the data list of tuples:
for value in values:
    data.append((value, insert_time, insert_hash))
So every value is inserted once for each hash found, which is why the rows are duplicated. Maybe you could use zip() or enumerate() to pair hash_list with the values before calling store_data?
data = []
if len(values) == len(hash_list):
    for val, hash in zip(values, hash_list):
        data.append((val, insert_time, hash))
    store_data(data)
And then there is no need to loop again inside store_data(); just change the signature to pass the data list directly:
def store_data(data_list):
    # connection to database
    cursor.executemany("""INSERT INTO tweet_url VALUES (%s,%s,%s)""", data_list)
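Put together, on_data would then build the (url, time, hash) tuples once and hand the whole list to the new store_data(data_list). The sketch below is a minimal illustration of that idea, not the original code; it extracts the hash from each URL directly instead of zipping two parallel lists, which keeps every URL paired with its own hash:

def on_data(self, data):
    try:
        datajson = json.loads(data)
        urls = [u['expanded_url'] for u in datajson['entities']['urls']]
        insert_time = time.strftime('%Y-%m-%d %H:%M:%S')

        rows = []
        for url in urls:
            match = re.search(r"([a-fA-F\d]{64})", url)
            if match:
                # exactly one (url, time, hash) tuple per URL
                rows.append((url, insert_time, match.group(1)))

        if rows:
            store_data(rows)  # store_data(data_list) as redefined above
    except Exception as e:
        print(e)

If the same report can appear in several tweets, a UNIQUE key on the hash column (or INSERT IGNORE) would also stop duplicates at the database level.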
In Code Sample #1 I am running a successful query that returns the viewer percentage of different age groups from a user's YouTube account. My ultimate goal is to run multiple queries at the same time and return the results for all of them. Code Sample #2 is my attempt at this, but Google then forces two OAuth login pages instead of reusing the credentials from a single login for both queries. What is the best way to accomplish this?
Code Sample #1
import os
import json
import pandas as pd
from pandas import json_normalize
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
from collections import OrderedDict
from tabulate import tabulate
import mysql.connector
from mysql.connector import errorcode

creator_ID = input("Creator ID: ")

try:
    db2 = mysql.connector.connect(user = 'admin',
                                  password = 'abcdefg',
                                  host = 'database',
                                  database = 'app')
    if db2.is_connected():
        db_Info = db2.get_server_info()
        print("Connected to MySQL Server version ", db_Info)
        cursor = db2.cursor()
        cursor.execute("select database();")
        record = cursor.fetchone()
        print("You're connected to database: ", record)
except mysql.connector.Error as err:
    if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
        print("Something is wrong with your user name or password")
    elif err.errno == errorcode.ER_BAD_DB_ERROR:
        print("Database does not exist")
    else:
        print(err)
else:
    print("Successfully connect to database!")
    cursor = db2.cursor()

try:
    cursor.execute("""__""", list(creator_ID))
    ig_long_lived_token = cursor.fetchall()[0]
    print("Successfully get long lived token!")
except mysql.connector.IntegrityError as err:
    print("Error: {}".format(err))

SCOPES = ['https://www.googleapis.com/auth/yt-analytics.readonly']
API_SERVICE_NAME = 'youtubeAnalytics'
API_VERSION = 'v2'
CLIENT_SECRETS_FILE = 'client_secrets3.json'

def get_service():
    flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES)
    credentials = flow.run_local_server(port=8080, prompt="consent", authorization_prompt_message="")
    return build(API_SERVICE_NAME, API_VERSION, credentials=credentials)

def execute_api_request(client_library_function, **kwargs):
    response = client_library_function(
        **kwargs
    ).execute()
    print(response)
    return response

def create_table(table, headers=None):
    if headers:
        headerstring = "\t{}\t" * len(headers)
        print(headerstring.format(*headers))
    rowstring = "\t{}\t" * len(table[0])
    for row in table:
        print(rowstring.format(*row))

if __name__ == '__main__':
    youtubeAnalytics = get_service()
    result = execute_api_request(
        youtubeAnalytics.reports().query,
        ids='channel==MINE',
        startDate='2005-05-01',
        endDate='2023-01-01',
        dimensions='ageGroup',
        metrics='viewerPercentage'
    )
    headers = ['viewerPercentage']
    print(tabulate(result['rows'], headers=headers, tablefmt='pretty'))
-----------------------------------------------------------------------------------------
Code Sample #2
def ageGroup_fetch():
    youtubeAnalytics = get_service()
    result = execute_api_request(
        youtubeAnalytics.reports().query,
        ids='channel==MINE',
        startDate='2005-05-01',
        endDate='2023-01-01',
        dimensions='ageGroup',
        metrics='viewerPercentage'
    )

def gender_fetch():
    youtubeAnalytics = get_service()
    result = execute_api_request(
        youtubeAnalytics.reports().query,
        ids='channel==MINE',
        startDate='2005-05-01',
        endDate='2023-01-01',
        dimensions='gender',
        metrics='viewerPercentage'
    )

if __name__ == '__main__':
    ageGroup_fetch()
    gender_fetch()
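The second consent screen in Code Sample #2 comes from calling get_service() once per fetch, so each call runs a fresh InstalledAppFlow. A minimal sketch of one way around it, assuming the same yt-analytics.readonly scope covers both reports and reusing get_service() and execute_api_request() from Code Sample #1, is to authorize once and pass the built service into both functions:

def ageGroup_fetch(youtubeAnalytics):
    return execute_api_request(
        youtubeAnalytics.reports().query,
        ids='channel==MINE',
        startDate='2005-05-01',
        endDate='2023-01-01',
        dimensions='ageGroup',
        metrics='viewerPercentage'
    )

def gender_fetch(youtubeAnalytics):
    return execute_api_request(
        youtubeAnalytics.reports().query,
        ids='channel==MINE',
        startDate='2005-05-01',
        endDate='2023-01-01',
        dimensions='gender',
        metrics='viewerPercentage'
    )

if __name__ == '__main__':
    youtubeAnalytics = get_service()  # single OAuth prompt for the whole run
    age_result = ageGroup_fetch(youtubeAnalytics)
    gender_result = gender_fetch(youtubeAnalytics)

Persisting the credentials to a token file would avoid the prompt on later runs as well, but reusing one service object is enough to keep a single login covering several queries in the same run.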
I have a Python AWS Lambda function that takes JSON records, checks that they have the required keys, and then inserts them into a MySQL DB (AWS RDS Aurora). The function gets invoked whenever a new record comes into the stream (the handler function below).
At the moment Lambda is reporting some errors, but when I look at the CloudWatch logs I don't see any, which leads me to believe that maybe I'm not handling or catching the exception. Can anyone tell me where the issue might be?
from __future__ import print_function
import base64
import json
import pymysql

RDS_HOST = 'host'
DB_USER = 'dummy_user'
DB_PASSWORD = 'password1234'
DB_NAME = 'crazy_name'
DB_TABLE = 'wow_table'

class MYSQL(object):
    '''
    This a wrapper Class for PyMySQL
    '''
    CONNECTION_TIMEOUT = 30

    def __init__(self, host, user, password, database, table):
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.table = table
        self.connection = self.connect()

    def connect(self):
        '''
        Connects to MySQL instance
        '''
        try:
            connection = pymysql.connect(
                host=self.host,
                user=self.user,
                password=self.password,
                db=self.database,
                connect_timeout=self.CONNECTION_TIMEOUT
            )
            return connection
        except Exception as ex:
            print(ex)
            print("ERROR: Unexpected error: Could not connect to AuroraDB instance")

    def execute(self, account_id, external_ref_id, timestamp):
        '''
        Executes command given a MySQL connection
        '''
        with self.connection.cursor() as cursor:
            sql = ('INSERT INTO ' +
                   self.database +
                   '.' +
                   self.table +
                   '(`account_id`, `external_reference_id`, `registration`, `c_name`, `c_id`, `create_date`)' +
                   ' VALUES (%s, %s, DATE_FORMAT(STR_TO_DATE(%s,"%%Y-%%M-%%d %%H:%%i:%%s"),"%%Y-%%m-%%d %%H:%%i:%%s"), %s, %s, current_timestamp())' +
                   ' ON DUPLICATE KEY UPDATE create_date = VALUES(create_date)')
            cursor.execute(sql, (
                account_id,
                external_ref_id,
                timestamp,
                'bingo',
                300)
            )
        self.connection.commit()

    def close_connection(self):
        '''
        Closes connection to MySQL
        '''
        self.connection.close()

def get_data_from_kinesis_object(obj):
    '''
    Retrieves data from kinesis event
    '''
    return obj['kinesis']['data']

def decode_data(data):
    '''
    Decodes record via base64
    '''
    return base64.b64decode(data)

def split_records_into_record(records):
    '''
    Splits a record of records into an array of records
    '''
    return records.split('\n')

def parse_record(record):
    '''
    parses record into JSON
    '''
    if record:
        return json.loads(record)

def is_record_valid(record):
    '''
    Check for keys in event
    returns True if they all exist
    and False if they dont all exist
    '''
    return all(key in record for key in (
        'eventName',
        'sourceType',
        'AccountId',
        'Timestamp',
        'ExternalReferenceId'
    ))

def handler(event, context):
    """
    This function inserts data into Aurora RDS instance
    """
    mysql = MYSQL(RDS_HOST, DB_USER, DB_PASSWORD, DB_NAME, DB_TABLE)
    for obj in event['Records']:
        records = decode_data(get_data_from_kinesis_object(obj))
        split_records = split_records_into_record(records)
        for record in split_records:
            parsed_record = parse_record(record)
            if is_record_valid(parsed_record):
                mysql.execute(
                    parsed_record['AccountId'],
                    parsed_record['ExternalReferenceId'],
                    str(parsed_record['Timestamp'])
                )
    mysql.close_connection()
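One way to check whether an exception is being raised but not surfaced is to wrap the handler body, log the full traceback, and re-raise (a hedged sketch on top of the code above; the logger setup and the re-raise are additions, not part of the original):

import logging
import traceback

logger = logging.getLogger()
logger.setLevel(logging.INFO)

def handler(event, context):
    """
    Same flow as above, but any failure is logged with a full traceback
    before being re-raised, so it shows up in CloudWatch and the
    invocation is still marked as errored.
    """
    mysql = MYSQL(RDS_HOST, DB_USER, DB_PASSWORD, DB_NAME, DB_TABLE)
    try:
        for obj in event['Records']:
            records = decode_data(get_data_from_kinesis_object(obj))
            for record in split_records_into_record(records):
                parsed_record = parse_record(record)
                # parse_record() returns None for blank lines, so guard it
                if parsed_record and is_record_valid(parsed_record):
                    mysql.execute(
                        parsed_record['AccountId'],
                        parsed_record['ExternalReferenceId'],
                        str(parsed_record['Timestamp'])
                    )
    except Exception:
        logger.error("handler failed: %s", traceback.format_exc())
        raise
    finally:
        mysql.close_connection()

Two places in the original that can hide the real cause: connect() prints and swallows connection failures, so self.connection ends up as None and the later error is a less obvious AttributeError, and parse_record() can return None for empty lines, which makes is_record_valid() raise a TypeError.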
Mind that I am new to Flask, and to Python for that matter; I appreciate any help. I'm looking to access one of the fields of my JSON response (just the field, not the entire response). How should I go about parsing the response? An image of the response is attached below, thanks.
This is my main thread
from flask import Flask, render_template, request
from Qhandler import Qhandler
from MakePlayer import MakePlayer

app = Flask(__name__)

@app.route('/createplayer', methods=['GET', 'POST'])
def showCreatePlayer():
    if request.method == 'POST':
        MakePlayer(request.form['playername'], request.form['playerteam'], request.form['playerrole'], request.form['playerpos'])
        return "created player: <br>" + request.form['playername'] + " " + request.form['playerteam'] + " " + request.form['playerrole'] + " " + request.form['playerpos']
    return render_template("createPlayer.html")

@app.route('/sucess')
def success():
    return "success"

@app.route('/showplayers')
def showPlayers():
    Q = Qhandler()
    return Q.displayQuery(""" select * from Player""")

if __name__ == '__main__':
    app.run(debug=True)
This is my query handler
from flask import Flask, jsonify, json
from flaskext.mysql import MySQL

class Qhandler(object):
    #global mysql
    global cursor
    global connection
    global mysql

    # database connection
    app = Flask(__name__)
    mysql = MySQL()
    app.config['MYSQL_DATABASE_USER'] = 'root'
    app.config['MYSQL_DATABASE_PASSWORD'] = 'root'
    app.config['MYSQL_DATABASE_DB'] = 'Optimizer'
    app.config['MYSQL_DATABASE_HOST'] = 'localhost'
    mysql.init_app(app)

    def ins(self, query):
        try:
            connection = mysql.connect()
            cursor = connection.cursor()
            cursor.execute(query)
            connection.commit()
        except:
            print "error running query"
        finally:
            #cursor.close()
            connection.close()

    def displayQuery(self, query):
        try:
            connection = mysql.connect()
            cursor = connection.cursor()
            cursor.execute(query)
            fetchedData = cursor.fetchall()
            fetchedData = jsonify(fetchedData)
            #fetchedData = json.dumps(fetchedData)
            #record = json.loads(fetchedData)
            #print "the resonse is here:"
            return fetchedData
        except:
            print "error running query"
        finally:
            #cursor.close()
            connection.close()
The current response is shown in the attached screenshot of results.
Use "fetchedData = json.dumps(fetchedData)" instead of "fetchedData = jsonify(fetchedData)", then create a JSON decoder and parse the response; refer to the example below:
def displayQuery(self, query):
    try:
        connection = mysql.connect()
        cursor = connection.cursor()
        cursor.execute(query)
        fetchedData = cursor.fetchall()
        fetchedData = json.dumps(fetchedData)
        # create a json decoder
        d = json.JSONDecoder()
        fieldPlayerName = d.decode(fetchedData)
        # parse the json that is returned (fieldPlayerName[0][1])
        print "should print the field with the player name", fieldPlayerName[0][1]
        return fieldPlayerName[0][1]
    except:
        print "error running query"
    finally:
        connection.close()
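As a side note (my suggestion, not part of the original answer): cursor.fetchall() already returns the rows as tuples, so if only that one value is needed, the field can be read directly without the dumps/decode round trip. getPlayerName below is a hypothetical helper, not an existing method:

def getPlayerName(self, query):
    try:
        connection = mysql.connect()
        cursor = connection.cursor()
        cursor.execute(query)
        rows = cursor.fetchall()  # tuple of row tuples
        # second column of the first row, e.g. the player name
        return rows[0][1] if rows else None
    finally:
        connection.close()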
I am super new to Python, so forgive my lack of knowledge, haha, but for some reason I cannot get Python to insert rows into my database. Here is what I have:
import sys, arcpy, datetime, tweepy

consumer_key = " "
consumer_secret = " "
access_token = " "
access_token_secret = " "

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

table = r"C:\....dbf"
rows = arcpy.InsertCursor(table)

class CustomStreamListener(tweepy.StreamListener):
    def on_status(self, status):
        try:
            user = status.user.screen_name
            tweet = status.text
            coord_x = status.coordinates['coordinates'][0]
            coord_y = status.coordinates['coordinates'][1]
            date_utc = status.created_at
            h_m_s_utc = (str(status.created_at.hour)) + ':' + (str(status.created_at.minute)) + ':' + (str(status.created_at.second))
            date_est = datetime.datetime.now()
            h_m_s_est = (str(date_est.hour)) + ':' + (str(date_est.minute)) + ':' + (str(date_est.second))
            row.user_name = user
            row.tweet = tweet
            row.coord_x = coord_x
            row.coord_y = coord_y
            row.date_utc = date_utc
            row.h_m_s_utc = h_m_s_utc
            row.date_est = date_est
            rows.insertRow(row)
            del row, rows
            insert_table = r"C:\....dbf"
            insert_row(insert_table)
            print user
            print tweet
        except:
            # If there are no coordinates for a tweet, then pass
            pass

    def on_error(self, status_code):
        print >> sys.stderr, 'Encountered error with status code:', status_code
        return True  # Don't kill the stream

    def on_timeout(self):
        print >> sys.stderr, 'Timeout...'
        return True  # Don't kill the stream

# ----------------Script execution----------------
listener = tweepy.streaming.Stream(auth, CustomStreamListener())
listener.filter(track=[' love ', '#love'])
I am pretty sure it has something to do with the row.rowID thing.
Sorry if it is a disaster! Any help is much appreciated!
It looks like you're forgetting to use the data access (.da) module for the insert cursor.
with arcpy.da.InsertCursor(in_table, field_names) as inCursor:
    for row in rows:
        inCursor.insertRow(row)  # example
-or-
inCursor = arcpy.da.InsertCursor(in_table, field_names)
for row in rows:
    inCursor.insertRow(row)  # example
del inCursor  # make sure to delete the cursor if you do it this way, so as to avoid a data lock
Also, if you just want the Insert Cursor method, you can
from arcpy import da
For more info, check out:
http://resources.arcgis.com/en/help/main/10.2/index.html#//018w0000000t000000
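Applied to the tweet listener in the question, a minimal sketch could look like the following. The field names are taken from the row attributes in the question and are assumed to exist in the .dbf table, and the cursor is opened per tweet only to keep the example short:

class CustomStreamListener(tweepy.StreamListener):
    def on_status(self, status):
        try:
            fields = ['user_name', 'tweet', 'coord_x', 'coord_y', 'date_utc']
            values = (
                status.user.screen_name,
                status.text,
                status.coordinates['coordinates'][0],
                status.coordinates['coordinates'][1],
                status.created_at,
            )
            # the with-block closes the cursor and releases the data lock automatically
            with arcpy.da.InsertCursor(table, fields) as inCursor:
                inCursor.insertRow(values)
        except Exception:
            # tweets without coordinates are simply skipped
            pass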
I am using the following Python code to get tweets for a particular topic.
import sys
from tweepy import *
import time
import csv

CONSUMER_KEY = ''
CONSUMER_SECRET = ''
OAUTH_TOKEN = ''
OAUTH_TOKEN_SECRET = ''

class listener(StreamListener):
    def on_data(self, data):
        try:
            saveFile = open('tweetDB2.csv', 'a')
            saveFile.write(data)
            saveFile.write('\n')
            saveFile.close()
            return True
        except BaseException as e:
            print('failed ondata,', str(e))
            time.sleep(60)

    def on_error(self, status):
        print(status)

auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
twitterStream = Stream(auth, listener())
twitterStream.filter(track=["IPL"])
How do I modify the code to get tweets on the same topic but from a different time period (say the 2nd week of April 2015)? I went through the streaming API request parameters (https://dev.twitter.com/streaming/overview/request-parameters), but I could not find anything related to a time period. Thanks!
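For context, the streaming endpoint only delivers tweets as they happen, which is why there is no time-period parameter. Older tweets would normally come from the REST search endpoint instead. Below is a rough sketch using the older tweepy 3.x API; note that the standard search index only reaches back about a week, so a specific week in April 2015 would need Twitter's full-archive (premium/enterprise) access rather than this call:

import csv
import tweepy

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True)

with open('tweetDB2.csv', 'a') as saveFile:
    writer = csv.writer(saveFile)
    # 'until' bounds the end date; a 'since:' operator can go in the query itself
    for tweet in tweepy.Cursor(api.search,
                               q='IPL since:2015-04-08',
                               until='2015-04-15',
                               count=100).items(500):
        writer.writerow([tweet.created_at, tweet.user.screen_name, tweet.text])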