Not able to update MySQL db with Python

Issue: The script runs without any errors, but the MySQL database never gets updated with the script's results.
I have added db.autocommit(True) so that every statement is committed, but it still fails.
Env: Python 2.7, MySQL
I have also tried calling db.commit() manually after each execute statement (while holding the lock), but that fails as well.
'''
Specifications:
A multi-threaded web spider that:
Takes a website and the spidering depth as input
Downloads the HTML files only
Inserts the HTML into a MySQL database
Also parses the forms on each page and inserts the form details into the db
'''
import os
import sys
import threading

import mechanize
import MySQLdb

lock = threading.Lock()

def Parse_Forms(target, curr, br):
    lock.acquire()
    br.open(target)
    curr.execute("use web;")
    response = []
    for forms in br.forms():
        i = 0
        action = forms.action
        method = forms.method
        d = dict()
        d['method'] = method
        d['name'] = action
        br.select_form(nr=i)
        for control in forms.controls:
            if control.value == '':
                d[control.name] = "NULL"
            elif type(control.value) is list:
                d[control.name] = control.value[0]
            else:
                d[control.name] = control.value
        for j in d:
            if str(j) == 'login' or str(j) == 'name' or str(j) == 'password' or str(j) == 'method':  # These are the only valid names that have to be inserted into the MySQL db
                query = "INSERT INTO `forms` (" + str(j) + ") values (\"" + str(d[j]) + "\");"
                curr.execute(query)
                print "Query Executed!"
        i = i + 1
        response.append(br.submit())
    lock.release()

def getHTMLfiles(target, curr):
    br = mechanize.Browser()
    headers = [('User-Agent', 'Firefoxy'), ]
    br.addheaders = headers
    br.open(target)
    for i in range(0, depth):
        for link in br.links():
            if ".hmtl" in link.url:
                print "Downloading File: " + link.url
                os.system("wget " + link.url + " -P Files/")
                curr.execute("INSERT INTO `pages` (name) values (" + "\"link.url\");")
            if link.url[0] == '/' and not '.' in link.url:  # Indicates that the file belongs to the server (not some external link) and is a directory
                Parse_Forms(target + link.url, curr, br)

if __name__ == "__main__":
    db = MySQLdb.connect(host="localhost", user="****", password="*****", db="web")
    # There are 2 tables: one called pages that saves the HTML file URL and one called forms that saves the form parameters
    db.autocommit(True)
    curr = db.cursor()
    target = sys.argv[1]
    depth = int(sys.argv[2])
    threads = []
    for workers in range(10):
        t = threading.Thread(target=getHTMLfiles, args=(target, curr,))
        t.daemon = True
        t.start()
        threads.append(t)
    for thread in threads:
        thread.join()
The script appears to run fine, but it never updates the MySQL database.
Note: There are no MySQL errors (I mean literally no errors), so everything seems to run fine.
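For reference, below is a minimal sketch of the commit pattern under discussion: each worker uses its own connection and cursor, inserts with a parameterized query, and commits explicitly. The table and column names are taken from the question; the per-thread connection and the insert_page helper are assumptions for illustration, not part of the original script.

# Minimal sketch, not the original script: one connection per worker thread,
# a parameterized INSERT, and an explicit commit after each statement.
import threading
import MySQLdb

def insert_page(url):
    # Assumption: each thread opens its own connection instead of sharing
    # one cursor across threads (MySQLdb connections are not thread-safe).
    db = MySQLdb.connect(host="localhost", user="****", passwd="****", db="web")
    try:
        curr = db.cursor()
        curr.execute("INSERT INTO `pages` (name) VALUES (%s)", (url,))
        db.commit()  # explicit commit; db.autocommit(True) right after connect also works
    finally:
        db.close()

t = threading.Thread(target=insert_page, args=("http://example.com/index.html",))
t.start()
t.join()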

Related

SQLAlchemy / Celery: SQLAlchemy error: is already attached to session '12' (this is '5')

I'm trying to aggregate news RSS feeds with Celery, but I'm getting this error from SQLAlchemy:
SQLAlchemy error: is already attached to session '12' (this is '5')
I'm running 3 Celery workers like this:
celery -A main.celery worker --loglevel=INFO -P gevent --concurrency=50 -n celeryworker1#%%h
From reading the documentation, this is how I think you need to set up SQLAlchemy for multithreading with a scoped session.
Database settings:
engine = create_engine(SQLALCHEMY_DATABASE_URL,pool_size=200, max_overflow=250)
SessionLocal = sessionmaker(bind=engine,autocommit=False, autoflush=False)
Session_t = scoped_session(SessionLocal)
Session settings:
from contextlib import contextmanager

@contextmanager
def session_scope():
    session = Session_t()
    try:
        yield session
    finally:
        Session_t.remove()
The problem occurs when I create a News_Article_Tag object: some news articles have the same tags and are created at the same time because of the multithreading and concurrency.
for tag in article["tags"]:
    tag_query = db.query(models.News_Article_Tag).filter_by(country=newspaper.country, language=newspaper.language, tag=tag).first()
    if not tag_query:
        new_tag = models.News_Article_Tag(
            country=newspaper.country,
            language=newspaper.language,
            tag=tag,
            popularity=1
        )
        new_article_tags.append(new_tag)
        article_tags.append(new_tag)
    else:
        tag_query.popularity = tag_query.popularity + 1
        article_tags.append(tag_query)
new_article.tags.extend(article_tags)
db.add(new_article)
db.add_all(new_article_tags)
db.commit()
I tried to solve it by checking if the object is already in the session, like this:
for tag in article["tags"]:
    tag_query = db.query(models.News_Article_Tag).filter_by(country=newspaper.country, language=newspaper.language, tag=tag).first()
    if not tag_query:
        new_tag = models.News_Article_Tag(
            country=newspaper.country,
            language=newspaper.language,
            tag=tag,
            popularity=1
        )
        article_tags.append(new_tag)
        if new_tag not in session:
            session.add(new_tag)
            session.commit()
    else:
        tag_query.popularity = tag_query.popularity + 1
        article_tags.append(tag_query)
new_article.tags.extend(article_tags)
session.add(new_article)
session.commit()
But it still does not work; I think it only checks the current session, not the sessions in other threads.
Does someone have an idea how to solve it?
I don't know if this (ScopedRegistry) is what I'm looking for:
https://docs.sqlalchemy.org/en/14/orm/contextual.html#sqlalchemy.util.ScopedRegistry
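A common way to handle this kind of concurrent get-or-create race (a sketch only, not from the original post) is to rely on a database unique constraint and retry on IntegrityError. This assumes News_Article_Tag has a UNIQUE constraint on (country, language, tag), which the post does not show, and it omits the popularity increment for brevity:

# Sketch only: assumes a UNIQUE constraint on (country, language, tag) so that
# concurrent inserts of the same tag conflict at the database level.
from sqlalchemy.exc import IntegrityError

def get_or_create_tag(session, country, language, tag):
    existing = session.query(models.News_Article_Tag).filter_by(
        country=country, language=language, tag=tag).first()
    if existing:
        return existing
    new_tag = models.News_Article_Tag(
        country=country, language=language, tag=tag, popularity=1)
    session.add(new_tag)
    try:
        session.commit()
        return new_tag
    except IntegrityError:
        # Another worker inserted the same tag concurrently; reuse its row.
        session.rollback()
        return session.query(models.News_Article_Tag).filter_by(
            country=country, language=language, tag=tag).one()

The database constraint is the only check shared across workers; an in-memory "if new_tag not in session" test can never see what other sessions or processes are doing.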

How to query asynchronous Postgres with AWS Lambda in Python?

In my case I use the psycopg2 client and I need to create a table, but I get a timeout error; this is obviously because creating the table takes a long time and exceeds the 15-minute Lambda limit.
For my purposes I found the following documentation very helpful: the psycopg docs.
I will leave my small implementation here. Note that I have named the connection aconn because it behaves differently from a normal connection; for example, it does not use commit.
The key detail is async_=True in the connection line.
import select
import psycopg2

def wait(conn):
    while True:
        state = conn.poll()
        if state == psycopg2.extensions.POLL_OK:
            break
        elif state == psycopg2.extensions.POLL_WRITE:
            select.select([], [conn.fileno()], [])
        elif state == psycopg2.extensions.POLL_READ:
            select.select([conn.fileno()], [], [])
        else:
            raise psycopg2.OperationalError("poll() returned %s" % state)

db_host = db_secret["host"]
db_name = db_secret["dbname"]
db_user = db_secret["username"]
db_pass = db_secret["password"]

aconn = None
stringConn = "dbname='%s' user='%s' host='%s' password='%s'" % (db_name, db_user, db_host, db_pass)
aconn = psycopg2.connect(stringConn, async_=True)
wait(aconn)
acursor = aconn.cursor()
query = "CREATE TABLE CHEMA.TABLE AS SELECT * FRO BLA "
acursor.execute(query, params={})
wait(acursor.connection)
aconn.close()
# END AND EXIT
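Since the question is about AWS Lambda, here is a hedged sketch of how the same poll/wait pattern might be wrapped in a handler. It reuses the wait() helper and stringConn from above; the handler name and the placeholder query are illustrative, not part of the original answer.

# Sketch only: the async_ connection and wait() helper from above, wrapped in
# a Lambda handler. The schema/table names below are placeholders.
def lambda_handler(event, context):
    aconn = psycopg2.connect(stringConn, async_=True)
    wait(aconn)                   # poll until the connection is ready
    acursor = aconn.cursor()
    acursor.execute("CREATE TABLE some_schema.some_table AS SELECT 1 AS col")
    wait(acursor.connection)      # poll until the statement finishes
    aconn.close()
    return {"status": "done"}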

Process very large 900M row MySQL table line by line with Python

I often need to process several hundred million rows of a MySQL table on a line by line basis using Python. I want a script that is robust and does not need to be monitored.
Below I pasted a script that classifies the language of the message field in each row. It utilizes the sqlalchemy and MySQLdb.cursors.SSCursor modules. Unfortunately this script consistently throws a 'Lost connection to MySQL server during query' error after 4840 rows when I run it remotely and after 42000 rows when I run it locally.
Also, I have checked that max_allowed_packet = 32M in my MySQL server's /etc/mysql/my.cnf file, as per the answers to this Stack Overflow question: Lost connection to MySQL server during query
Any advice for either fixing this error, or for another robust way to process very large MySQL tables with Python, would be much appreciated!
import sqlalchemy
import MySQLdb.cursors
import langid

schema = "twitterstuff"
table = "messages_en"  # 900M row table
engine_url = "mysql://myserver/{}?charset=utf8mb4&read_default_file=~/.my.cnf".format(schema)
db_eng = sqlalchemy.create_engine(engine_url, connect_args={'cursorclass': MySQLdb.cursors.SSCursor})
langid.set_languages(['fr', 'de'])

print "Executing input query..."
data_iter = db_eng.execute("SELECT message_id, message FROM {} WHERE langid_lang IS NULL LIMIT 10000".format(table))

def process(inp_iter):
    for item in inp_iter:
        item = dict(item)
        (item['langid_lang'], item['langid_conf']) = langid.classify(item['message'])
        yield item

def update_table(update_iter):
    count = 0
    for item in update_iter:
        count += 1
        if count % 10 == 0:
            print "{} rows processed".format(count)
        lang = item['langid_lang']
        conf = item['langid_conf']
        message_id = item['message_id']
        db_eng.execute("UPDATE {} SET langid_lang = '{}', langid_conf = {} WHERE message_id = {}".format(table, lang, conf, message_id))

data_iter_upd = process(data_iter)

print "Begin processing..."
update_table(data_iter_upd)
According to MySQLdb developer Andy Dustman,
[When using SSCursor,] no new queries can be issued on the connection until
the entire result set has been fetched.
That post says that if you issue another query you will get a "commands out of sequence" error, which is not the error you are seeing. So I am not sure that the following will necessarily fix your problem. Nevertheless, it might be worth trying to remove SSCursor from your code and use the simpler default Cursor just to test if that is the source of the problem.
You could, for example, use LIMIT chunksize OFFSET n in your SELECT statement
to loop through the data set in chunks:
import sqlalchemy
import MySQLdb.cursors
import langid
import itertools as IT

chunksize = 1000

def process(inp_iter):
    for item in inp_iter:
        item = dict(item)
        (item['langid_lang'], item['langid_conf']) = langid.classify(item['message'])
        yield item

def update_table(update_iter, engine):
    for count, item in enumerate(update_iter):
        if count % 10 == 0:
            print "{} rows processed".format(count)
        lang = item['langid_lang']
        conf = item['langid_conf']
        message_id = item['message_id']
        engine.execute(
            "UPDATE {} SET langid_lang = '{}', langid_conf = {} WHERE message_id = {}"
            .format(table, lang, conf, message_id))

schema = "twitterstuff"
table = "messages_en"  # 900M row table
engine_url = ("mysql://myserver/{}?charset=utf8mb4&read_default_file=~/.my.cnf"
              .format(schema))
db_eng = sqlalchemy.create_engine(engine_url)
langid.set_languages(['fr', 'de'])

for offset in IT.count(start=0, step=chunksize):
    print "Executing input query..."
    result = db_eng.execute(
        "SELECT message_id, message FROM {} WHERE langid_lang IS NULL LIMIT {} OFFSET {}"
        .format(table, chunksize, offset))
    result = list(result)
    if not result: break
    data_iter_upd = process(result)
    print "Begin processing..."
    update_table(data_iter_upd, db_eng)

Output table contents with limit and filter

I'm finding the boto DynamoDB documentation almost completely lacking in examples.
In Python, I simply want to output the contents of a table, limited to a number of records, say the latest 500 from a certain date.
Here is what I have...
import boto.dynamodb
import sys

#----------PUBLIC VARIABLES--------------------------#
connection = boto.dynamodb.connect_to_region(
    'us-east-1',
    aws_access_key_id='somekey',
    aws_secret_access_key='somesecretkey')
#----------------------------------------------------#

def info():
    print('#########################_TABLE_NAMES_#########################')
    # get and print list of tables
    tablenames = connection.list_tables()
    for table in tablenames:
        print('DynamoDB table: %s' % table)
        #print(connection.describe_table(table))
    print('###############################################################' + '\n')

def main():
    print('###########################_RESULTS_###########################')
    scan = myTable.scan(scan_filter=None, attributes_to_get=['SomeField'])
    results = []
    for x in scan:
        results.append(x['SomeField'])
    print('###############################################################' + '\n')

def writeError(error):
    try:
        f = open("error.txt", "w")
        try:
            f.write(error)  # Write a string to a file
        finally:
            f.close()
    except IOError:
        print "WriteError - Error!"

if __name__ == '__main__':
    try:
        info()
        main()
    except:
        writeError("Unexpected error:" + str(sys.exc_info()))
        print "Error"
The table I have hasn't got any custom indexes so I'd be looking for something pretty basic as an example.
I'm sorry I don't have a better attempt, but I've researched and not found a lot to go on.
I've modified your script to print out the first 500 scan results for each table. Don't forget to correct the field name (I put someField):
import boto.dynamodb2
from boto.dynamodb2.table import Table
import sys

#----------PUBLIC VARIABLES--------------------------#
connection = boto.dynamodb2.connect_to_region(
    'us-east-1')
#----------------------------------------------------#

def getTableNames():
    '''get list of tables'''
    tablenames = connection.list_tables()["TableNames"]
    return tablenames

def main(tablenames=[]):
    print('###########################_RESULTS_###########################')
    for table in tablenames:
        print "Table Name: " + table
        myTable = Table(table)
        scan = myTable.scan()
        results = []
        for item in scan:
            if len(results) >= 500:
                break
            results.append(item.get('someField'))
        for result in results:
            print result
        print('###############################################################' + '\n')

def writeError(error):
    try:
        f = open("error.txt", "w")
        try:
            f.write(error)  # Write a string to a file
        finally:
            f.close()
    except IOError:
        print "WriteError - Error!"

if __name__ == '__main__':
    try:
        tablenames = getTableNames()
        main(tablenames)
    except:
        writeError("Unexpected error:" + str(sys.exc_info()))
        print "Error"
Please note that DynamoDB doesn't return scan results in any particular order. If you want them ordered by the latest changes, you can use a solution based on DynamoDB Streams https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Streams.html or add a secondary index: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/GSI.html
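As a rough illustration of the secondary-index suggestion, here is a sketch that queries a hypothetical global secondary index with boto2. The index name 'CreatedIndex', its keys (someField as hash key, created as an ISO-8601 range key), and the table name are assumptions; none of them exist on the original table.

# Sketch only: querying a hypothetical GSI 'CreatedIndex' (hash key 'someField',
# range key 'created') to get up to 500 items from a certain date onwards.
from boto.dynamodb2.table import Table

myTable = Table('sometable')
recent = myTable.query_2(
    someField__eq='some-value',   # hash key of the hypothetical index
    created__gte='2015-01-01',    # only items from this date onwards
    index='CreatedIndex',
    limit=500,
)
for item in recent:
    print item['someField'], item['created']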

Python script works but fails after compilation (Windows)

I am working on a script to scrape a website. The problem is that it works normally when I run it with the interpreter, but after compiling it (PyInstaller or py2exe) it fails; it appears that mechanize / requests both fail to keep the session alive.
I have hidden my username and password here, but they are set correctly in the compiled code.
import requests
from bs4 import BeautifulSoup as bs
from sys import argv
import re
import logging

url = argv[1]
payload = {"userName": "real_username", "password": "realpassword"}
session = requests.session()
resp = session.post("http://website.net/login.do", data=payload)
if "forgot" in resp.content:
    logging.error("Login failed")
    exit()
resp = session.get(url)
soup = bs(resp.content)
urlM = url[:url.find("?") + 1] + "page=(PLACEHOLDER)&" + \
       url[url.find("?") + 1:]
# Get number of pages
regex = re.compile("\|.*\|\sof\s(\d+)")
script = str(soup.findAll("script")[1])
epNum = int(re.findall(regex, script)[0])  # Number of EPs
pagesNum = epNum // 50
links = []
# Get list of links
# If number of EPs > 50, more than one page
if pagesNum == 0:
    links = [url]
else:
    for i in range(1, pagesNum + 2):
        url = urlM.replace("(PLACEHOLDER)", str(i))
        links.append(url)
# Loop over the links and extract info: ID, NAME, START_DATE, END_DATE
raw_info = []
for pos, link in enumerate(links):
    print "Processing page %d" % (pos + 1)
    sp = bs(session.get(link).content)
    table = sp.table.table
    raw_info.extend(table.findAll("td"))
epURL = "http://www.website.net/exchange/viewep.do?operation"\
        "=executeAction&epId="
# Final data extraction
raw_info = map(str, raw_info)
ids = [re.findall("\d+", i)[0] for i in raw_info[::4]]
names = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[1::4]]
start_dates = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[2::4]]
end_dates = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[3::4]]
emails = []
eplinks = [epURL + str(i) for i in ids]
print names
The error happens at the epNum variable, which I take to mean that the HTML page is not the one I requested. On Linux it works both as a script and compiled; on Windows it works as a script but fails when compiled.
The py2exe tutorial mentions that you need MSVCR90.dll; did you check that it's present on the PC?
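For reference, a minimal py2exe setup sketch that bundles the Visual C++ 2008 runtime next to the executable, along the lines of the py2exe tutorial. The script name scraper.py and the Microsoft.VC90.CRT path are assumptions for illustration; adjust them to your environment.

# setup.py -- minimal py2exe sketch (assumed script name: scraper.py).
# The Microsoft.VC90.CRT path is an example; it depends on where the
# Visual C++ 2008 runtime files live on your build machine.
from distutils.core import setup
from glob import glob
import py2exe  # registers the py2exe distutils command

data_files = [
    ("Microsoft.VC90.CRT",
     glob(r"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\redist\x86\Microsoft.VC90.CRT\*.*")),
]

setup(
    console=["scraper.py"],
    data_files=data_files,
)

Build with: python setup.py py2exe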
