I have multiple threads that process data and puts it on a queue, and a single thread that takes data from a queue and then saves it to a database.
I think the following will cause a memory leak:
class DBThread(threading.Thread):
def __init__(self, myqueue):
threading.Thread.__init__(self)
self.myqueue = myqueue
def run(self):
conn = sqlite3.connect("test.db")
c = conn.cursor()
while True:
data = myqueue.get()
if data:
c.execute("INSERT INTO test (data) VALUES (?)", (data,))
conn.commit()
self.myqueue.task_done()
#conn.close() <--- never reaches this point
q = Queue.Queue()
# Create other threads
....
# Create DB thread
t = DBThread(q)
t.setDaemon(True)
t.start()
q.join()
I can't put the conn.close() in the while loop, because I think that will close the connection on the first loop. I can't put it in the if data: statement, because then it won't save data that may be put in the queue later.
Where do I close the db connection? If I don't close it, won't this cause a memory leak?
If you can use a sentinel value that will not appear in your normal data, e.g. None, you can signal the thread to stop and close the database connection in a finally clause:
import threading
import Queue
import sqlite3
class DBThread(threading.Thread):
def __init__(self, myqueue, db_path):
threading.Thread.__init__(self)
self.myqueue = myqueue
self.db_path = db_path
def run(self):
conn = sqlite3.connect(self.db_path)
try:
while True:
data = self.myqueue.get()
if data is None: # check for sentinel value
break
with conn:
conn.execute("INSERT INTO test (data) VALUES (?)", (data,))
self.myqueue.task_done()
finally:
conn.close()
q = Queue.Queue()
for i in range(100):
q.put(str(i))
conn = sqlite3.connect('test.db')
conn.execute('create table if not exists test (data text)')
conn.close()
t = DBThread(q, 'test.db')
t.start()
q.join()
q.put(None) # tell database thread to terminate
If you cannot use a sentinel value you can add a flag to the class that is checked in the while loop. Also add a stop() method to the thread class that sets the flag. You will need to use a non-blocking Queue.get():
class DBThread(threading.Thread):
def __init__(self, myqueue, db_path):
threading.Thread.__init__(self)
self.myqueue = myqueue
self.db_path = db_path
self._terminate = False
def terminate(self):
self._terminate = True
def run(self):
conn = sqlite3.connect(self.db_path)
try:
while not self._terminate:
try:
data = self.myqueue.get(timeout=1)
except Queue.Empty:
continue
with conn:
conn.execute("INSERT INTO test (data) VALUES (?)", (data,))
self.myqueue.task_done()
finally:
conn.close()
....
q.join()
t.terminate() # tell database thread to terminate
Finally, it's worth mentioning that your program could terminate if the db thread manages to drain the queue, i.e. if q.join() returns. This is because the db thread is a daemon thread and will not prevent the main thread exiting. You need to ensure that your worker threads produce enough data to keep the db thread busy, otherwise q.join() will return and the main thread will exit.
Related
import pymysql
from multiprocessing.pool import ThreadPool
def process(item):
host = str(item).strip()
db = None
try:
db = pymysql.connect(host=host, user='root', passwd='root', port=3306, connect_timeout=10,
write_timeout=10)
r = open("success.txt", 'a')
r.write(host)
r.write("\n")
pass
except Exception as msg:
print(msg)
pass
finally:
if db:
db.close()
pass
if __name__ == '__main__':
t = ThreadPool(100)
t.map_async(process,open('3306.txt').readlines())
t.close()
t.join()
I want to verify in batches whether the MySQL account passwords of some IPs are valid, but my code will have the problem of sub-thread blocking. In fact, the last IP has been verified, but the main thread has not ended. I don't know how to modify it. You can provide Some modifications and suggestions?
I am trying to write a load app on Oracle using python and I neeed some concurrency.
I am doing this by sharing a connection pool to be used by child processes but before going into that I tried to share a simple Connection object from a manager process to a child.
The connection object is shared properly using the proxy object, but when I try to create a cursor on this connection I get smth like :
>
And the cursor is not usable.
Here is my code :
import cx_Oracle
from multiprocessing import managers
from multiprocessing import current_process
from multiprocessing import Process
import time
#function to setup the connection object in manager process
def setupConnection(user,password,dsn):
conn = cx_Oracle.connect(user=user,password=password,dsn=dsn)
return conn
#proxy object for my connection
class connectionProxy(managers.BaseProxy):
def close(self):
return self._callmethod('close',args=())
def ping(self):
return self._callmethod('ping',args=())
def cursor(self):
return self._callmethod('cursor',args=())
#connection manager
class connectionManager(managers.BaseManager): pass
#child process work function
def child(conn_proxy):
print(str(current_process().name) + "Working on connection : " + str(conn_proxy))
cur = conn_proxy.cursor()
print(cur)
cur.execute('select 1 from dual');
if __name__ == '__main__' :
#db details
user = 'N974783'
password = '12345'
dsn = '192.168.56.6:1521/orcl'
#setup manager process and open the connection
manager = connectionManager()
manager.register('set_conn',setupConnection,proxytype=connectionProxy,exposed = ('close','ping','cursor'))
manager.start()
#pass the connection to the child process
conn_proxy = manager.set_conn(user=user,password=password,dsn=dsn)
p = Process(target=child, args=(conn_proxy,),name='oraWorker')
p.start()
p.join()
I get the following output:
oraWorker Working on connection : <cx_Oracle.Connection to N974783#192.168.56.6:1521/orcl>
<cx_Oracle.Cursor on <NULL>> ..
cur.execute('select 1 from dual');
cx_Oracle.InterfaceError: not open
Can someone give me an idea on how should I get past this ?
Thanks,
Ionut
The problem is that cursors cannot be passed across the boundary between processes. So you need to wrap the execute method instead. Something like this. You would need to expand it to handle bind variables and the like, of course.
import cx_Oracle
from multiprocessing import managers
from multiprocessing import current_process
from multiprocessing import Process
import time
class Connection(cx_Oracle.Connection):
def execute(self, sql):
cursor = self.cursor()
cursor.execute(sql)
return list(cursor)
#function to setup the connection object in manager process
def setupConnection(user,password,dsn):
conn = Connection(user=user,password=password,dsn=dsn)
return conn
#proxy object for my connection
class connectionProxy(managers.BaseProxy):
def close(self):
return self._callmethod('close',args=())
def ping(self):
return self._callmethod('ping',args=())
def execute(self, sql):
return self._callmethod('execute', args=(sql,))
#connection manager
class connectionManager(managers.BaseManager):
pass
#child process work function
def child(conn_proxy):
print(str(current_process().name) + "Working on connection : " + str(conn_proxy), id(conn_proxy))
result = conn_proxy.execute('select 1 from dual')
print("Result:", result)
if __name__ == '__main__' :
#db details
user = 'user'
password = 'pwd'
dsn = 'tnsentry'
#setup manager process and open the connection
manager = connectionManager()
manager.register('set_conn',setupConnection,proxytype=connectionProxy,exposed = ('close','ping','execute'))
manager.start()
#pass the connection to the child process
conn_proxy = manager.set_conn(user=user,password=password,dsn=dsn)
p = Process(target=child, args=(conn_proxy,),name='oraWorker')
p.start()
p.join()
I am using sqlite database to store the data. My python program has a thread pool which contains 5 threads. I was creating a single database connection and sharing it with all 5 threads but some times it throws exception which was not captured by any sqlite exception or generic exception and my python script got killed automatically. After searching for the solution I came across
How to share single SQLite connection in multi-threaded Python application
And I created a separate connections as follows,
class ProcessJob(object):
....
def process_job(self):
job = queue.get()
if job = 'xyz':
with sqlite3.connect(database_path, check_same_thread=False, timeout = 10) as db_conn:
db_conn.execute("insert query on table ABC")
db_conn.commit()
elif job = 'pqr':
with sqlite3.connect(database_path, check_same_thread=False, timeout = 10) as db_conn:
db_conn.execute("update query on table ABC")
db_conn.commit()
elif job = 'mno':
with sqlite3.connect(database_path, check_same_thread=False, timeout = 10) as db_conn:
db_conn.execute("insert query on table FOO")
db_conn.commit()
class MyThread(therading.Thread):
....
process_job_obj = ProcessJob()
def run(self):
while True:
try:
process_job_obj.process_job()
except Exception as e:
logger.exception('Exception : %s'%e)
def main():
for i in range(5):
trd = MyThread()
trd.start()
if __name__ == "__main__":
main()
So, Is this a right approach or is there any flaw or chances of stopping/killing the python script?
I'm trying to set up a MySQL connection pool and have my worker processes access the already established pool instead of setting up a new connection each time.
I'm confused if I should pass the database cursor to each process, or if there's some other way to do this? Shouldn't MySql.connector do the pooling automatically? When I check my log files, many, many connections are opened and closed ... one for each process.
My code looks something like this:
PATH = "/tmp"
class DB(object):
def __init__(self):
connected = False
while not connected:
try:
cnxpool = mysql.connector.pooling.MySQLConnectionPool(pool_name = "pool1",
**config.dbconfig)
self.__cnx = cnxpool.get_connection()
except mysql.connector.errors.PoolError:
print("Sleeping.. (Pool Error)")
sleep(5)
except mysql.connector.errors.DatabaseError:
print("Sleeping.. (Database Error)")
sleep(5)
self.__cur = self.__cnx.cursor(cursor_class=MySQLCursorDict)
def execute(self, query):
return self.__cur.execute(query)
def isValidFile(self, name):
return True
def readfile(self, fname):
d = DB()
d.execute("""INSERT INTO users (first_name) VALUES ('michael')""")
def main():
queue = multiprocessing.Queue()
pool = multiprocessing.Pool(None, init, [queue])
for dirpath, dirnames, filenames in os.walk(PATH):
full_path_fnames = map(lambda fn: os.path.join(dirpath, fn),
filenames)
full_path_fnames = filter(is_valid_file, full_path_fnames)
pool.map(readFile, full_path_fnames)
if __name__ == '__main__':
sys.exit(main())
First, you're creating a different connection pool for each instance of your DB class. The pools having the same name doesn't make them the same pool
From the documentation:
It is not an error for multiple pools to have the same name. An application that must distinguish pools by their pool_name property should create each pool with a distinct name.
Besides that, sharing a database connection (or connection pool) between different processes would be a bad idea (and i highly doubt it would even work correctly), so each process using it's own connections is actually what you should aim for.
You could just initialize the pool in your init initializer as a global variable and use that instead.
Very simple example:
from multiprocessing import Pool
from mysql.connector.pooling import MySQLConnectionPool
from mysql.connector import connect
import os
pool = None
def init():
global pool
print("PID %d: initializing pool..." % os.getpid())
pool = MySQLConnectionPool(...)
def do_work(q):
con = pool.get_connection()
print("PID %d: using connection %s" % (os.getpid(), con))
c = con.cursor()
c.execute(q)
res = c.fetchall()
con.close()
return res
def main():
p = Pool(initializer=init)
for res in p.map(do_work, ['select * from test']*8):
print(res)
p.close()
p.join()
if __name__ == '__main__':
main()
Or just use a simple connection instead of a connection pool, as only one connection will be active in each process at a time anyway.
The number of concurrently used connections is implicitly limited by the size of the multiprocessing.Pool.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import time
import mysql.connector.pooling
dbconfig = {
"host":"127.0.0.1",
"port":"3306",
"user":"root",
"password":"123456",
"database":"test",
}
class MySQLPool(object):
"""
create a pool when connect mysql, which will decrease the time spent in
request connection, create connection and close connection.
"""
def __init__(self, host="172.0.0.1", port="3306", user="root",
password="123456", database="test", pool_name="mypool",
pool_size=3):
res = {}
self._host = host
self._port = port
self._user = user
self._password = password
self._database = database
res["host"] = self._host
res["port"] = self._port
res["user"] = self._user
res["password"] = self._password
res["database"] = self._database
self.dbconfig = res
self.pool = self.create_pool(pool_name=pool_name, pool_size=pool_size)
def create_pool(self, pool_name="mypool", pool_size=3):
"""
Create a connection pool, after created, the request of connecting
MySQL could get a connection from this pool instead of request to
create a connection.
:param pool_name: the name of pool, default is "mypool"
:param pool_size: the size of pool, default is 3
:return: connection pool
"""
pool = mysql.connector.pooling.MySQLConnectionPool(
pool_name=pool_name,
pool_size=pool_size,
pool_reset_session=True,
**self.dbconfig)
return pool
def close(self, conn, cursor):
"""
A method used to close connection of mysql.
:param conn:
:param cursor:
:return:
"""
cursor.close()
conn.close()
def execute(self, sql, args=None, commit=False):
"""
Execute a sql, it could be with args and with out args. The usage is
similar with execute() function in module pymysql.
:param sql: sql clause
:param args: args need by sql clause
:param commit: whether to commit
:return: if commit, return None, else, return result
"""
# get connection form connection pool instead of create one.
conn = self.pool.get_connection()
cursor = conn.cursor()
if args:
cursor.execute(sql, args)
else:
cursor.execute(sql)
if commit is True:
conn.commit()
self.close(conn, cursor)
return None
else:
res = cursor.fetchall()
self.close(conn, cursor)
return res
def executemany(self, sql, args, commit=False):
"""
Execute with many args. Similar with executemany() function in pymysql.
args should be a sequence.
:param sql: sql clause
:param args: args
:param commit: commit or not.
:return: if commit, return None, else, return result
"""
# get connection form connection pool instead of create one.
conn = self.pool.get_connection()
cursor = conn.cursor()
cursor.executemany(sql, args)
if commit is True:
conn.commit()
self.close(conn, cursor)
return None
else:
res = cursor.fetchall()
self.close(conn, cursor)
return res
if __name__ == "__main__":
mysql_pool = MySQLPool(**dbconfig)
sql = "select * from store WHERE create_time < '2017-06-02'"
p = Pool()
for i in range(5):
p.apply_async(mysql_pool.execute, args=(sql,))
Code above creates a connection pool at the beginning, and get connections from it in execute(), once the connection pool has been created, the work is to remain it, since the pool is created only once, it will save the time to request for a connection every time you would like to connect to MySQL.
Hope it helps!
You created multiple DB object instance. In mysql.connector.pooling.py, pool_name is only a attribute to let you make out which pool it is. There is no mapping in the mysql pool.
So, you create multiple DB instance in def readfile(), then you will have several connection pool.
A Singleton is useful in this case.
(I spent several hours to find it out. In Tornado framework, each http get create a new handler, which leads to making a new connection.)
There may be synchronization issues if you're going to reuse MySQLConnection instances maintained by a pool, but just sharing a MySQLConnectionPool instance between worker processes and using connections retrieved by calling the method get_connection() would be okay, because a dedicated socket would be created for each MySQLConnection instance.
import multiprocessing
from mysql.connector import pooling
def f(cnxpool: pooling.MySQLConnectionPool) -> None:
# Dedicate connection instance for each worker process.
cnx = cnxpool.get_connection()
...
if __name__ == '__main__':
cnxpool = pooling.MySQLConnectionPool(
pool_name='pool',
pool_size=2,
)
p0 = multiprocessing.Process(target=f, args=(cnxpool,))
p1 = multiprocessing.Process(target=f, args=(cnxpool,))
p0.start()
p1.start()
I'm writing simple script that will check for SSH connection, and I cannot understand, why it hangs on one thread.
class myThread(threading.Thread):
def __init__(self, hostname ):
threading.Thread.__init__(self)
self.hostname = hostname
def run(self):
return self.doSSH(self.hostname)
def doSSH(self,hostname):
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((hostname, 22))
result = s.recv(1024)
if re.findall(r'^SSH.+?SSH.+',result) :
return "Up"
else :
return "Down"
def main():
q = Queue.Queue()
completeHostlist = ["host1","host2","google.com","host3"]
for hostname in completeHostlist:
thread = myThread(hostname)
thread.daemon = True
q.put_nowait(thread.run())
q.get_nowait()
I don't understand why this script hangs at google.com? I would be expecting it to spawn daemon thread and continue with host3. As soon it finish host3 it has to kill thread with Google and return results. What I did wrong ?
I already figured out about run() and start(). Anyway this is not working as expected, after all host[1-3] threads was started, script stuck at thread with google , waiting it to end. Should it be kill it at the end of the script ?
Should i be using multiprocessing instead of multithreading , to spawn separate process for each host?
In your code you do q.put_nowait(thread.run()). That immediately runs the ssh thing on the current thread. You need to call thread specific method to start the thread. You need to call thread.start().
Not sure what you're doing with the Queue.
Don't call .run() method directly for any thread. As #Sorin said call thread.start() instead.
You don't need to define a new thread class, a function is enough in this case:
from Queue import Queue
from threading import Thread
def is_ssh_up(result_queue, hostname, port=22):
# try to connect here
# ...
# write results
result_queue.put((hostname, True)) # Up
def main():
q = Queue()
hosts = ["host1", "host2", "google.com", "host3"]
for hostname in hosts: # start worker threads
t = Thread(target=is_ssh_up, args=[q, hostname])
t.daemon = True
t.start()
for _ in hosts: # collect results
hostname, is_up = q.get()
print("%s is %s" % (hostname, "Up" if is_up else "Down"))
Or you could use a thread pool:
from multiprocessing.pool import ThreadPool
def is_ssh_up(hostname, port=22):
# try to connect here
# ...
# return results
return hostname, is_up
hosts = ["host1", "host2", "google.com", "host3"]
pool = ThreadPool(20) # limit number of concurrent connections to 20
for hostname, is_up in pool.imap_unordered(is_ssh_up, hosts):
status = "Up" if is_up else "Down" if is_up is not None else "Unknown"
print("%s status is %s" % (hostname, status))