Read sql queries via pandas quickly with pyoracle - python

I am using oracle sql developer and have built a script to read sql queries in parallel as well as under a thread. However, I have noticed no significant difference in speed by implementing this (even with chunksizes), than reading the table directly. Therefore, could my approach be wrong and what's the improvement to my approach, to speed things up?
For example:
#My table size is only 38k rows and this takes ~ 1.2 minutes to run
def table(self, table = None, query = None, chunksize = None):
from concurrent.futures import ThreadPoolExecutor
with self._ENGINE.connect() as conn:
tables = []
if query is None and table is not None:
with ThreadPoolExecutor(max_workers = 8) as executor:
for results in executor.submit(pd.read_sql,f"SELECT /*+ PARALLEL(16) */ NAME FROM {table}", conn, chunksize=chunksize).result():
tables.append(results)
table = pd.concat([pd.concat([x]) for x in tables])
conn.close()
return table
else:
print('something else')

After reading the following documentation:
tuning fetch
It takes approximately 118 seconds for the code above to run, whereas after a slight remodification:
def table2(self, table = None, query = None):
from concurrent.futures import ThreadPoolExecutor
self._cursor.arraysize = 10000
self._cursor.prefetchrows = 1000000
tables = []
start = time.time()
if query is None and table is not None:
with ThreadPoolExecutor(max_workers = 8) as executor:
for results in executor.submit(self._cursor.execute,f"SELECT /*+ PARALLEL(16) */ NAME FROM {table}").result():
tables.append(results)
end = time.time()
start_second = time.time()
self._cursor.execute(f"SELECT /*+ PARALLEL(16) */ NAME FROM {table}").fetchall()
end_second = time.time()
print("Threadpool time: %s, fetchall time: %s" % (str(end-start), str(end_second-start_second)))
Takes the following time to execute:
Threadpool time: 1.0487918853759766, fetchall time: 0.48572492599487305

Here's an example of fetching data from a single table using multiple connections. Whether it's faster than a single thread doing a full table scan is something for you to check. Maybe Python's GIL is a bottleneck. Maybe your database is on a single disk instead of multiple disks, so there is no extra throughput possible. Maybe the OFFSET/FETCH NEXT and ORDER BY are a limiting factor because the DB is busy doing work for other users (or maybe you're the only user so they are fast). Maybe for you it's what you do with the data when you get it in Python that will be a bottleneck.
Fundamentally from the Oracle side, tuning arraysize will be the biggest factor for any single SELECT that returns a large number of rows over a slower network.
# Fetch batches of rows from a table using multiple connections
import csv
import os
import platform
import threading
import oracledb
# To fetch everything, keep NUM_THREADS * BATCH_SIZE >= TABLE_SIZE
# Number of rows to insert into the demo table
TABLE_SIZE = 10000
# The degree of parallelism / number of connections to open
NUM_THREADS = 10
# How many rows to fetch in each thread
BATCH_SIZE = 1000
# Internal buffer size: Tune this for performance
ARRAY_SIZE = 1000
SQL = """
select data
from demo
order by id
offset :rowoffset rows fetch next :maxrows rows only
"""
un = os.environ.get('PYTHON_USERNAME')
pw = os.environ.get('PYTHON_PASSWORD')
cs = os.environ.get('PYTHON_CONNECTSTRING')
if os.environ.get('DRIVER_TYPE') == 'thick':
ld = None
if platform.system() == 'Darwin' and platform.machine() == 'x86_64':
ld = os.environ.get('HOME')+'/Downloads/instantclient_19_8'
elif platform.system() == 'Windows':
ld = r'C:\oracle\instantclient_19_17'
oracledb.init_oracle_client(lib_dir=ld)
# Create a connection pool
pool = oracledb.create_pool(user=un, password=pw, dsn=cs, min=NUM_THREADS, max=NUM_THREADS)
#
# Create the table for the demo
#
def create_schema():
    """Drop and recreate the ``demo`` table, then fill it with TABLE_SIZE rows.

    Opens a standalone connection with the module-level ``un``/``pw``/``cs``
    settings (it does not borrow one from ``pool``) and runs one anonymous
    PL/SQL block that drops the table, creates it again with an identity
    ``id`` column, and inserts ``to_char(rownum)`` into ``data`` for each
    generated row.
    """
    with oracledb.connect(user=un, password=pw, dsn=cs) as connection:
        with connection.cursor() as cursor:
            # No explicit commit() is issued below, so rely on autocommit.
            connection.autocommit = True
            # The inner begin/exception block re-raises every error except
            # sqlcode -942 (table does not exist), so a first run with no
            # existing table still succeeds.
            cursor.execute("""
begin
begin
execute immediate 'drop table demo';
exception when others then
if sqlcode <> -942 then
raise;
end if;
end;
execute immediate 'create table demo (
id number generated by default as identity,
data varchar2(40))';
insert into demo (data)
select to_char(rownum)
from dual
connect by level <= :table_size;
end;""", table_size=TABLE_SIZE)
# Write the data to separate CSV files
def do_write_csv(tn):
    """Fetch batch number ``tn`` of the demo table and write it to ``emp{tn}.csv``.

    A pooled connection is used so that each thread works on its own
    connection. The batch covers rows ``tn * BATCH_SIZE`` up to (but not
    including) ``(tn + 1) * BATCH_SIZE`` in ``id`` order, as selected by
    ``SQL``. The first CSV row holds the column names.

    Args:
        tn: Zero-based thread/batch number.
    """
    with pool.acquire() as connection:
        with connection.cursor() as cursor:
            cursor.arraysize = ARRAY_SIZE
            # The context manager closes the file even when the query or a
            # write raises; the bare open()/close() pair leaked the handle.
            with open(f"emp{tn}.csv", "w") as f:
                writer = csv.writer(f, lineterminator="\n", quoting=csv.QUOTE_NONNUMERIC)
                cursor.execute(SQL, rowoffset=(tn*BATCH_SIZE), maxrows=BATCH_SIZE)
                col_names = [row[0] for row in cursor.description]
                writer.writerow(col_names)
                while True:
                    rows = cursor.fetchmany()  # extra call at end won't incur extra round-trip
                    if not rows:
                        break
                    writer.writerows(rows)
# Print the data to the terminal
def do_query(tn):
    """Print batch number ``tn`` of the demo table, one fetched chunk per line.

    Args:
        tn: Zero-based thread/batch number; selects the row window
            starting at ``tn * BATCH_SIZE``.
    """
    bind_values = {"rowoffset": tn * BATCH_SIZE, "maxrows": BATCH_SIZE}
    with pool.acquire() as connection, connection.cursor() as cursor:
        cursor.arraysize = ARRAY_SIZE
        cursor.execute(SQL, **bind_values)
        # The final, empty fetchmany() call won't incur an extra round-trip.
        batch = cursor.fetchmany()
        while batch:
            print(f'Thread {tn}', batch)
            batch = cursor.fetchmany()
#
# Start the desired number of threads.
#
def start_workload():
    """Run one ``do_write_csv`` worker thread per batch and wait for all of them."""
    workers = []
    for batch_number in range(NUM_THREADS):
        worker = threading.Thread(target=do_write_csv, args=(batch_number,))
        # To print the rows instead of writing CSV files, use:
        # worker = threading.Thread(target=do_query, args=(batch_number,))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()
# Script entry point: rebuild the demo table, then run the threaded fetch.
if __name__ == '__main__':
    create_schema()
    start_workload()
    print("All done!")

Related

Create a process from a function that will run in parallel in Python

I have a function that executes a SELECT sql query (using postgresql).
Now, I want to INSERT to some table in my DB the execution time of this query, however, I want to do it in parallel, so that even if my INSERT query is still running I will be able to continue my program and call other functions.
I tried to use multiprocessing.Process; however, my function waits for the process to finish, so I actually lose the parallelism I wanted.
My code in a nut shell:
def select_func():
with connection.cursor() as cursor:
query = "SELECT * FROM myTable WHERE \"UserName\" = 'Alice'"
start = time.time()
cursor.execute(query)
end = time.time()
process = Process(target = insert_func, args = (query, (end-start)))
process.start()
process.join()
return cursor.fetchall()
def insert_func(query, time):
with connection.cursor() as cursor:
query = "INSERT INTO infoTable (\"query\", \"exec_time\")
VALUES (\"" + query + "\", \"" + time + "\")"
cursor.execute(query)
connection.commit()
Now the problem is that this operation is not really async, since select_func is waiting until insert_function is finished. I want that the execution of these functions won't be depended and that the select function could end even though insert_function is still running so that I will be able to continue and call other function in my script.
Thanks!
There are quite a lot of issues with your code snippet, but let's at least try to give you a structure to implement.
def select_func():
with connection.cursor() as cursor: #I dont think the same global variable connectino should be used for read/write simultaneously
query = "SELECT * FROM myTable WHERE \"UserName\" = 'Alice'" #quotation issues
start = time.time()
cursor.execute(query)
end = time.time()
process = Process(target = insert_func, args = (query, (end-start)))
process.start() #you start the process here BUT
process.join() #you force python to wait for it here....
return cursor.fetchall()
def insert_func(query, time):
with connection.cursor() as cursor:
query = "INSERT INTO infoTable (\"query\", \"exec_time\")
VALUES (\"" + query + "\", \"" + time + "\")"
cursor.execute(query)
connection.commit()
Consider an alternative:
def select_func():
    """Run the sample SELECT and time it.

    Returns:
        A pair ``(rows, (query, elapsed))``: the fetched rows, and a tuple
        with the query text at position 0 and the execution time in
        seconds at position 1.
    """
    read_con = sql.connect() #sqlite syntax but use your connection
    query = "SELECT * FROM myTable WHERE \"UserName\" = 'Alice'" #where does Alice come from?
    with read_con.cursor() as cursor:
        started = time.time()
        cursor.execute(query)
        elapsed = time.time() - started
        rows = cursor.fetchall()
        return rows, (query, elapsed)
def insert_function(insert_queue): #The insert you want to parallelize
    """Consume ``(query, exec_time)`` tuples from a queue and log them to ``infoTable``.

    Intended to run in its own process. It keeps pulling items from
    ``insert_queue`` until it receives the string ``'STOP'``.

    Args:
        insert_queue: Queue whose items are either a ``(query, exec_time)``
            tuple (query text at position 0, elapsed seconds at position 1)
            or the sentinel string ``'STOP'``.
    """
    connection = sql.connect("db") #initialize your 'writer' with your own connection call
    try:
        while True: #We keep pulling from the pipe
            data = insert_queue.get()
            if data == 'STOP': #Kill instruction to stop our process
                break
            query, exec_time = data #see how select_func stored the tuple
            with connection.cursor() as cursor:
                # Bind the values instead of concatenating them into the SQL:
                # the old code wrapped values in double quotes (identifier
                # quoting), raised TypeError on str + float, and was open to
                # SQL injection. "%s" is the psycopg2 placeholder; other
                # drivers may use a different paramstyle (e.g. "?").
                cursor.execute(
                    'INSERT INTO infoTable ("query", "exec_time") VALUES (%s, %s)',
                    (query, exec_time),
                )
            connection.commit()
    finally:
        # Release the writer connection when the loop ends or fails.
        connection.close()
# Driver: start the background writer process, run the selects, then stop the writer.
if __name__ == '__main__': #Typical python main thread
    query_pipe = Queue() #we initialize a Queue here to feed into your inserting function
    # The target must be the queue-consuming insert_function defined above
    # (not the older two-argument insert_func), and the call needs its closing parenthesis.
    process = Process(target=insert_function, args=(query_pipe,))
    process.start()
    stuff = []
    for i in range(5):
        # select_func returns (rows, (query, elapsed)).
        data, insert_query = select_func()
        stuff.append(data)
        query_pipe.put(insert_query)
    #
    #Do other stuff and even put more stuff into the pipe.
    #
    query_pipe.put('STOP') #we want to stop the writer, so we send the stop command
    process.join()

How to optimize fetching 5 million rows from a cursor

I got a table from MSSQL with 5M rows and when I fetch all the rows of this table, this take me 2~3 minutes. I want (if possible) to optimize that.
That's my code :
cursor.execute("SELECT * FROM MyTable")
rows = cursor.fetchall() # that takes 2~3 minutes
# some code for setup the output that take only few seconds
I already tried, to used :
while True:
rows = cursor.fetchmany(500000)
if not rows:
break
# Do some stuff
And Also with fetchone.
But again i'm between 2-3 mins :/ How to optimize that ? Maybe using thread but I don't know how.
thanks for your help.
I think you can limit the number of lines returned by your query even if you have to make several calls to your database.
About the Threads, you have several solutions:
A single connection but a different cursor for each Thread
One connection for each Thread and one cursor from that connection
In any case you need a ThreadedConnectionPool. Here is a small example of one of the ways to do it
import psycopg2
from psycopg2 import pool
from threading import Thread
from time import sleep
threaded_connection_pool = None
thread_table = list()
def get_new_connection():
    """Wait until the pool can hand out a connection.

    Retries every 10 seconds while the pool raises ``PoolError``.

    Returns:
        A ``(connection, cursor)`` pair; the cursor is opened on the
        returned connection.
    """
    # Read the module-level pool under the name it is actually assigned to
    # (threaded_connection_pool); the previous name was never defined.
    connection = None
    while not isinstance(connection, psycopg2.extensions.connection):
        try:
            connection = threaded_connection_pool.getconn()
        except pool.PoolError:
            sleep(10) # Wait for a free connection
    return connection, connection.cursor()
def thread_target():
    """Thread body: borrow a pooled connection, do the work, give it back."""
    connection, cursor = get_new_connection()
    try:
        # `with connection` ends the transaction (commit, or rollback on
        # error) and `with cursor` closes the cursor; neither one releases
        # the connection itself.
        with connection, cursor:
            # Do some stuff
            pass
    finally:
        # Return the connection, otherwise the pool runs dry once every
        # slot has been handed out.
        threaded_connection_pool.putconn(connection)
threaded_connection_pool = psycopg2.pool.ThreadedConnectionPool(
# YOUR PARAM
)
for counter_thread in range(10):
thread = Thread(
target=thread_target,
name=f"Thread n°{counter_thread}"
)
thread_table.append(thread)
thread.start()
#
# Do many more stuff
#
for thread in thread_table:
thread.join()
# End
I prefer to use the first solution "A single connection but a different cursor for each Thread"
For that : I have to do something like that ?
result = []
cursor = connection.cursor()
def fetch_cursor(cursor):
global result
rows = cursor.fetchall()
if rows:
result += beautify_output(rows)
######### THIS CODE BELOW IS INSIDE A FUNCTION ######
thread_table = []
limit = 1000000
offset = 0
sql = "SELECT * FROM myTABLE"
while True:
try:
cursor.execute(f"{sql} LIMIT {limit} OFFSET {offset}")
except Exception as e:
break
offset += limit
thread = Thread(target=fetch_cursor, args=(cursor,))
thread_table.append(thread)
thread.start()
for thread in thread_table:
thread.join()
print(result)
So something like that should work ? (I will try that tommorow)

More efficient way to query this SQL table from python?

I need to query rows where a column matches my list of ~60K IDs out of a table that contains millions of IDs. I think normally you would insert a temporary table into the database and merge on that but I can't edit this database. I am doing it like this using a loop w/ a python wrapper, but is there a better way? I mean it works, but still:
import pyodbc
import pandas as pd
# connect to the database using windows authentication
conn = pyodbc.connect('DRIVER={SQL Server Native Client 11.0};SERVER=my_fav_server;DATABASE=my_fav_db;Trusted_Connection=yes;')
cursor = conn.cursor()
# read in all the ids
ids_list = [...60K ids in here..]
# query in 10K chunks to prevent memory error
def chunks(l, n):
    """Split ``l`` into consecutive slices of at most ``n`` items each.

    ``n`` is the chunk length, not the number of chunks; values below 1
    are treated as 1. Only the last slice can be shorter than ``n``.
    """
    step = n if n > 1 else 1
    pieces = []
    for start in range(0, len(l), step):
        pieces.append(l[start:start + step])
    return pieces
chunked_ids_lists = chunks(ids_list, 10000)
# looping through to retrieve all cols
for chunk_num, chunked_ids_list in enumerate(chunked_ids_lists):
temp_ids_string = "('" + "','".join(chunked_ids_list) + "')"
temp_sql = f"SELECT * FROM dbo.my_fav_table WHERE ID IN {temp_ids_string};"
temp_data = pd.read_sql_query(temp_sql, conn)
temp_path = f"temp_chunk_{chunk_num}.txt"
temp_data.to_csv(temp_path, sep='\t', index=None)
# read the query chunks
all_data_list = []
for chunk_num in range(len(chunked_ids_lists)):
temp_path = f"temp_chunk_{chunk_num}.txt"
temp_data = pd.read_csv(temp_path, sep='\t')
all_data_list.append(temp_data)
all_data = pd.concat(all_data_list)
Another way is to use Psycopg's cursor.
import psycopg2
# Connect to an existing database
conn = psycopg2.connect("dbname=test user=postgres")
# Open a cursor to perform database operations
cur = conn.cursor()
# Get the data from the query.
# There is no need to build the IN list by hand: psycopg2 adapts a Python
# tuple to a SQL value list. Pass the flat ids as a tuple -- a list would be
# adapted to an ARRAY, which IN does not accept. The tuple must not be empty,
# because "IN ()" is invalid SQL.
cur.execute("SELECT * FROM dbo.my_fav_table WHERE ID IN %(filter)s;", {"filter": tuple(ids_list)})
# Loop over the returned rows
for record in cur:
    # we got one record
    print(record) # or do some other processing
Use parameters rather than concatenating strings.
I don't see the need for the CSV files, if you're just going to read them all into Python in the next loop. Just put everything into all_data_list during the query loop.
# Collect every chunk's rows in memory and build one DataFrame at the end.
# NOTE(review): SQL Server limits a statement to roughly 2100 parameters, so
# chunked_ids_lists should be built with chunks well below that size -- confirm
# against your driver/server.
all_data_list = []
columns = None
for chunk in chunked_ids_lists:
    # One "?" placeholder per id; the values are bound by the driver.
    params = ','.join(['?'] * len(chunk))
    sql = f"SELECT * FROM dbo.my_fav_table WHERE ID IN ({params});"
    cursor.execute(sql, chunk)
    if columns is None:
        # Column names come from the cursor metadata of the first query.
        columns = [col[0] for col in cursor.description]
    # Convert driver row objects to plain tuples so pandas sees one value per column.
    all_data_list.extend(tuple(row) for row in cursor.fetchall())
# The class is pd.DataFrame; pd.dataFrame raises AttributeError.
all_data = pd.DataFrame(all_data_list, columns=columns)

Parallelize all iterations of a for loop

I need to parallelize a for loop to reduce its execution time.
I'm replicating a database to 10 others using pymysql.
from mysql.connector import MySQLConnection, Error
import pymysql
import csv
import config
import multiprocessing
def query_with_fetchall(i):
conn = MySQLConnection(host = config.host,
user = config.user,
password = config.passwd,
db = config.db_name)
cursor = conn.cursor()
sql = '''USE ultimate;
CREATE TABLE IF NOT EXISTS orders_user{0} (id VARCHAR(10), amazon_order_id VARCHAR(100), merchant_order_id VARCHAR(100), purchase$
sqla = ''LOAD DATA LOCAL INFILE '/home/ec2-user/order.csv' INTO TABLE orders_user{0} FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' '''
for result in cursor.execute(sql, multi=True):
pass
cursor.execute(sqla)
conn.commit()
conn.commit()
conn.close()
print("s")
jobs = []
for i in range(2):
try:
p1 = Process(target = query_with_fetchall, args =(i,))
print("done")
jobs.append(p1)
except Exception:
import traceback
print(traceback.format_exc())
pass
for j in jobs:
j.start()
for j in jobs:
j.join
The time taken to replicate 10 tables is approximately 10 minutes, i.e. 1 minute each; I want to replicate all the tables in parallel in about 1 minute.

Process very large 900M row MySQL table line by line with Python

I often need to process several hundred million rows of a MySQL table on a line by line basis using Python. I want a script that is robust and does not need to be monitored.
Below I have pasted a script that classifies the language of the message field in each row. It utilizes the sqlalchemy and MySQLdb.cursors.SSCursor modules. Unfortunately, this script consistently throws a 'Lost connection to MySQL server during query' error after 4840 rows when I run it remotely and after 42000 rows when I run it locally.
Also, I have checked and max_allowed_packet = 32M on my MySQL server's /etc/mysql/my.cnf file as per the answers to this stackoverflow question Lost connection to MySQL server during query
Any advice for either fixing this error, or using another approach to use Python for processing very large MySQL files in a robust way would be much appreciated!
import sqlalchemy
import MySQLdb.cursors
import langid
schema = "twitterstuff"
table = "messages_en" #900M row table
engine_url = "mysql://myserver/{}?charset=utf8mb4&read_default_file=~/.my.cnf".format(schema)
db_eng = sqlalchemy.create_engine(engine_url, connect_args={'cursorclass': MySQLdb.cursors.SSCursor} )
langid.set_languages(['fr', 'de'])
print "Executing input query..."
data_iter = db_eng.execute("SELECT message_id, message FROM {} WHERE langid_lang IS NULL LIMIT 10000".format(table))
def process(inp_iter):
for item in inp_iter:
item = dict(item)
(item['langid_lang'], item['langid_conf']) = langid.classify(item['message'])
yield item
def update_table(update_iter):
count = 0;
for item in update_iter:
count += 1;
if count%10 == 0:
print "{} rows processed".format(count)
lang = item['langid_lang']
conf = item['langid_conf']
message_id = item['message_id']
db_eng.execute("UPDATE {} SET langid_lang = '{}', langid_conf = {} WHERE message_id = {}".format(table, lang, conf, message_id))
data_iter_upd = process(data_iter)
print "Begin processing..."
update_table(data_iter_upd)
According to MySQLdb developer Andy Dustman,
[When using SSCursor,] no new queries can be issued on the connection until
the entire result set has been fetched.
That post says that if you issue another query you will get a "commands out of sequence" error, which is not the error you are seeing. So I am not sure that the following will necessarily fix your problem. Nevertheless, it might be worth trying to remove SSCursor from your code and use the simpler default Cursor just to test if that is the source of the problem.
You could, for example, use LIMIT chunksize OFFSET n in your SELECT statement
to loop through the data set in chunks:
import sqlalchemy
import MySQLdb.cursors
import langid
import itertools as IT
chunksize = 1000
def process(inp_iter):
for item in inp_iter:
item = dict(item)
(item['langid_lang'], item['langid_conf']) = langid.classify(item['message'])
yield item
def update_table(update_iter, engine):
for count, item in enumerate(update_iter):
if count%10 == 0:
print "{} rows processed".format(count)
lang = item['langid_lang']
conf = item['langid_conf']
message_id = item['message_id']
engine.execute(
"UPDATE {} SET langid_lang = '{}', langid_conf = {} WHERE message_id = {}"
.format(table, lang, conf, message_id))
schema = "twitterstuff"
table = "messages_en" #900M row table
engine_url = ("mysql://myserver/{}?charset=utf8mb4&read_default_file=~/.my.cnf"
.format(schema))
db_eng = sqlalchemy.create_engine(engine_url)
langid.set_languages(['fr', 'de'])
# Process the table chunk by chunk until no unclassified rows remain.
# Rows that have been updated no longer match "langid_lang IS NULL", so the
# set of matching rows shrinks after every chunk. Always read the first
# `chunksize` matching rows: advancing an OFFSET as well would skip rows that
# are still unprocessed.
# NOTE: this relies on update_table() storing a non-NULL langid_lang for
# every row it receives; otherwise the same rows would be fetched again.
while True:
    print("Executing input query...")
    result = db_eng.execute(
        "SELECT message_id, message FROM {} WHERE langid_lang IS NULL LIMIT {}"
        .format(table, chunksize))
    # Materialize the chunk so the result set is fully read before the
    # UPDATE statements are issued.
    result = list(result)
    if not result:
        break
    data_iter_upd = process(result)
    print("Begin processing...")
    update_table(data_iter_upd, db_eng)

Categories

Resources