I need to parallelize a for loop to reduce its execution time.
I'm replicating a database to 10 others using pymysql.
from mysql.connector import MySQLConnection, Error
import pymysql
import csv
import config
import multiprocessing
def query_with_fetchall(i):
    """Create ``orders_user<i>`` and bulk-load the order CSV into it.

    Each call opens (and always closes) its own MySQL connection, so the
    function can be used as the target of a separate process.

    Args:
        i: Suffix that selects the destination table ``orders_user<i>``.
    """
    # NOTE(review): the column list was cut off after "purchase" in the
    # original listing; "purchase_date" is a placeholder guess -- restore the
    # real remaining columns so that they match order.csv.
    create_sql = (
        "USE ultimate;\n"
        "CREATE TABLE IF NOT EXISTS orders_user{0} ("
        "id VARCHAR(10), amazon_order_id VARCHAR(100), "
        "merchant_order_id VARCHAR(100), purchase_date VARCHAR(100))"
    ).format(i)
    # The table suffix has to be formatted in; the literal "{0}" is not a
    # valid table name.
    load_sql = (
        "LOAD DATA LOCAL INFILE '/home/ec2-user/order.csv' "
        "INTO TABLE orders_user{0} "
        "FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n'"
    ).format(i)

    # NOTE(review): LOAD DATA LOCAL may additionally need
    # allow_local_infile=True on the connection -- confirm for the connector
    # version in use.
    conn = MySQLConnection(host=config.host,
                           user=config.user,
                           password=config.passwd,
                           db=config.db_name)
    try:
        cursor = conn.cursor()
        # With multi=True, execute() returns an iterator that has to be
        # consumed for every statement in the string to run.
        for _ in cursor.execute(create_sql, multi=True):
            pass
        cursor.execute(load_sql)
        conn.commit()
    finally:
        # Release the connection even when a statement fails.
        conn.close()
    print("s")
if __name__ == "__main__":
    # The guard keeps child processes from re-running this section when the
    # module is re-imported under the "spawn" start method.
    jobs = []
    for i in range(2):
        try:
            # Only the multiprocessing module itself is imported in this
            # script, so Process must be reached through it.
            p1 = multiprocessing.Process(target=query_with_fetchall, args=(i,))
            print("done")
            jobs.append(p1)
        except Exception:
            import traceback
            print(traceback.format_exc())

    # Start every worker before waiting on any of them so that they overlap.
    for j in jobs:
        j.start()
    for j in jobs:
        # join has to be called; a bare attribute access waits for nothing.
        j.join()
The time taken to replicate 10 tables is approximately 10 minutes, i.e. 1 minute each. I want to replicate all the tables in parallel, in about 1 minute.
Related
I am using oracle sql developer and have built a script to read sql queries in parallel as well as under a thread. However, I have noticed no significant difference in speed by implementing this (even with chunksizes), than reading the table directly. Therefore, could my approach be wrong and what's the improvement to my approach, to speed things up?
For example:
#My table size is only 38k rows and this takes ~ 1.2 minutes to run
def table(self, table = None, query = None, chunksize = None):
    """Load the ``NAME`` column of ``table`` and return it as one DataFrame.

    The read is submitted to a thread pool as a single task whose result is
    awaited immediately. When ``query`` is given, or ``table`` is missing,
    only a placeholder message is printed and ``None`` is returned.

    Args:
        table: Name of the table to read; interpolated into the SQL text.
        query: Only tested for ``None`` here.
        chunksize: Forwarded to ``pd.read_sql``.
    """
    from concurrent.futures import ThreadPoolExecutor

    with self._ENGINE.connect() as conn:
        pieces = []
        if query is not None or table is None:
            print('something else')
            return None

        statement = f"SELECT /*+ PARALLEL(16) */ NAME FROM {table}"
        with ThreadPoolExecutor(max_workers = 8) as executor:
            pending = executor.submit(pd.read_sql, statement, conn, chunksize=chunksize)
            # result() blocks until the one submitted read has finished.
            for piece in pending.result():
                pieces.append(piece)
        table = pd.concat([pd.concat([piece]) for piece in pieces])
        conn.close()
        return table
After reading the following documentation:
tuning fetch
It takes approximately 118 seconds for the code above to run, whereas after a slight modification:
def table2(self, table = None, query = None):
# Benchmark helper: times the same SELECT once through a single thread-pool
# task and once through a direct execute().fetchall(), then prints both.
from concurrent.futures import ThreadPoolExecutor
# Fetch-tuning attributes set on the shared cursor before either run.
self._cursor.arraysize = 10000
self._cursor.prefetchrows = 1000000
tables = []
start = time.time()
if query is None and table is not None:
with ThreadPoolExecutor(max_workers = 8) as executor:
# Only one task is submitted, and .result() blocks until it has finished.
for results in executor.submit(self._cursor.execute,f"SELECT /*+ PARALLEL(16) */ NAME FROM {table}").result():
tables.append(results)
end = time.time()
# Second measurement: the same statement executed and fetched directly.
start_second = time.time()
self._cursor.execute(f"SELECT /*+ PARALLEL(16) */ NAME FROM {table}").fetchall()
end_second = time.time()
print("Threadpool time: %s, fetchall time: %s" % (str(end-start), str(end_second-start_second)))
Takes the following time to execute:
Threadpool time: 1.0487918853759766, fetchall time: 0.48572492599487305
Here's an example of fetching data from a single table using multiple connections. Whether it's faster than a single thread doing a full table scan is something for you to check. Maybe Python's GIL is a bottleneck. Maybe your database is on a single disk instead of multiple disks, so there is no extra throughput possible. Maybe the OFFSET/FETCH NEXT and ORDER BY are a limiting factor because the DB is busy doing work for other users (or maybe you're the only user so they are fast). Maybe for you it's what you do with the data when you get it in Python that will be a bottleneck.
Fundamentally from the Oracle side, tuning arraysize will be the biggest factor for any single SELECT that returns a large number of rows over a slower network.
# Fetch batches of rows from a table using multiple connections
import csv
import os
import platform
import threading
import oracledb
# To fetch everything, keep NUM_THREADS * BATCH_SIZE >= TABLE_SIZE
# Number of rows to insert into the demo table
TABLE_SIZE = 10000
# The degree of parallelism / number of connections to open
NUM_THREADS = 10
# How many rows to fetch in each thread
BATCH_SIZE = 1000
# Internal buffer size (assigned to cursor.arraysize): tune this for performance
ARRAY_SIZE = 1000
# Each thread binds :rowoffset and :maxrows to select its own slice of the
# table, ordered by id.
SQL = """
select data
from demo
order by id
offset :rowoffset rows fetch next :maxrows rows only
"""
# Credentials and the connect string are read from the environment.
un = os.environ.get('PYTHON_USERNAME')
pw = os.environ.get('PYTHON_PASSWORD')
cs = os.environ.get('PYTHON_CONNECTSTRING')
# The Oracle Client libraries are initialized only when DRIVER_TYPE is 'thick'.
if os.environ.get('DRIVER_TYPE') == 'thick':
# lib_dir stays None unless one of the platform checks below sets a path.
ld = None
if platform.system() == 'Darwin' and platform.machine() == 'x86_64':
ld = os.environ.get('HOME')+'/Downloads/instantclient_19_8'
elif platform.system() == 'Windows':
ld = r'C:\oracle\instantclient_19_17'
oracledb.init_oracle_client(lib_dir=ld)
# Create a connection pool whose minimum and maximum size both equal NUM_THREADS
pool = oracledb.create_pool(user=un, password=pw, dsn=cs, min=NUM_THREADS, max=NUM_THREADS)
#
# Create the table for the demo
#
def create_schema():
    """Recreate the ``demo`` table and populate it with TABLE_SIZE rows.

    A single anonymous PL/SQL block drops the table (an error with sqlcode
    -942 from the drop is ignored, any other is re-raised), creates it again
    and inserts one row per level up to the bound ``table_size``. The work
    runs on a dedicated connection with autocommit switched on.
    """
    setup_block = """
begin
begin
execute immediate 'drop table demo';
exception when others then
if sqlcode <> -942 then
raise;
end if;
end;
execute immediate 'create table demo (
id number generated by default as identity,
data varchar2(40))';
insert into demo (data)
select to_char(rownum)
from dual
connect by level <= :table_size;
end;"""
    with oracledb.connect(user=un, password=pw, dsn=cs) as connection, \
            connection.cursor() as cursor:
        connection.autocommit = True
        cursor.execute(setup_block, table_size=TABLE_SIZE)
# Write the data to separate CSV files
def do_write_csv(tn):
    """Write one batch of the ``demo`` table to ``emp<tn>.csv``.

    The batch starts at row offset ``tn * BATCH_SIZE`` and holds at most
    BATCH_SIZE rows. The first CSV row contains the column names.

    Args:
        tn: Batch/thread number; selects both the row offset and the name of
            the output file.
    """
    with pool.acquire() as connection, connection.cursor() as cursor:
        cursor.arraysize = ARRAY_SIZE
        # The with-block closes the file even if the query or a fetch raises.
        with open(f"emp{tn}.csv", "w") as f:
            writer = csv.writer(f, lineterminator="\n", quoting=csv.QUOTE_NONNUMERIC)
            cursor.execute(SQL, rowoffset=(tn*BATCH_SIZE), maxrows=BATCH_SIZE)
            col_names = [row[0] for row in cursor.description]
            writer.writerow(col_names)
            while True:
                rows = cursor.fetchmany()  # extra call at end won't incur extra round-trip
                if not rows:
                    break
                writer.writerows(rows)
# Print the data to the terminal
def do_query(tn):
    """Fetch one batch of ``demo`` rows and print every fetched chunk.

    Args:
        tn: Batch/thread number; the batch starts at row offset
            ``tn * BATCH_SIZE`` and the number is echoed in each printed line.
    """
    with pool.acquire() as connection, connection.cursor() as cursor:
        cursor.arraysize = ARRAY_SIZE
        cursor.execute(SQL, rowoffset=(tn*BATCH_SIZE), maxrows=BATCH_SIZE)
        # fetchmany() is called repeatedly until it hands back an empty list;
        # that extra call at the end won't incur an extra round-trip.
        for chunk in iter(cursor.fetchmany, []):
            print(f'Thread {tn}', chunk)
#
# Start the desired number of threads.
#
def start_workload():
    """Run NUM_THREADS worker threads and wait for all of them to finish.

    Thread number ``i`` is passed to ``do_write_csv`` as its batch number.
    """
    # Use target=do_query instead to print the rows rather than write files.
    workers = [
        threading.Thread(target=do_write_csv, args=(number,))
        for number in range(NUM_THREADS)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
# Script entry point: build the demo table first, then run the threaded fetch.
if __name__ == '__main__':
create_schema()
start_workload()
print("All done!")
I have a function that executes a SELECT sql query (using postgresql).
Now, I want to INSERT to some table in my DB the execution time of this query, however, I want to do it in parallel, so that even if my INSERT query is still running I will be able to continue my program and call other functions.
I tried to use multiprocessing.Process; however, my function waits for the process to finish, so I am actually losing the parallelism I wanted.
My code in a nut shell:
def select_func():
# Runs the SELECT, times the execute() call, and hands the query text and
# the elapsed time to insert_func in a separate process before returning
# the fetched rows.
with connection.cursor() as cursor:
query = "SELECT * FROM myTable WHERE \"UserName\" = 'Alice'"
start = time.time()
cursor.execute(query)
end = time.time()
process = Process(target = insert_func, args = (query, (end-start)))
process.start()
# join() blocks here until the child process has exited.
process.join()
return cursor.fetchall()
def insert_func(query, time):
    """Record a query's text and its execution time in ``infoTable``.

    Args:
        query: SQL text that was executed.
        time: Elapsed execution time in seconds. The name is kept for
            callers even though it shadows the ``time`` module in here.
    """
    # The values are bound as parameters instead of being concatenated into
    # the statement: concatenating the float raised TypeError, the literal
    # was split across two lines, and double-quoted values would be read as
    # identifiers by PostgreSQL.
    insert_sql = 'INSERT INTO infoTable ("query", "exec_time") VALUES (%s, %s)'
    with connection.cursor() as cursor:
        cursor.execute(insert_sql, (query, time))
        connection.commit()
Now the problem is that this operation is not really asynchronous, since select_func waits until insert_func is finished. I want the execution of these functions to be independent, so that select_func can end even though insert_func is still running and I can continue and call other functions in my script.
Thanks!
There are quite a lot of issues with your code snippet, but let's at least try to give you a structure to implement.
def select_func():
with connection.cursor() as cursor: # I don't think the same global connection should be used for reading and writing simultaneously
query = "SELECT * FROM myTable WHERE \"UserName\" = 'Alice'" # quotation issues
start = time.time()
cursor.execute(query)
end = time.time()
process = Process(target = insert_func, args = (query, (end-start)))
process.start() # you start the process here, BUT
process.join() # you force Python to wait for it here...
return cursor.fetchall()
def insert_func(query, time):
with connection.cursor() as cursor:
# NOTE(review): the string literal below spans two lines without a
# continuation, and `time` (a float from the caller) is concatenated to str.
query = "INSERT INTO infoTable (\"query\", \"exec_time\")
VALUES (\"" + query + "\", \"" + time + "\")"
cursor.execute(query)
connection.commit()
Consider an alternative:
def select_func():
    """Run the sample SELECT and return its rows together with timing data.

    Returns:
        A 2-tuple ``(rows, (query, elapsed))``: the fetched rows, plus the
        query text (position 0) and the seconds spent in ``execute()``
        (position 1).
    """
    read_con = sql.connect()  # sqlite syntax but use your connection
    try:
        with read_con.cursor() as cursor:
            query = "SELECT * FROM myTable WHERE \"UserName\" = 'Alice'"  # where does Alice come from?
            start = time.time()
            cursor.execute(query)
            end = time.time()
            # Fetch while the cursor is still open.
            return cursor.fetchall(), (query, (end - start))
    finally:
        # A connection is opened on every call, so release it on every call.
        read_con.close()
def insert_function(insert_queue):  # The insert you want to parallelize
    """Consume ``(query, elapsed)`` tuples from a queue and insert each one.

    Runs until the sentinel string ``'STOP'`` arrives on the queue, then
    closes its connection and returns.

    Args:
        insert_queue: Queue that delivers ``(query, elapsed)`` tuples or the
            ``'STOP'`` sentinel.
    """
    # Values are bound as parameters: concatenating the float elapsed time
    # raised TypeError and the original literal was split across two lines.
    # NOTE(review): %s is the psycopg2 placeholder style -- adjust it if a
    # different driver is used.
    insert_query = 'INSERT INTO infoTable ("query", "exec_time") VALUES (%s, %s)'
    connection = sql.connect("db")  # initialize your 'writer'
    try:
        while True:  # keep pulling from the queue
            data = insert_queue.get()  # blocks until something is available
            if data == 'STOP':  # kill instruction that ends the process
                break
            query, elapsed = data[0], data[1]  # see how the tuple was stored
            with connection.cursor() as cursor:
                cursor.execute(insert_query, (query, elapsed))
                connection.commit()
    finally:
        connection.close()
if __name__ == '__main__':  # Typical python main thread
    query_pipe = Queue()  # queue that feeds the inserting process
    # The target is insert_function, the queue consumer; the closing
    # parenthesis of this call was missing.
    process = Process(target=insert_function, args=(query_pipe,))
    process.start()
    stuff = []
    for i in range(5):
        # select_func returns (rows, (query, elapsed)).
        data, insert_query = select_func()
        stuff.append(data)
        query_pipe.put(insert_query)
    #
    # Do other stuff and even put more stuff into the pipe.
    #
    query_pipe.put('STOP')  # tell the inserting process to finish
    process.join()
I have a script that reads some files one by one, cleans them and inserts them into a Postgres database.
I tried to use Python multiprocessing with a Pool, but I found that CPU usage only sometimes reaches 30% and stays at about 6% most of the time, so it is really slow.
Any suggestion for speeding it up ?
thank you
import os
import multiprocessing
# Directory that holds the input files, and the list of entries found in it.
path = 'data/'
arr = os.listdir(path)
# NOTE(review): psycopg2 is used here but is not among the imports shown
# directly above this snippet.
# NOTE(review): the connection and cursor are created at module level, so
# they exist before any pool worker starts -- confirm that sharing them
# between processes is intended.
connection = psycopg2.connect(
user="postgres", password="blabla", host="127.0.0.1", port="5432", database="test"
)
cursor = connection.cursor()
# Upsert keyed on data1.
# NOTE(review): on conflict, data2 is set from EXCLUDED.data1 -- confirm that
# data1 (rather than data2) is intended.
postgres_insert_query = """ INSERT INTO mobile (data1, data2) VALUES (%s,%s)
ON CONFLICT (data1)
DO
UPDATE SET data2 = EXCLUDED.data1 ;"""
def insert_data(key,record_to_insert, item):
    """Upsert one batch of rows into the ``mobile`` table.

    A failure is reported on stdout rather than raised, and the transaction
    is rolled back so that later batches can still use the connection.

    Args:
        key: Value echoed on stdout before the batch is sent.
        record_to_insert: Sequence of ``(data1, data2)`` tuples.
        item: Name of the source file; only used in the success message.
    """
    print(key)
    try:
        cursor.executemany(postgres_insert_query, record_to_insert)
        connection.commit()
    except Exception as error:
        # Without a rollback the connection stays in an aborted transaction
        # and every following batch on it fails as well.
        connection.rollback()
        print("Failed to insert record into mobile table", error)
    else:
        count = cursor.rowcount
        print(count, "Record inserted successfully into mobile table", item)
i = 1


def process_data(item):
    """Clean every line of one input file and insert the rows in batches.

    Args:
        item: File name, relative to ``path``.
    """
    print(item)
    global i
    # NOTE(review): i is a module-level counter that is part of every key
    # below; each pool worker process holds its own copy, so values can
    # repeat across processes -- confirm that this is acceptable.
    i += 1
    batch_size = 50000
    records = []
    with open(os.path.join(path, item), 'r') as file:
        for line in file:
            line = dataCleansing(line)
            records.append((line+'-'+str(i),'data2-'+str(i)+line))
            if len(records) == batch_size:
                insert_data(i, records, item)
                records = []
    # Flush the final partial batch; skip the database call when the file
    # length was an exact multiple of the batch size (or the file was empty).
    if records:
        insert_data(i, records, item)
if __name__ == '__main__':
    # The with-block shuts the worker processes down once map() has returned
    # instead of leaving the pool open until interpreter exit.
    with multiprocessing.Pool(6) as a_pool:
        result = a_pool.map(process_data, arr)
My use case is to create a temp table in the Postgres database, fetch records from it, and insert them into a different table.
The code i used is:
import psycopg2
import sys
import pprint
from __future__ import print_function
from os.path import join,dirname,abspath
import xlrd
import os.path
# Module-level accumulators. itemidlist collects the item ids read from the
# spreadsheet in main(); newlist is not referenced in the code shown here.
newlist = []
itemidlist = []
def main():
# Connects to the database, reads item ids from the first sheet of an Excel
# workbook, rebuilds public.temp_mbp1 with SELECT ... INTO for the entered
# dealer/group ids, and prints the row count of that table.
conn_string = "host='prod-dump.cvv9i14mrv4k.us-east-1.rds.amazonaws.com' dbname='ebdb' user='ebroot' password='*********'"
# print the connection string we will use to connect
# print "Connecting to database" % (conn_string)
# get a connection, if a connect cannot be made an exception will be raised here
conn = psycopg2.connect(conn_string)
# conn.cursor will return a cursor object, you can use this cursor to perform queries
cursor = conn.cursor()
dealer_id = input("Please enter dealer_id: ")
group_id = input("Please enter group_id: ")
# NOTE(review): dirname() receives the literal string '__file__', not the
# module's __file__ attribute.
scriptpath = os.path.dirname('__file__')
filename = os.path.join(scriptpath, 'Winco - Gusti.xlsx')
xl_workbook = xlrd.open_workbook(filename, "rb")
xl_sheet = xl_workbook.sheet_by_index(0)
print('Sheet Name: %s' % xl_sheet.name)
row=xl_sheet.row(0)
from xlrd.sheet import ctype_text
print('(Column #) type:value')
for idx, cell_obj in enumerate(row):
cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
#print('(%s) %s %s' % (idx, cell_type_str, cell_obj.value))
num_cols = xl_sheet.ncols
for row_idx in range(0, xl_sheet.nrows): # Iterate through rows
num_cols = xl_sheet.ncols
# The item id is read from column index 1 of every row.
id_obj = xl_sheet.cell(row_idx, 1) # Get cell object by row, col
itemid = id_obj.value
#if itemid not in itemidlist:
itemidlist.append(itemid)
# execute our Query
# (the triple-quoted block below is a bare string literal, i.e. disabled code)
'''
cursor.execute("""
if not exists(SELECT 1 FROM model_enable AS c WHERE c.name = %s);
BEGIN;
INSERT INTO model_enable (name) VALUES (%s)
END;
""" %(itemid,itemid))
'''
cursor.execute("drop table temp_mbp1")
try:
# NOTE(review): the execute( call below is never closed with ")" after the
# % tuple, and the ids are interpolated with % instead of being passed as
# query parameters.
cursor.execute("SELECT p.model_no, pc.id as PCid, g.id AS GROUPid into public.temp_mbp1 FROM products p, \
model_enable me, products_clients pc, groups g WHERE p.model_no = me.name \
and p.id = pc.product_id and pc.client_id = %s and pc.client_id = g.client_id and g.id = %s"\
% (dealer_id,group_id)
except (Exception, psycopg2.DatabaseError) as error:
print(error)
cursor.execute("select count(*) from public.temp_mbp1")
# retrieve the records from the database
records = cursor.fetchall()
# print out the records using pretty print
# note that the NAMES of the columns are not shown, instead just indexes.
# for most people this isn't very useful so we'll show you how to return
# columns as a dictionary (hash) in the next example.
pprint.pprint(records)
# NOTE(review): no conn.commit() is issued anywhere in this function.
# Run main() only when the file is executed as a script.
if __name__ == "__main__":
main()
The try/except block in the middle of the program does not throw any error, but the table is not created in the Postgres database, as far as I can see in the database admin tool.
The output shown is:
Please enter dealer_id: 90
Please enter group_id: 13
Sheet Name: Winco Full 8_15_17
(Column #) type:value
[(3263,)]
Thanks,
Santosh
You didn't commit the changes, so they aren't saved in the database. Add to the bottom, just below the pprint statement:
conn.commit()
I'm migrating a script from another language to Python. I watered this down on the specifics of the database calls etc., but this is what the file looks like. I intentionally made some queries fail while I was testing the transaction, and it did not rollback() the queries executed prior to the forced error. I am a little confused as to how transactions work with Python. The example I followed was this one: it was a loop with several queries nested within transactions, so I adapted the code according to what I understood from it.
#!/usr/bin/python
import MySQLdb
import thread
import os
# Open database connection
# added local_infile=1 to allow the import to work, otherwise you get an error
# (the connection arguments are elided in this listing)
db = MySQLdb.connect(CONNECTION ARGS...)
# define our function that will be called from our thread
def import_queued_file(conn,distid):
# Imports the queued file(s) of one distributor: for every file record found
# it (re)creates a temp table, loads the file with LOAD DATA, removes invalid
# rows, counts the newly imported lines and updates the file settings, then
# commits; any exception triggers a rollback.
#   conn   -- open database connection used for every statement
#   distid -- distributor id (not referenced in the code shown; the queries
#             are elided)
# prepare a cursor object using cursor() method
cursor = conn.cursor()
# total lines imported for all files for a distributor
total_lines_imported = 0
# current lines imported for each file on each iteration
current_lines_imported = 0
# default this to 0, this will have the total lines for our imports on each iteration
previous_lines_imported = 0
# initialize the file exists flag to 0
file_exists = 0
# sql statement to retrieve the file(s) for a specific distributor
sql = """
SELECT
...
FROM ...
WHERE ...
"""
# execute the sql statement
cursor.execute(sql)
# if we have records, execute the code below
if (cursor.rowcount > 0):
# set the records to the files variable
files = cursor.fetchall()
# set a variable to count iterations
# we'll use this to determine if we need to drop the table
cnt = 0
# keep track of the total number of lines imported per distributor (may be multiple files)
lines_imported = 0
# loop the recordset
for col in files:
# increment the cnt variable
cnt += 1
# set file_exists to 0 at the beginning of the iteration
file_exists = 0
# set some variables to be used in our sql load data statement
var1 = col[1]
var2 = col[2]
....
# this is the path of our file that we will be using for MySQL LOAD DATA also
# TODO: REFACTOR SO THAT THE /home/webex/backup/ IS NOT HARD CODED
inventoryfile = "/path/to/file/%s" % (filepath)
# check to see if we have a file
if (os.path.exists(inventoryfile)):
try:
# set file exists to true
file_exists = 1
# if cnt > 1, it means we have more than 1 file for this distributor
# only drop the table if this is the first iteration
if (cnt == 1):
# drop table sql statement
sql = "DROP TABLE IF EXISTS %s" % (temptable)
# execute the sql command
cur = conn.cursor()
cur.execute(sql)
cur.close()
# assign the create table statement to the sql variable
# NOTE(review): the table is created with ENGINE=MyISAM; MyISAM tables are
# not transactional, so conn.rollback() cannot undo changes made to them
# (switching to InnoDB is what resolved the rollback problem reported for
# this script).
sql = """
CREATE TABLE IF NOT EXISTS
.......
.......
) ENGINE=MyISAM DEFAULT CHARSET=utf8
""" % (temptable)
# execute the sql statement
cur = conn.cursor()
cur.execute(sql)
cur.close()
# query the temptable to see if we have any records
sql = "SELECT COUNT(0) AS total FROM %s" % (temptable)
cur = conn.cursor()
cur.execute(sql)
# NOTE(review): the cursor is closed before fetchall() is called on it.
cur.close()
# get the count of how many records exist in the database
number_of_line_items = cur.fetchall()
previous_lines_imported = number_of_line_items[0][0]
# load data local infile sql statement
sql = """
LOAD DATA LOCAL INFILE ...
"""
# execute the load data infile sql statement
cur = conn.cursor()
cur.execute(sql)
cur.close()
# clean up the table by removing...
# rows that don't have a part_number,
# rows that have part_number's less than 3 characters
sql = """
DELETE FROM ...
""" % (temptable)
# execute the delete query
cur = conn.cursor()
cur.execute(sql)
cur.close()
# query the temptable to see if we have any records after the import
sql = "SELECT COUNT(0) AS total FROM %s" % (temptable)
# execute the count query
cur = conn.cursor()
cur.execute(sql)
# NOTE(review): the cursor is closed before fetchall() is called on it.
cur.close()
# get the count of how many records exist in the database after the import
number_of_line_items = cur.fetchall()
# get the current lines imported
current_lines_imported = number_of_line_items[0][0] - previous_lines_imported
# add the current lines imported to the total lines imported
total_lines_imported += current_lines_imported
# update distributor_file_settings table last_updated_on field
sql = """
UPDATE ...
""" % (file_id,distributor__id)
print sql
# execute the update query
cur = conn.cursor()
cur.execute(sql)
cur.close()
# commit the transaction
conn.commit()
# NOTE(review): the bare except rolls back without reporting which error
# occurred.
except:
conn.rollback()
# no records exists for this distributor
else:
print "dist doesn't exist"
cursor.close()
# run the import for distributor id 42 on the shared connection
import_queued_file(db,42)
# prepare a cursor object using cursor() method
cursor = db.cursor()
# select distinct file settings
# (the query is elided here and is not executed in the code shown)
sql = """
SELECT ...
"""
# disconnect from server
db.close()
After reviewing the code again and again, the issue happened to be the table type. After changing it to INNODB it worked as expected.