I am trying to concurrently run inserts/updates against a Redshift database using a Python script on AWS Glue. I am using the pg8000 library for all my database operations. The concurrent insert/update fails with an error (Error Name: 1023, Error State: XX000). While researching the error I found that it is related to Serializable Isolation.
Can anyone look at the code and confirm there would not be clashes while the inserts/updates happen?
I tried using a random sleep time within the calling class. It worked for a couple of cases, but as the number of workers increased it failed for an insert/update case.
import sys
import time
import concurrent.futures
import pg8000
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ['TempDir','JOB_NAME','REDSHIFT_HOST','REDSHIFT_PORT','REDSHIFT_DB','REDSHIFT_USER_NAME','REDSHIFT_USER_PASSWORD'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
job_run_id = args['JOB_RUN_ID']
maximum_workers = 5
def executeSql(sqlStmt):
    conn = pg8000.connect(database=args['REDSHIFT_DB'], user=args['REDSHIFT_USER_NAME'], password=args['REDSHIFT_USER_PASSWORD'], host=args['REDSHIFT_HOST'], port=int(args['REDSHIFT_PORT']))
    conn.autocommit = True
    cur = conn.cursor()
    cur.execute(sqlStmt)
    cur.close()
    conn.close()
def executeSqlProcedure(procedureName, procedureArgs=""):
    try:
        logProcStrFormat = "CALL table_insert_proc('{}','{}','{}','{}',{},{})"
        # Insert into the log table - create the record
        executeSql(logProcStrFormat.format(job_run_id, procedureName, 'pending', '', 'getdate()', 'null'))  # Code fails here
        # Executing the procedure
        procStrFormat = "CALL {}({})"
        executeSql(procStrFormat.format(procedureName, procedureArgs))
        print("Printing from {} process at ".format(procedureName), time.ctime())
        # Update the record in log table to complete
        executeSql(logProcStrFormat.format(job_run_id, procedureName, 'complete', '', 'null', 'getdate()'))  # Code fails here
    except Exception as e:
        errorMsg = str(e.message["M"])
        executeSql(logProcStrFormat.format(job_run_id, procedureName, 'failure', errorMsg, 'null', 'getdate()'))
        raise
        sys.exit(1)
def runDims():
    dimProcedures = ["test_proc1", "test_proc2", "test_proc3", "test_proc4", "test_proc5"]
    with concurrent.futures.ThreadPoolExecutor(max_workers=maximum_workers) as executor:
        result = list(executor.map(executeSqlProcedure, dimProcedures))
def runFacts():
    factProcedures = ["test_proc6", "test_proc7", "test_proc8", "test_proc9"]
    with concurrent.futures.ThreadPoolExecutor(max_workers=maximum_workers) as executor:
        result = list(executor.map(executeSqlProcedure, factProcedures))
runDims()
runFacts()
I expect the inserts/updates into the log table to occur without locking or erroring out.
Amazon Redshift does not work well with lots of small INSERT statements.
From Use a Multi-Row Insert - Amazon Redshift:
If a COPY command is not an option and you require SQL inserts, use a multi-row insert whenever possible. Data compression is inefficient when you add data only one row or a few rows at a time.
Multi-row inserts improve performance by batching up a series of inserts. The following example inserts three rows into a four-column table using a single INSERT statement. This is still a small insert, shown simply to illustrate the syntax of a multi-row insert.
insert into category_stage values
(default, default, default, default),
(20, default, 'Country', default),
(21, 'Concerts', 'Rock', default);
Alternatively, output the data to Amazon S3, then perform a bulk load using the COPY command. This will be much more efficient because it can perform the load in parallel across all nodes.
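For illustration, here is a minimal sketch of that COPY approach, using the same pg8000 connection style as the question. The connection details, bucket, prefix, IAM role, and table name are all placeholders to be adapted to your environment.
import pg8000

# Sketch: bulk-load a staged file from S3 with COPY instead of many small INSERTs.
# Connection parameters, bucket, prefix and IAM role below are placeholders.
conn = pg8000.connect(database='mydb', user='user', password='secret',
                      host='redshift-host', port=5439)
conn.autocommit = True
cur = conn.cursor()
cur.execute("""
    COPY my_schema.my_table
    FROM 's3://my-bucket/staging/my_table_'
    IAM_ROLE 'arn:aws:iam::123456789012:role/my-redshift-copy-role'
    FORMAT AS CSV
    GZIP;
""")
cur.close()
conn.close()
Because COPY reads the files in parallel across all nodes, a single COPY of a staged batch is typically far faster than issuing the equivalent rows as individual INSERT statements.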
I am having problems loading data into an Access database. For testing purposes I built a little convert function that takes all datasets from an HDF file and writes them into the accdb. Without the @event.listens_for(engine, "before_cursor_execute") functionality it works, but very slowly. With it, the behavior is odd: it creates only one empty table (from the first df) in the db and finishes execution. The for-loop never finishes and no error is raised.
Maybe it's because the sqlalchemy-access package doesn't support fast_executemany, but I couldn't find any related information about it. Does anyone have some input on how I can solve this, or otherwise write the data into the db faster?
Big thanks!
import urllib
from pathlib import Path

import pandas as pd
from sqlalchemy import create_engine, event
# PATHS
HOME = Path(__file__).parent
DATA_DIR = HOME / 'output'
FILE_ACCESS = DATA_DIR / 'db.accdb'
FILE_HDF5 = DATA_DIR / 'Data.hdf'
# FUNCTIONS
def convert_from_hdf_to_accb():
    # https://github.com/gordthompson/sqlalchemy-access/wiki/Getting-Connected
    driver = '{Microsoft Access Driver (*.mdb, *.accdb)}'
    conn_str = 'DRIVER={};DBQ={};'.format(driver, FILE_ACCESS)
    conn_url = "access+pyodbc:///?odbc_connect={}".format(urllib.parse.quote_plus(conn_str))
    # https://medium.com/analytics-vidhya/speed-up-bulk-inserts-to-sql-db-using-pandas-and-python-61707ae41990
    # https://github.com/pandas-dev/pandas/issues/15276
    # https://stackoverflow.com/questions/48006551/speeding-up-pandas-dataframe-to-sql-with-fast-executemany-of-pyodbc
    engine = create_engine(conn_url)

    @event.listens_for(engine, "before_cursor_execute")
    def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
        if executemany:
            cursor.fast_executemany = True

    with pd.HDFStore(path=FILE_HDF5, mode="r") as store:
        for key in store.keys():
            df = store.get(key)
            df.to_sql(name=key, con=engine, index=False, if_exists='replace')

    print(' IT NEVER REACHES AND DOESNT RAISE AN ERROR :( ')
# EXECUTE
if __name__ == "__main__":
    convert_from_hdf_to_accb()
Maybe it’s because the sqlalchemy-access package doesn’t support fast_executemany
That is true. pyodbc's fast_executemany feature requires that the driver support an internal ODBC mechanism called "parameter arrays", and the Microsoft Access ODBC driver does not support them.
See also
https://github.com/mkleehammer/pyodbc/wiki/Driver-support-for-fast_executemany
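Since the Access ODBC driver cannot use parameter arrays, the fast_executemany listener can simply be dropped; pandas then falls back to ordinary executemany() calls, and chunksize is the only remaining knob. A minimal sketch, reusing the question's driver string and paths as placeholders:
import urllib
import pandas as pd
from sqlalchemy import create_engine

# Fallback sketch: no fast_executemany listener (it has no effect with the
# Access ODBC driver). Paths below are the question's placeholders.
driver = '{Microsoft Access Driver (*.mdb, *.accdb)}'
conn_str = 'DRIVER={};DBQ={};'.format(driver, 'output/db.accdb')
engine = create_engine("access+pyodbc:///?odbc_connect={}".format(urllib.parse.quote_plus(conn_str)))

with pd.HDFStore(path='output/Data.hdf', mode="r") as store:
    for key in store.keys():
        df = store.get(key)
        # chunksize only bounds how many rows go into each executemany() call.
        df.to_sql(name=key, con=engine, index=False, if_exists='replace', chunksize=1000)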
I need to embarrassingly parallelize the fetching of thousands of SQL queries from a database.
Here is a simplified example.
##Env info: python=3.7 postgresql=10 dask=latest
##generate the example db table.
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
engine = create_engine('postgresql://dbadmin:dbadmin@server:5432/db01')
data = pd.DataFrame(np.random.randint(0,100 , size=(30000,5)),columns=['a','b','c','d','e'])
data.to_sql('tablename',engine,index=True,if_exists='append')
First, this is the basic example without dask parallelism.
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
engine = create_engine('postgresql://dbadmin:dbadmin@server:5432/db01')

def job(indexstr):
    'send the query, fetch the data, do some calculation and return'
    sql = 'select * from public.tablename where index=' + indexstr
    df = pd.read_sql_query(sql, engine, index_col='index')
    # get the data and do some analysis.
    return np.sum(df.values)

lists = []
for v in range(1000):
    lists.append(job(str(v)))
### wall time:17s
It's not as fast as we'd expect, since both the database query and the data analysis take time while CPUs sit idle.
Then I tried to use dask to parallelize it, like this.
def jobWithEngine(indexstr):
    '''The engine cannot be serialized between processes, so each call creates its own.'''
    engine = create_engine('postgresql://dbadmin:dbadmin@server:5432/db01')
    sql = 'select * from public.tablename where index=' + indexstr
    df = pd.read_sql_query(sql, engine, index_col='index')
    return np.sum(df.values)
import dask
dask.config.set(scheduler='processes')
import dask.bag as db
dbdata=db.from_sequence([str(v) for v in range(1000)])
dbdata=dbdata.map(lambda x:jobWithEngine(x))
results_bag = dbdata.compute()
###Wall time:1min8s
The problem is that the engine creation takes more time, and there are thousands of them.
The engine is recreated for every SQL query, which is really costly and might crash the database service!
So I guess there must be a more elegant way, something like this:
import dask
dask.config.set(scheduler='processes')
import dask.bag as db
dbdata=db.from_sequence([str(v) for v in range(1000)])
dbdata=dbdata.map(lambda x:job(x,init=create_engine))
results_bag = dbdata.compute()
1. The main process creates 8 subprocesses.
2. Each subprocess creates its own engine to initialize the job preparation.
3. The main process then sends them 1000 jobs and gets the 1000 results back.
4. After everything is done, the subprocess engines are stopped and the subprocesses are killed.
Or has dask already done this, and the additional time comes from communication between processes?
You can do this by setting a connected database as a variable on each worker using get_worker:
from dask.distributed import get_worker
def connect_worker_db(db):
    worker = get_worker()
    worker.db = db  # DB settings, password, username etc.
    worker.db.connect()  # Function that connects the database, e.g. create_engine()
Then have the client run the connect_worker_db:
from dask.distributed import Client, get_worker
client = Client()
client.run(connect_worker_db, db)
For the function using the connection, like jobWithEngine(), you have to get the worker and use the attribute you saved it to:
def jobWithEngine():
    db = get_worker().db
Then make sure to disconnect at the end:
def disconnect_worker_db():
    worker = get_worker()
    worker.db.disconnect()
client.run(disconnect_worker_db)
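Putting the pieces together for the question's use case, here is a minimal sketch, assuming a SQLAlchemy engine is what gets attached to each worker (the connection string is the question's placeholder):
import numpy as np
import pandas as pd
from dask.distributed import Client, get_worker
from sqlalchemy import create_engine

def connect_worker_db():
    # One engine per worker, created once and reused by every task on that worker.
    get_worker().db = create_engine('postgresql://dbadmin:dbadmin@server:5432/db01')

def jobWithEngine(indexstr):
    # Reuse the engine that connect_worker_db attached to this worker.
    engine = get_worker().db
    df = pd.read_sql_query('select * from public.tablename where index=' + indexstr,
                           engine, index_col='index')
    return np.sum(df.values)

def disconnect_worker_db():
    get_worker().db.dispose()  # SQLAlchemy engines are released with dispose()

client = Client()
client.run(connect_worker_db)
results = client.gather(client.map(jobWithEngine, [str(v) for v in range(1000)]))
client.run(disconnect_worker_db)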
Amy's answer has the benefit of being simple, but if for any reason dask starts new workers, they will not have .db.
I don't know when it was first introduced, but Dask 1.12.2 has Client.register_worker_callbacks, which takes a function as a parameter and is intended for exactly this kind of use. If the callback takes a parameter called dask_worker, the worker itself will be passed in.
def main():
    dask_client = dask.distributed.Client(cluster)
    db = dict(
        host="db-host",
        username="user",
        # etc etc
    )

    def worker_setup(dask_worker: dask.distributed.Worker):
        dask_worker.db = db

    dask_client.register_worker_callbacks(worker_setup)
https://distributed.dask.org/en/latest/api.html#distributed.Client.register_worker_callbacks
However, this doesn't close the db connections at the end. You will probably be covered by client.run(disconnect_worker_db), but I have seen some workers not releasing their resources. Fixing this more comprehensively needs a bit more code, as per https://distributed.dask.org/en/latest/api.html#distributed.Client.register_worker_plugin
import os
import dask.distributed

class MyWorkerPlugin(dask.distributed.WorkerPlugin):
    def __init__(self, *args, **kwargs):
        self.db = kwargs.get("db")
        assert self.db, "no db"

    def setup(self, worker: dask.distributed.Worker):
        worker.db = self.db

    def teardown(self, worker: dask.distributed.Worker):
        print(f"worker {worker.name} teardown")
        # eg db.disconnect()

def main():
    cluster = dask.distributed.LocalCluster(
        n_workers=os.cpu_count(),
        threads_per_worker=2,
    )
    dask_client = dask.distributed.Client(cluster)
    db = dict(
        host="db-host",
        username="user",
        # etc etc
    )
    dask_client.register_worker_plugin(MyWorkerPlugin, "set-dbs", db=db)
    dask_client.start()
You can give the plugin somewhat helpful names, and pass in kwargs to be used in the plugin's __init__.
I am setting up a new computer at work, and after installing Anaconda and the various other packages I have on my other computer, I am attempting to run some code that works fine on that machine.
However, when trying to use SQLAlchemy to import into Redshift, I am getting a new error that I can't find anything about via Google:
'SQLTable' object has no attribute 'insert_statement'
This appears to be some issue with pandas.io.sql, but I have no clue what.
Here is the code block:
import io
from pandas.io.sql import SQLTable

def _execute_insert(self, conn, keys, data_iter):
    print("Using monkey-patched _execute_insert")
    data = [dict((k, v) for k, v in zip(keys, row)) for row in data_iter]
    conn.execute(self.insert_statement().values(data))

SQLTable._execute_insert = _execute_insert

import time
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import text

dbschema = 'xref'
engine = create_engine('not_showing_you_this_part',
                       connect_args={'options': '-csearch_path={}'.format(dbschema)})

# test
from sqlalchemy import event, create_engine

@event.listens_for(engine, 'before_cursor_execute')
def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
    if executemany:
        cursor.fast_executemany = True
        cursor.commit()
# end test

api_start_time = time.time()
print('starting SQL query')
# change yh to the dataframe you want to upload
# under name = : enter the name of the table you want to create or append to
df.to_sql(name='computer_test', con=engine, if_exists='append', index=False)
print('sql insert took: ' + str((time.time() - api_start_time)) + ' seconds')
For reference, the monkey-patch part is from: How to speed up insertion from pandas.DataFrame .to_sql
The full error is in the attached image.
I kept searching for the answer, only to see that a gentleman had answered your question in the comments.
I was using a very similar code for connecting and inserting to Redshift.
And the mistake I was committing was to use the below
conn.execute(self.insert_statement().values(data))
Replace the above with the code below:
conn.execute(self.table.insert().values(data))
Shoutout to https://stackoverflow.com/users/6560549/supershoot for answering it in the comments.
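For completeness, here is what the monkey-patch from the question looks like with that one line swapped, as a minimal sketch (everything else is unchanged from the question's code):
from pandas.io.sql import SQLTable

def _execute_insert(self, conn, keys, data_iter):
    # Same patch as in the question, but using self.table.insert() instead of
    # the insert_statement() helper that newer pandas versions no longer provide.
    print("Using monkey-patched _execute_insert")
    data = [dict((k, v) for k, v in zip(keys, row)) for row in data_iter]
    conn.execute(self.table.insert().values(data))

SQLTable._execute_insert = _execute_insert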
I'm trying to do some testing on our JDBC driver using Python.
Initially figuring out JPype, I eventually managed to connect the driver and execute select queries like so (reproducing a generalized snippet):
from __future__ import print_function
from jpype import *

# Start JVM, attach the driver jar
jvmpath = 'path/to/libjvm.so'
classpath = 'path/to/JDBC_Driver.jar'
startJVM(jvmpath, '-ea', '-Djava.class.path=' + classpath)

# Magic line 1
driver = JPackage('sql').Our_Driver

# Initiating a connection via DriverManager()
jdbc_uri = 'jdbc:our_database://localhost:port/database'
conn = java.sql.DriverManager.getConnection(jdbc_uri, 'user', 'passwd')

# Executing a statement
stmt = conn.createStatement()
rs = stmt.executeQuery('select top 10 * from some_table')

# Extracting results
while rs.next():
    ''' Magic #2 - rs.getStuff() only works inside a while loop '''
    print(rs.getString('col_name'))
However, I've failed to do batch inserts, which is what I wanted to test. Even though executeBatch() returned a jpype int[], which should indicate a successful insert, the table was not updated.
I then decided to try out py4j.
My plight - I'm having a hard time figuring out how to do the same thing as above. It is said py4j does not start a JVM on its own, and that the Java code needs to be prearranged with a GatewayServer(), so I'm not sure it's even feasible.
On the other hand, there's a library named py4jdbc that does just that.
I tinkered through the dbapi.py code but didn't quite understand the flow, and am pretty much jammed.
If anyone understands how to load a JDBC driver from a .jar file with py4j and can point me in the right direction, I'd be very grateful.
Add a commit after adding the records and before retrieving:
conn.commit()
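To make that concrete for the JPype batch-insert case in the question, a minimal sketch might look like the following; conn is the connection from the question, the table and columns are made up, and the key point is turning off auto-commit and committing after executeBatch():
# Sketch of a JDBC batch insert via JPype; table/column names are hypothetical.
conn.setAutoCommit(False)

stmt = conn.prepareStatement('insert into some_table (col_a, col_b) values (?, ?)')
for a, b in [('x', 1), ('y', 2)]:
    stmt.setString(1, a)
    stmt.setInt(2, b)
    stmt.addBatch()

stmt.executeBatch()
conn.commit()   # without this, the inserted rows may never become visible
stmt.close()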
I met a similar problem in Airflow. I used the Teradata JDBC jars and jaydebeapi to connect to a Teradata database and execute SQL:
[root@myhost transfer]# cat test_conn.py
import jaydebeapi
from contextlib import closing

jclassname = 'com.teradata.jdbc.TeraDriver'
jdbc_driver_loc = '/opt/spark-2.3.1/jars/terajdbc4-16.20.00.06.jar,/opt/spark-2.3.1/jars/tdgssconfig-16.20.00.06.jar'
jdbc_driver_name = 'com.teradata.jdbc.TeraDriver'

host = 'my_teradata.address'
url = 'jdbc:teradata://' + host + '/TMODE=TERA'
login = "teradata_user_name"
psw = "teradata_passwd"
sql = "SELECT COUNT(*) FROM A_TERADATA_TABLE_NAME where month_key='202009'"

conn = jaydebeapi.connect(jclassname=jdbc_driver_name,
                          url=url,
                          driver_args=[login, psw],
                          jars=jdbc_driver_loc.split(","))

with closing(conn) as conn:
    with closing(conn.cursor()) as cur:
        cur.execute(sql)
        print(cur.fetchall())
[root@myhost transfer]# python test_conn.py
[(7734133,)]
[root@myhost transfer]#
In py4j, with your respective JDBC uri:
from py4j.java_gateway import JavaGateway

# Open a JVM with the JDBC jar on the classpath
jdbc_jar_path = '/path/to/jdbc_driver.jar'
gateway = JavaGateway.launch_gateway(classpath=jdbc_jar_path)

# Load the JDBC driver class
jdbc_class = "com.vendor.VendorJDBC"
gateway.jvm.Class.forName(jdbc_class)

# Initiate connection
jdbc_uri = "jdbc://vendor:192.168.x.y:zzzz;..."
con = gateway.jvm.java.sql.DriverManager.getConnection(jdbc_uri)

# Run a query
sql = "select this from that"
stmt = con.createStatement()
rs = stmt.executeQuery(sql)
while rs.next():
    rs.getInt(1)
    rs.getFloat(2)
    # ... read the remaining columns
rs.close()
stmt.close()
I'm having a hard time figuring out how to develop phase 3 of this algorithm:
1. Fetch data from a series of APIs
2. Store the data in the script until a certain condition is reached (cache it, don't disturb the DB)
3. Push that structured data to the database AND at the same time continue with 1 (launch 1 without waiting for the upload to the DB to complete; the two things should run in parallel)
import requests
import time
from sqlalchemy import schema, types
from sqlalchemy.engine import create_engine
import threading
# I usually work on postgres
meta = schema.MetaData(schema="example")
# table one
table_api_one = schema.Table('api_one', meta,
    schema.Column('id', types.Integer, primary_key=True),
    schema.Column('field_one', types.Unicode(255), default=u''),
    schema.Column('field_two', types.BigInteger()),
)
# table two
table_api_two = schema.Table('api_two', meta,
    schema.Column('id', types.Integer, primary_key=True),
    schema.Column('field_one', types.Unicode(255), default=u''),
    schema.Column('field_two', types.BigInteger()),
)
# create tables
engine = create_engine("postgres://......", echo=False, pool_size=15, max_overflow=15)
meta.bind = engine
meta.create_all(checkfirst=True)
# get the data from the API and return data as JSON
def getdatafrom(url):
    data = requests.get(url)
    structured = data.json()
    return structured
# push the data to the DB
def flush(list_one, list_two):
    connection = engine.connect()
    # both lists are lists of json
    connection.execute(table_api_one.insert(), list_one)
    connection.execute(table_api_two.insert(), list_two)
    connection.close()
# start doing something
def main():
    timesleep = 30
    flush_limit = 10
    threading.Timer(timesleep * flush_limit, main).start()
    data_api_one = []
    data_api_two = []
    # repeat the process 10 times (flush_limit), avoiding keeping the DB too busy
    while len(data_api_one) < flush_limit and len(data_api_two) < flush_limit:
        data_api_one.append(getdatafrom("http://www.apiurlone.com/api...").copy())
        data_api_two.append(getdatafrom("http://www.apiurltwo.com/api...").copy())
        time.sleep(timesleep)
    # push the data when the limit is reached
    flush(data_api_one, data_api_two)
# start the example
main()
In this example script, a new main() thread is launched every 10 * 30 seconds (to avoid overlapping threads),
but with this algorithm the script stops collecting data from the APIs while flush() runs.
How is it possible to flush and keep getting data from the APIs continuously?
Thanks!
The usual approach is a Queue object (from the module named Queue or queue, depending on the Python version).
Create a producer function (running in one thread) that collects the API data and, when it is time to flush, puts it in the queue, and a consumer function (running in another thread) that waits to get data from the queue and stores it in the database.
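A minimal sketch of that producer/consumer split, reusing the getdatafrom() and flush() functions from the question (URLs and limits are the question's placeholders):
import queue
import threading
import time

flush_queue = queue.Queue()

def producer(timesleep=30, flush_limit=10):
    # Collect API data and hand each full batch to the consumer via the queue.
    while True:
        data_api_one, data_api_two = [], []
        while len(data_api_one) < flush_limit:
            data_api_one.append(getdatafrom("http://www.apiurlone.com/api..."))
            data_api_two.append(getdatafrom("http://www.apiurltwo.com/api..."))
            time.sleep(timesleep)
        flush_queue.put((data_api_one, data_api_two))

def consumer():
    # Store batches as they arrive; the producer never waits on the DB.
    while True:
        list_one, list_two = flush_queue.get()
        flush(list_one, list_two)
        flush_queue.task_done()

threading.Thread(target=producer, daemon=True).start()
threading.Thread(target=consumer, daemon=True).start()
# In a real script, keep the main thread alive (e.g. join the threads or loop/sleep).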