Cassandra multiprocessing can't pickle _thread.lock objects - python

I tried to use Cassandra and multiprocessing to insert rows (dummy data) concurrently based on the examples in
http://www.datastax.com/dev/blog/datastax-python-driver-multiprocessing-example-for-improved-bulk-data-throughput
This is my code
class QueryManager(object):
concurrency = 100 # chosen to match the default in execute_concurrent_with_args
def __init__(self, session, process_count=None):
self.pool = Pool(processes=process_count, initializer=self._setup, initargs=(session,))
#classmethod
def _setup(cls, session):
cls.session = session
cls.prepared = cls.session.prepare("""
INSERT INTO test_table (key1, key2, key3, key4, key5) VALUES (?, ?, ?, ?, ?)
""")
def close_pool(self):
self.pool.close()
self.pool.join()
def get_results(self, params):
results = self.pool.map(_multiprocess_write, (params[n:n+self.concurrency] for n in range(0, len(params), self.concurrency)))
return list(itertools.chain(*results))
#classmethod
def _results_from_concurrent(cls, params):
return [results[1] for results in execute_concurrent_with_args(cls.session, cls.prepared, params)]
def _multiprocess_write(params):
return QueryManager._results_from_concurrent(params)
if __name__ == '__main__':
processes = 2
# connect cluster
cluster = Cluster(contact_points=['127.0.0.1'], port=9042)
session = cluster.connect()
# database name is a concatenation of client_id and system_id
keyspace_name = 'unit_test_0'
# drop keyspace if it already exists in a cluster
try:
session.execute("DROP KEYSPACE IF EXISTS " + keyspace_name)
except:
pass
create_keyspace_query = "CREATE KEYSPACE " + keyspace_name \
+ " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};"
session.execute(create_keyspace_query)
# use a session's keyspace
session.set_keyspace(keyspace_name)
# drop table if it already exists in the keyspace
try:
session.execute("DROP TABLE IF EXISTS " + "test_table")
except:
pass
# create a table for invoices in the keyspace
create_test_table = "CREATE TABLE test_table("
keys = "key1 text,\n" \
"key2 text,\n" \
"key3 text,\n" \
"key4 text,\n" \
"key5 text,\n"
create_invoice_table_query += keys
create_invoice_table_query += "PRIMARY KEY (key1))"
session.execute(create_test_table)
qm = QueryManager(session, processes)
params = list()
for row in range(100000):
key = 'test' + str(row)
params.append([key, 'test', 'test', 'test', 'test'])
start = time.time()
rows = qm.get_results(params)
delta = time.time() - start
log.info(fm('Cassandra inserts 100k dummy rows for ', delta, ' secs'))
when I executed the code, I got the following error
TypeError: can't pickle _thread.lock objects
which pointed at
self.pool = Pool(processes=process_count, initializer=self._setup, initargs=(session,))

That would suggest you're trying to serialize a lock over IPC boundaries. I think it might be because you're supplying a Session object as an argument to the worker initialization function. Make the init function create a new session in each worker process (see the "Session per Process" section in the blog post you cited).

I know this already has an answer, but I wanted to highlight a couple of changes in the cassandra-driver package that make this code still not work properly with python 3.7 and the 3.18.0 cassandra-driver package.
If you look at the blog post that is linked. The __init__ function doesn't pass in the session, but it passes a cluster object. Even the cluster cannot be sent as an initarg anymore because it contains a lock. You need to create it inside the def _setup(cls): classmethod.
Second, execute_concurrent_with_args returns a ResultSet now and that also cannot be serialized. The older version of the cassandra-driver package just returned a list of objects.
To fix the above code change these 2 sections:
First, the __init__ and _setup methods
def __init__(self, process_count=None):
self.pool = Pool(processes=process_count, initializer=self._setup)
#classmethod
def _setup(cls):
cluster = Cluster()
cls.session = cluster.connect()
cls.prepared = cls.session.prepare("""
INSERT INTO test_table (key1, key2, key3, key4, key5) VALUES (?, ?, ?, ?, ?)
""")
Second, the _results_from_concurrent method
#classmethod
def _results_from_concurrent(cls, params):
return [list(results[1]) for results in execute_concurrent_with_args(cls.session, cls.prepared, params)]
Lastly, if you are interested in a gist for the multiprocess_execute.py in the original DataStax blog post that works with python3 and cassandra-driver 3.18.0, you can find that here: https://gist.github.com/jWolo/6127b2e57c7e24740afd7a4254cc00a3

Related

How to read csv in parallel and write in Cassandra in parallel for achieving high throughput?

I have tried using execute, execute_async and execute_concurrent in Cassandra but for reading 10M rows, I could index them in Cassandra in no less than 55 mins. Note that I have had set the concurrent threads to 1000, tuned the YAML file's concurrent read and write limits as well (to 10000). I have tried replication factor 0, 1, 2 while creating the cluster. None could index the file in less time. So, I decided that instead of reading the csv sequentially, appending it in a list and then writing to Cassandra in batch, concurrent mode or in async mode, how about reading the CSV in parallel?! Hence, I used dask to read the csv file of 10M rows.
import json
import logging
from datetime import datetime
import dask.dataframe as dd
import dask.multiprocessing
import sys
import json
import pandas as pd
from cassandra import ConsistencyLevel, WriteTimeout
from cassandra.cluster import BatchStatement, Cluster
from cassandra.query import SimpleStatement
from cassandra.concurrent import execute_concurrent, execute_concurrent_with_args
class PythonCassandraExample:
def __init__(self, version):
self.cluster = None
self.session = None
self.keyspace = None
self.log = None
self.version = version
def __del__(self):
self.cluster.shutdown()
def createsession(self):
self.cluster = Cluster(['localhost'], connect_timeout=50)
self.session = self.cluster.connect(self.keyspace)
def getsession(self):
return self.session
# How about Adding some log info to see what went wrong
def setlogger(self):
log = logging.getLogger()
log.setLevel('INFO')
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
log.addHandler(handler)
self.log = log
# Create Keyspace based on Given Name
def handle_error(self, exception):
self.log.error("Failed to fetch user info: %s", exception)
def createkeyspace(self, keyspace):
"""
:param keyspace: The Name of Keyspace to be created
:return:
"""
# Before we create new lets check if exiting keyspace; we will drop that and create new
rows = self.session.execute(
"SELECT keyspace_name FROM system_schema.keyspaces")
if keyspace in [row[0] for row in rows]:
self.log.info("dropping existing keyspace...")
self.session.execute("DROP KEYSPACE " + keyspace)
self.log.info("creating keyspace...")
self.session.execute("""
CREATE KEYSPACE %s
WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': '3' }
""" % keyspace)
self.log.info("setting keyspace...")
self.session.set_keyspace(keyspace)
def create_table(self, table_name):
self.table_name = table_name
c_sql = "CREATE TABLE IF NOT EXISTS {} (id varchar, version varchar, row varchar, PRIMARY KEY(id, version));".format(
self.table_name)
print("Query for creating table is: {}".format(c_sql))
self.session.execute(c_sql)
self.log.info("DP Table Created !!!")
self.insert_sql = self.session.prepare(
(
"INSERT INTO {} ({}, {}, {}) VALUES (?,?,?)"
).format(
self.table_name, "id", "version", "row"
)
)
# lets do some batch insert
def insert_data(self, key, version, row):
self.session.execute(
self.insert_sql, [key, version, row]
)
#dask.delayed
def print_a_block(self, d):
d = d.to_dict(orient='records')
for row in d:
key = str(row["0"])
row = json.dumps(row, default=str)
self.insert_data(key, self.version, row)
if __name__ == '__main__':
start_time = datetime.utcnow()
example1 = PythonCassandraExample(version="version_1")
example1.createsession()
example1.setlogger()
example1.createkeyspace('fri_athena_two')
example1.create_table('TenMillion')
example1.log.info("Calling compute!")
df = dd.read_csv("/Users/aviralsrivastava/dev/levelsdb-learning/10gb.csv")
dask.compute(*[example1.print_a_block(d) for d in df.to_delayed()])
print(datetime.utcnow() - start_time)
Even using dask, all efforts were in vain as an hour passed by and yet, the task of writing rows into Cassandra was yet not completed? What else should I do in order to decrease the time taken?

python cassandra get big result of select * in generator (without storage result in ram)

I want to get all data in cassandra table "user"
i have 840000 users and i don't want to get all users in python list.
i want get users in packs of 100 users
in cassandra doc https://datastax.github.io/python-driver/query_paging.html
i see i can use fetch_size, but in my python code i have database object that contains all cql instruction
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement
class Database:
def __init__(self, name, salary):
self.cluster = Cluster(['192.168.1.1', '192.168.1.2'])
self.session = cluster.connect()
def get_users(self):
users_list = []
query = "SELECT * FROM users"
statement = SimpleStatement(query, fetch_size=10)
for user_row in session.execute(statement):
users_list.append(user_row.name)
return users_list
actually get_users return very big list of user name
but i want to transform return get_users to a "generator"
i don't want get all users name in 1 list and 1 call of function get_users, but i want to have lot of call get_users and return list with only 100 users max every call function
for example :
list1 = database.get_users()
list2 = database.get_users()
...
listn = database.get_users()
list1 contains 100 first user in query
list2 contains 100 "second" users in query
listn contains the latest elements in query (<=100)
is this possible ?
thanks for advance for your answer
According to Paging Large Queries:
Whenever there are no more rows in the current page, the next page
will be fetched transparently.
So, if you execute your code like this, you will still the whole result set, but this is paged in a transparent manner.
In order to achieve what you need to use callbacks. You can also find some code sample on the link above.
I added below the full code for reference.
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement
from threading import Event
class PagedResultHandler(object):
def __init__(self, future):
self.error = None
self.finished_event = Event()
self.future = future
self.future.add_callbacks(
callback=self.handle_page,
errback=self.handle_error)
def handle_page(self, rows):
for row in rows:
process_row(row)
if self.future.has_more_pages:
self.future.start_fetching_next_page()
else:
self.finished_event.set()
def handle_error(self, exc):
self.error = exc
self.finished_event.set()
def process_row(user_row):
print user_row.name, user_row.age, user_row.email
cluster = Cluster()
session = cluster.connect()
query = "SELECT * FROM myschema.users"
statement = SimpleStatement(query, fetch_size=5)
future = session.execute_async(statement)
handler = PagedResultHandler(future)
handler.finished_event.wait()
if handler.error:
raise handler.error
cluster.shutdown()
Moving to next page is done in handle_page when start_fetching_next_page is called.
If you replace the if statement with self.finished_event.set() you will see that the iteration stops after the first 5 rows as defined in fetch_size

Why is Twisted's adbapi failing to recover data from within unittests?

Overview
Context
I am writing unit tests for some higher-order logic that depends on writing to an SQLite3 database. For this I am using twisted.trial.unittest and twisted.enterprise.adbapi.ConnectionPool.
Problem statement
I am able to create a persistent sqlite3 database and store data therein. Using sqlitebrowser, I am able to verify that the data has been persisted as expected.
The issue is that calls to t.e.a.ConnectionPool.run* (e.g.: runQuery) return an empty set of results, but only when called from within a TestCase.
Notes and significant details
The problem I am experiencing occurs only within Twisted's trial framework. My first attempt at debugging was to pull the database code out of the unit test and place it into an independent test/debug script. Said script works as expected while the unit test code does not (see examples below).
Case 1: misbehaving unit test
init.sql
This is the script used to initialize the database. There are no (apparent) errors stemming from this file.
CREATE TABLE ajxp_changes ( seq INTEGER PRIMARY KEY AUTOINCREMENT, node_id NUMERIC, type TEXT, source TEXT, target TEXT, deleted_md5 TEXT );
CREATE TABLE ajxp_index ( node_id INTEGER PRIMARY KEY AUTOINCREMENT, node_path TEXT, bytesize NUMERIC, md5 TEXT, mtime NUMERIC, stat_result BLOB);
CREATE TABLE ajxp_last_buffer ( id INTEGER PRIMARY KEY AUTOINCREMENT, type TEXT, location TEXT, source TEXT, target TEXT );
CREATE TABLE ajxp_node_status ("node_id" INTEGER PRIMARY KEY NOT NULL , "status" TEXT NOT NULL DEFAULT 'NEW', "detail" TEXT);
CREATE TABLE events (id INTEGER PRIMARY KEY AUTOINCREMENT, type text, message text, source text, target text, action text, status text, date text);
CREATE TRIGGER LOG_DELETE AFTER DELETE ON ajxp_index BEGIN INSERT INTO ajxp_changes (node_id,source,target,type,deleted_md5) VALUES (old.node_id, old.node_path, "NULL", "delete", old.md5); END;
CREATE TRIGGER LOG_INSERT AFTER INSERT ON ajxp_index BEGIN INSERT INTO ajxp_changes (node_id,source,target,type) VALUES (new.node_id, "NULL", new.node_path, "create"); END;
CREATE TRIGGER LOG_UPDATE_CONTENT AFTER UPDATE ON "ajxp_index" FOR EACH ROW BEGIN INSERT INTO "ajxp_changes" (node_id,source,target,type) VALUES (new.node_id, old.node_path, new.node_path, CASE WHEN old.node_path = new.node_path THEN "content" ELSE "path" END);END;
CREATE TRIGGER STATUS_DELETE AFTER DELETE ON "ajxp_index" BEGIN DELETE FROM ajxp_node_status WHERE node_id=old.node_id; END;
CREATE TRIGGER STATUS_INSERT AFTER INSERT ON "ajxp_index" BEGIN INSERT INTO ajxp_node_status (node_id) VALUES (new.node_id); END;
CREATE INDEX changes_node_id ON ajxp_changes( node_id );
CREATE INDEX changes_type ON ajxp_changes( type );
CREATE INDEX changes_node_source ON ajxp_changes( source );
CREATE INDEX index_node_id ON ajxp_index( node_id );
CREATE INDEX index_node_path ON ajxp_index( node_path );
CREATE INDEX index_bytesize ON ajxp_index( bytesize );
CREATE INDEX index_md5 ON ajxp_index( md5 );
CREATE INDEX node_status_status ON ajxp_node_status( status );
test_sqlite.py
This is the unit test class that fails unexpectedly. TestStateManagement.test_db_clean passes, indicated that the tables were properly created. TestStateManagement.test_inode_create fails, reporitng that zero results were retrieved.
import os.path as osp
from twisted.internet import defer
from twisted.enterprise import adbapi
import sqlengine # see below
class TestStateManagement(TestCase):
def setUp(self):
self.meta = mkdtemp()
self.db = adbapi.ConnectionPool(
"sqlite3", osp.join(self.meta, "db.sqlite"), check_same_thread=False,
)
self.stateman = sqlengine.StateManager(self.db)
with open("init.sql") as f:
script = f.read()
self.d = self.db.runInteraction(lambda c, s: c.executescript(s), script)
def tearDown(self):
self.db.close()
del self.db
del self.stateman
del self.d
rmtree(self.meta)
#defer.inlineCallbacks
def test_db_clean(self):
"""Canary test to ensure that the db is initialized in a blank state"""
yield self.d # wait for db to be initialized
q = "SELECT name FROM sqlite_master WHERE type='table' AND name=?;"
for table in ("ajxp_index", "ajxp_changes"):
res = yield self.db.runQuery(q, (table,))
self.assertTrue(
len(res) == 1,
"table {0} does not exist".format(table)
)
#defer.inlineCallbacks
def test_inode_create_file(self):
yield self.d
path = osp.join(self.ws, "test.txt")
with open(path, "wt") as f:
pass
inode = mk_dummy_inode(path)
yield self.stateman.create(inode, directory=False)
entry = yield self.db.runQuery("SELECT * FROM ajxp_index")
emsg = "got {0} results, expected 1. Are canary tests failing?"
lentry = len(entry)
self.assertTrue(lentry == 1, emsg.format(lentry))
sqlengine.py
These are the artefacts being tested by the above unit tests.
def values_as_tuple(d, *param):
"""Return the values for each key in `param` as a tuple"""
return tuple(map(d.get, param))
class StateManager:
"""Manages the SQLite database's state, ensuring that it reflects the state
of the filesystem.
"""
log = Logger()
def __init__(self, db):
self._db = db
def create(self, inode, directory=False):
params = values_as_tuple(
inode, "node_path", "bytesize", "md5", "mtime", "stat_result"
)
directive = (
"INSERT INTO ajxp_index (node_path,bytesize,md5,mtime,stat_result) "
"VALUES (?,?,?,?,?);"
)
return self._db.runOperation(directive, params)
Case 2: bug disappears outside of twisted.trial
#! /usr/bin/env python
import os.path as osp
from tempfile import mkdtemp
from twisted.enterprise import adbapi
from twisted.internet.task import react
from twisted.internet.defer import inlineCallbacks
INIT_FILE = "example.sql"
def values_as_tuple(d, *param):
"""Return the values for each key in `param` as a tuple"""
return tuple(map(d.get, param))
def create(db, inode):
params = values_as_tuple(
inode, "node_path", "bytesize", "md5", "mtime", "stat_result"
)
directive = (
"INSERT INTO ajxp_index (node_path,bytesize,md5,mtime,stat_result) "
"VALUES (?,?,?,?,?);"
)
return db.runOperation(directive, params)
def init_database(db):
with open(INIT_FILE) as f:
script = f.read()
return db.runInteraction(lambda c, s: c.executescript(s), script)
#react
#inlineCallbacks
def main(reactor):
meta = mkdtemp()
db = adbapi.ConnectionPool(
"sqlite3", osp.join(meta, "db.sqlite"), check_same_thread=False,
)
yield init_database(db)
# Let's make sure the tables were created as expected and that we're
# starting from a blank slate
res = yield db.runQuery("SELECT * FROM ajxp_index LIMIT 1")
assert not res, "database is not empty [ajxp_index]"
res = yield db.runQuery("SELECT * FROM ajxp_changes LIMIT 1")
assert not res, "database is not empty [ajxp_changes]"
# The details of this are not important. Suffice to say they (should)
# conform to the DB schema for ajxp_index.
test_data = {
"node_path": "/this/is/some/arbitrary/path.ext",
"bytesize": 0,
"mtime": 179273.0,
"stat_result": b"this simulates a blob of raw binary data",
"md5": "d41d8cd98f00b204e9800998ecf8427e", # arbitrary
}
# store the test data in the ajxp_index table
yield create(db, test_data)
# test if the entry exists in the db
entry = yield db.runQuery("SELECT * FROM ajxp_index")
assert len(entry) == 1, "got {0} results, expected 1".format(len(entry))
print("OK")
Closing remarks
Again, upon checking with sqlitebrowser, it seems as though the data is being written to db.sqlite, so this looks like a retrieval problem. From here, I'm sort of stumped... any ideas?
EDIT
This code will produce an inode that that can be used for testing.
def mk_dummy_inode(path, isdir=False):
return {
"node_path": path,
"bytesize": osp.getsize(path),
"mtime": osp.getmtime(path),
"stat_result": dumps(stat(path), protocol=4),
"md5": "directory" if isdir else "d41d8cd98f00b204e9800998ecf8427e",
}
Okay, it turns out that this is a bit of a tricky one. Running the tests in isolation (as was posted to this question) makes it such that the bug only rarely occurs. However, when running in the context of an entire test suite, it fails almost 100% of the time.
I added yield task.deferLater(reactor, .00001, lambda: None) after writing to the db and before reading from the db, and this solves the issue.
From there, I suspected this might be a race condition stemming from the connection pool and sqlite's limited concurrency-tolerance. I tried setting the cb_min and cb_max parameters to ConnectionPool to 1, and this also solved the issue.
In short: it seems as though sqlite doesn't play very nicely with multiple connections, and that the appropriate fix is to avoid concurrency to the extent possible.
If you take a look at your setUp function, you're returning self.db.runInteraction(...), which returns a deferred. As you've noted, you assume that it waits for the deferred to finish. However this is not the case and it's a trap that most fall victim to (myself included). I'll be honest with you, for situations like this, especially for unit tests, I just execute the synchronous code outside the TestCase class to initialize the database. For example:
def init_db():
import sqlite3
conn = sqlite3.connect('db.sqlite')
c = conn.cursor()
with open("init.sql") as f:
c.executescript(f.read())
init_db() # call outside test case
class TestStateManagement(TestCase):
"""
My test cases
"""
Alternatively, you could decorate the setup and yield runOperation(...) but something tells me that it wouldn't work... In any case, it's surprising that no errors were raised.
PS
I've been eyeballing this question for a while and it's been in the back of my head for days now. A potential reason for this finally dawned on me at nearly 1am. However, I'm too tired/lazy to actually test this out :D but it's a pretty damn good hunch. I'd like to commend you on your level of detail in this question.

Python Mocking db connection/unknown type in unit test

Newby to python here.
My class uses a database connection to wrap some functions. I have figured out some basic examples successfully. For the more complex library that I am working with, I cannot find close examples of mocking the database connection. In mine, the
class DBSAccess():
def __init__(self, db_con):
self.db_con = db_con
def get_db_perm(self, target_user):
## this is where I start having trouble
with self.db_con.cursor() as cursor:
cursor.execute("SELECT CAST(sum(maxperm) AS bigint) \
FROM dbc.diskspace \
WHERE databasename = '%s' \
GROUP BY databasename" % (target_user))
res = cursor.fetchone()
if res is not None:
return res[0]
else:
msg = target_user + " does not exist"
return msg
where db_con is a teradata.UdaExec returns a connection
udaExec = teradata.UdaExec (appName="whatever", version="1.0", logConsole=True)
db_con = udaExec.connect(method="odbc", system='my_sys', username='my_name', password='my_pswd')
dbc_instance = tdtestpy.DBSaccess (db_con)
So for my test to not use any real connection, I have to mock some things out. I tried this combination:
class DBAccessTest(unittest.TestCase):
def test_get_db_free_perm_expects_500(self):
uda_exec = mock.Mock(spec=teradata.UdaExec)
db_con = MagicMock(return_value=None)
db_con.cursor.fetchone.return_value = [500]
uda_exec.connect.return_value = db_con
self.dbc_instance = DBSAccess(db_con)
self.assertEqual(self.dbc_instance.get_db_free_perm("dbc"), 500)
but my result is messed up because fetchone is returning a mock, not the [500] one item list I was expecting:
AssertionError: <MagicMock name='mock.connect().cursor().[54 chars]312'> != 500
I've found some examples where there is a 'with block' for testing an OS operation, but nothing with database. Plus, I don't know what data type the db_con.cursor is so I can't spec that precisely - I think the cursor is found in UdaExecConnection.cursor() found at Teradata/PyTd.
I need to know how to mock the response that will allow me to test the logic within my method.
The source of your problem is in the following line:
with self.db_con.cursor() as cursor:
with lines calls __enter__ method, which generate in your case a new mock.
The solution is to mock __enter__ method:
db_con.cursor.return_value.__enter__.return_value = cursor
Your tests:
class DBAccessTest(unittest.TestCase):
def test_get_db_free_perm_expects_500(self):
db_con = MagicMock(UdaExecConnection)
cursor = MagicMock(UdaExecCursor)
cursor.fetchone.return_value = [500]
db_con.cursor.return_value.__enter__.return_value = cursor
self.dbc_instance = DBSAccess(db_con)
self.assertEqual(self.dbc_instance.get_db_perm("dbc"), 500)
def test_get_db_free_perm_expects_None(self):
db_con = MagicMock(UdaExecConnection)
cursor = MagicMock(UdaExecCursor)
cursor.fetchone.return_value = None
db_con.cursor.return_value.__enter__.return_value = cursor
self.dbc_instance = DBSAccess(db_con)
self.assertEqual(self.dbc_instance.get_db_perm("dbc"), "dbc does not exist")

cx_Oracle: How can I receive each row as a dictionary?

By default, cx_Oracle returns each row as a tuple.
>>> import cx_Oracle
>>> conn=cx_Oracle.connect('scott/tiger')
>>> curs=conn.cursor()
>>> curs.execute("select * from foo");
>>> curs.fetchone()
(33, 'blue')
How can I return each row as a dictionary?
You can override the cursor's rowfactory method. You will need to do this each time you perform the query.
Here's the results of the standard query, a tuple.
curs.execute('select * from foo')
curs.fetchone()
(33, 'blue')
Returning a named tuple:
def makeNamedTupleFactory(cursor):
columnNames = [d[0].lower() for d in cursor.description]
import collections
Row = collections.namedtuple('Row', columnNames)
return Row
curs.rowfactory = makeNamedTupleFactory(curs)
curs.fetchone()
Row(x=33, y='blue')
Returning a dictionary:
def makeDictFactory(cursor):
columnNames = [d[0] for d in cursor.description]
def createRow(*args):
return dict(zip(columnNames, args))
return createRow
curs.rowfactory = makeDictFactory(curs)
curs.fetchone()
{'Y': 'brown', 'X': 1}
Credit to Amaury Forgeot d'Arc:
http://sourceforge.net/p/cx-oracle/mailman/message/27145597
A very short version:
curs.rowfactory = lambda *args: dict(zip([d[0] for d in curs.description], args))
Tested on Python 3.7.0 & cx_Oracle 7.1.2
Old question but adding some helpful links with a Python recipe
According to cx_Oracle documentation:
Cursor.rowfactory
This read-write attribute specifies a method to call for each row that
is retrieved from the database. Ordinarily a tuple is returned for
each row but if this attribute is set, the method is called with the
tuple that would normally be returned, and the result of the method is
returned instead.
The cx_Oracle - Python Interface for Oracle Database Also points to GitHub repository for lots of helpful sample examples. Please check GenericRowFactory.py.
Googled: This PPT can be further helpful: [PDF]CON6543 Python and Oracle Database - RainFocus
Recipe
Django database backend for Oracle under the hood uses cx_Oracle. In earlier versions ( Django 1.11- ) they have written _rowfactory(cursor, row) That also cast cx_Oracle's numeric data types into relevant Python data and strings into unicode.
If you have installed Django Please check base.py as follows:
$ DJANGO_DIR="$(python -c 'import django, os; print(os.path.dirname(django.__file__))')"
$ vim $DJANGO_DIR/db/backends/oracle/base.py
One can borrow _rowfactory() from $DJANGO_DIR/db/backends/oracle/base.py and can apply below decorator naming to make it return namedtuple instead of simple tuple.
mybase.py
import functools
from itertools import izip, imap
from operator import itemgetter
from collections import namedtuple
import cx_Oracle as Database
import decimal
def naming(rename=False, case=None):
def decorator(rowfactory):
#functools.wraps(rowfactory)
def decorated_rowfactory(cursor, row, typename="GenericRow"):
field_names = imap(case, imap(itemgetter(0), cursor.description))
return namedtuple(typename, field_names)._make(rowfactory(cursor, row))
return decorated_rowfactory
return decorator
use it as:
#naming(rename=False, case=str.lower)
def rowfactory(cursor, row):
casted = []
....
....
return tuple(casted)
oracle.py
import cx_Oracle as Database
from cx_Oracle import *
import mybase
class Cursor(Database.Cursor):
def execute(self, statement, args=None):
prepareNested = (statement is not None and self.statement != statement)
result = super(self.__class__, self).execute(statement, args or [])
if prepareNested:
if self.description:
self.rowfactory = lambda *row: mybase.rowfactory(self, row)
return result
def close(self):
try:
super(self.__class__, self).close()
except Database.InterfaceError:
"already closed"
class Connection(Database.Connection):
def cursor(self):
Cursor(self)
connect = Connection
Now, instead of import cx_oracle import oracle in user script as:
user.py
import oracle
dsn = oracle.makedsn('HOSTNAME', 1521, service_name='dev_server')
db = connect('username', 'password', dsn)
cursor = db.cursor()
cursor.execute("""
SELECT 'Grijesh' as FirstName,
'Chauhan' as LastName,
CAST('10560.254' AS NUMBER(10, 2)) as Salary
FROM DUAL
""")
row = cursor.fetchone()
print ("First Name is %s" % row.firstname) # => Grijesh
print ("Last Name is %s" % row.lastname) # => Chauhan
print ("Salary is %r" % row.salary) # => Decimal('10560.25')
Give it a Try!!
Building up on answer by #maelcum73 :
curs.rowfactory = lambda *args: dict(zip([d[0] for d in curs.description], args))
The issue with this solution is that you need to re-set this after every execution.
Going one step further, you can create a shell around the cursor object like so:
class dictcur(object):
# need to monkeypatch the built-in execute function to always return a dict
def __init__(self, cursor):
self._original_cursor = cursor
def execute(self, *args, **kwargs):
# rowfactory needs to be set AFTER EACH execution!
self._original_cursor.execute(*args, **kwargs)
self._original_cursor.rowfactory = lambda *a: dict(
zip([d[0] for d in self._original_cursor.description], a)
)
# cx_Oracle's cursor's execute method returns a cursor object
# -> return the correct cursor in the monkeypatched version as well!
return self._original_cursor
def __getattr__(self, attr):
# anything other than the execute method: just go straight to the cursor
return getattr(self._original_cursor, attr)
dict_cursor = dictcur(cursor=conn.cursor())
Using this dict_cursor, every subsequent dict_cursor.execute() call will return a dictionary. Note: I tried monkeypatching the execute method directly, however that was not possible because it is a built-in method.

Categories

Resources