Not able handle "on_conflict_do_nothing" in Sqlalchemy in Mysql [duplicate] - python

Is there an elegant way to do an INSERT ... ON DUPLICATE KEY UPDATE in SQLAlchemy? I mean something with a syntax similar to inserter.insert().execute(list_of_dictionaries) ?

ON DUPLICATE KEY UPDATE post version-1.2 for MySQL
This functionality is now built into SQLAlchemy for MySQL only. somada141's answer below has the best solution:
https://stackoverflow.com/a/48373874/319066
ON DUPLICATE KEY UPDATE in the SQL statement
If you want the generated SQL to actually include ON DUPLICATE KEY UPDATE, the simplest way involves using a #compiles decorator.
The code (linked from a good thread on the subject on reddit) for an example can be found on github:
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.expression import Insert
#compiles(Insert)
def append_string(insert, compiler, **kw):
s = compiler.visit_insert(insert, **kw)
if 'append_string' in insert.kwargs:
return s + " " + insert.kwargs['append_string']
return s
my_connection.execute(my_table.insert(append_string = 'ON DUPLICATE KEY UPDATE foo=foo'), my_values)
But note that in this approach, you have to manually create the append_string. You could probably change the append_string function so that it automatically changes the insert string into an insert with 'ON DUPLICATE KEY UPDATE' string, but I'm not going to do that here due to laziness.
ON DUPLICATE KEY UPDATE functionality within the ORM
SQLAlchemy does not provide an interface to ON DUPLICATE KEY UPDATE or MERGE or any other similar functionality in its ORM layer. Nevertheless, it has the session.merge() function that can replicate the functionality only if the key in question is a primary key.
session.merge(ModelObject) first checks if a row with the same primary key value exists by sending a SELECT query (or by looking it up locally). If it does, it sets a flag somewhere indicating that ModelObject is in the database already, and that SQLAlchemy should use an UPDATE query. Note that merge is quite a bit more complicated than this, but it replicates the functionality well with primary keys.
But what if you want ON DUPLICATE KEY UPDATE functionality with a non-primary key (for example, another unique key)? Unfortunately, SQLAlchemy doesn't have any such function. Instead, you have to create something that resembles Django's get_or_create(). Another StackOverflow answer covers it, and I'll just paste a modified, working version of it here for convenience.
def get_or_create(session, model, defaults=None, **kwargs):
instance = session.query(model).filter_by(**kwargs).first()
if instance:
return instance
else:
params = dict((k, v) for k, v in kwargs.iteritems() if not isinstance(v, ClauseElement))
if defaults:
params.update(defaults)
instance = model(**params)
return instance

I should mention that ever since the v1.2 release, the SQLAlchemy 'core' has a solution to the above with that's built in and can be seen under here (copied snippet below):
from sqlalchemy.dialects.mysql import insert
insert_stmt = insert(my_table).values(
id='some_existing_id',
data='inserted value')
on_duplicate_key_stmt = insert_stmt.on_duplicate_key_update(
data=insert_stmt.inserted.data,
status='U'
)
conn.execute(on_duplicate_key_stmt)

Based on phsource's answer, and for the specific use-case of using MySQL and completely overriding the data for the same key without performing a DELETE statement, one can use the following #compiles decorated insert expression:
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.expression import Insert
#compiles(Insert)
def append_string(insert, compiler, **kw):
s = compiler.visit_insert(insert, **kw)
if insert.kwargs.get('on_duplicate_key_update'):
fields = s[s.find("(") + 1:s.find(")")].replace(" ", "").split(",")
generated_directive = ["{0}=VALUES({0})".format(field) for field in fields]
return s + " ON DUPLICATE KEY UPDATE " + ",".join(generated_directive)
return s

It's depends upon you. If you want to replace then pass OR REPLACE in prefixes
def bulk_insert(self,objects,table):
#table: Your table class and objects are list of dictionary [{col1:val1, col2:vale}]
for counter,row in enumerate(objects):
inserter = table.__table__.insert(prefixes=['OR IGNORE'], values=row)
try:
self.db.execute(inserter)
except Exception as E:
print E
if counter % 100 == 0:
self.db.commit()
self.db.commit()
Here commit interval can be changed to speed up or speed down

My way
import typing
from datetime import datetime
from sqlalchemy.dialects import mysql
class MyRepository:
def model(self):
return MySqlAlchemyModel
def upsert(self, data: typing.List[typing.Dict]):
if not data:
return
model = self.model()
if hasattr(model, 'created_at'):
for item in data:
item['created_at'] = datetime.now()
stmt = mysql.insert(getattr(model, '__table__')).values(data)
for_update = []
for k, v in data[0].items():
for_update.append(k)
dup = {k: getattr(stmt.inserted, k) for k in for_update}
stmt = stmt.on_duplicate_key_update(**dup)
self.db.session.execute(stmt)
self.db.session.commit()
Usage:
myrepo.upsert([
{
"field11": "value11",
"field21": "value21",
"field31": "value31",
},
{
"field12": "value12",
"field22": "value22",
"field32": "value32",
},
])

The other answers have this covered but figured I'd reference another good example for mysql I found in this gist. This also includes the use of LAST_INSERT_ID, which may be useful depending on your innodb auto increment settings and whether your table has a unique key. Lifting the code here for easy reference but please give the author a star if you find it useful.
from app import db
from sqlalchemy import func
from sqlalchemy.dialects.mysql import insert
def upsert(model, insert_dict):
"""model can be a db.Model or a table(), insert_dict should contain a primary or unique key."""
inserted = insert(model).values(**insert_dict)
upserted = inserted.on_duplicate_key_update(
id=func.LAST_INSERT_ID(model.id), **{k: inserted.inserted[k]
for k, v in insert_dict.items()})
res = db.engine.execute(upserted)
return res.lastrowid

ORM
use upset func based on on_duplicate_key_update
class Model():
__input_data__ = dict()
def __init__(self, **kwargs) -> None:
self.__input_data__ = kwargs
self.session = Session(engine)
def save(self):
self.session.add(self)
self.session.commit()
def upsert(self, *, ingore_keys = []):
column_keys = self.__table__.columns.keys()
udpate_data = dict()
for key in self.__input_data__.keys():
if key not in column_keys:
continue
else:
udpate_data[key] = self.__input_data__[key]
insert_stmt = insert(self.__table__).values(**udpate_data)
all_ignore_keys = ['id']
if isinstance(ingore_keys, list):
all_ignore_keys =[*all_ignore_keys, *ingore_keys]
else:
all_ignore_keys.append(ingore_keys)
udpate_columns = dict()
for key in self.__input_data__.keys():
if key not in column_keys or key in all_ignore_keys:
continue
else:
udpate_columns[key] = insert_stmt.inserted[key]
on_duplicate_key_stmt = insert_stmt.on_duplicate_key_update(
**udpate_columns
)
# self.session.add(self)
self.session.execute(on_duplicate_key_stmt)
self.session.commit()
class ManagerAssoc(ORM_Base, Model):
def __init__(self, **kwargs):
self.id = idWorker.get_id()
column_keys = self.__table__.columns.keys()
udpate_data = dict()
for key in kwargs.keys():
if key not in column_keys:
continue
else:
udpate_data[key] = kwargs[key]
ORM_Base.__init__(self, **udpate_data)
Model.__init__(self, **kwargs, id = self.id)
....
# you can call it as following:
manager_assoc.upsert()
manager.upsert(ingore_keys = ['manager_id'])

Got a simpler solution:
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.expression import Insert
#compiles(Insert)
def replace_string(insert, compiler, **kw):
s = compiler.visit_insert(insert, **kw)
s = s.replace("INSERT INTO", "REPLACE INTO")
return s
my_connection.execute(my_table.insert(replace_string=""), my_values)

I just used plain sql as:
insert_stmt = "REPLACE INTO tablename (column1, column2) VALUES (:column_1_bind, :columnn_2_bind) "
session.execute(insert_stmt, data)

As none of these solutions seem all the elegant. A brute force way is to query to see if the row exists. If it does delete the row and then insert otherwise just insert. Obviously some overhead involved but it does not rely on modifying the raw sql and it works on non orm stuff.

Related

peewee + MySQL, How to create a custom field type that wraps SQL-built ins?

I'd like to create a custom UUID field in peewee (over MySQL).
In python, I'm using the UUID as a hexified string, e.g.:
uuid = '110e8400-e29b-11d4-a716-446655440000'
But I want to store it in the database to a column of type BINARY(16) to save space.
MySQL has built-in HEX() and UNHEX()methods to convert back and forth between a string and binary.
So my question is how do I tell peewee to generate SQL that uses a built-in function? Here's an idea for the code I want to work:
class UUIDField(Field):
db_field='binary(16)'
def db_value(self, value):
if value is not None:
uuid = value.translate(None, '-') # remove dashes
# HERE: How do I let peewee know I want to generate
# a SQL string of the form "UNHEX(uuid)"?
def python_value(self, value):
if value is not None:
# HERE: How do I let peewee know I want to generate
# a SQL string of the form "HEX(value)"?
Note that I'm specifically asking how to get peewee to wrap or unwrap a value in custom SQL. I realize I could probably do the value conversion entirely in python, but I'm looking for the more general-purpose SQL-based solution.
EDIT: For future reference, here is how I made it work doing the conversions in python. It doesn't answer the question though, so any ideas are appreciated!
import binascii
from peewee import *
db = MySQLDatabase(
'database',
fields={'binary(16)': 'BINARY(16)'} # map the field type
)
# this does the uuid conversion in python
class UUIDField(Field):
db_field='binary(16)'
def db_value(self, value):
if value is None: return None
value = value.translate(None, '-')
value = binascii.unhexlify(value)
return value
def python_value(self, value):
if value is None: return None
value = '{}-{}-{}-{}-{}'.format(
binascii.hexlify(value[0:4]),
binascii.hexlify(value[4:6]),
binascii.hexlify(value[6:8]),
binascii.hexlify(value[8:10]),
binascii.hexlify(value[10:16])
)
return value
Using a SelectQuery you can invoke internal SQL functions like so:
from peewee import SelectQuery
# this does the uuid conversion in python
class UUIDField(Field):
db_field = 'binary(16)'
def db_value(self, value):
if value is None: return None
value = value.translate(None, '-')
query = SelectQuery(self.model_class, fn.UNHEX(value).alias('unhex'))
result = query.first()
value = result.unhex
return value
def python_value(self, value):
if value is None: return None
query = SelectQuery(self.model_class, fn.HEX(value).alias('hex'))
result = query.first()
value = '{}-{}-{}-{}-{}'.format(
result.hex[0:8],
result.hex[8:12],
result.hex[12:16],
result.hex[16:20],
result.hex[20:32]
)
return value

More general way of generating PyODBC queries as a dict?

Here are my averagely general class methods for creating a dictionary from the result of database queries:
def make_schema_dict(self):
schema = [i[2] for i in self.cursor.tables()
if i[2].startswith('tbl_') or i[2].startswith('vw_')]
self.schema = {table: {'scheme': [row.column_name for row
in self.cursor.columns(table)]}
for table in schema}
def last_table_query_as_dict(self, table):
return {'data': [{col: row.__getattribute__(col) for col in self.schema[table]['scheme']
if col != 'RowNum'} for row in self.cursor.fetchall()]}
Unfortunately as you can see, there are many complications.
For example, when multiple tables are queried; some hackish lambdas are required to generate the resulting dictionary.
Can you think of some more general methods?
You should be able to use row.cursor_description to make this a lot simpler. This should get you a list of dictionaries for the results:
[{c[0]: v for (c, v) in zip(row.cursor_description, row)} for row in self.cursor.fetchall()]
A neat solution can be found in this thread: https://groups.google.com/forum/?fromgroups#!topic/pyodbc/BVIZBYGXNsk
The root of the idea being, subclass Connection to use a custom Cursor class, have the Cursor class automatically construct dicts for you. I'd call this a fancy pythonic solution. You could also just have an additional function fetchonedict() and extend the Cursor class rather than override so you could retain default behavior.
class ConnectionWrapper(object):
def __init__(self, cnxn):
self.cnxn = cnxn
def __getattr__(self, attr):
return getattr(self.cnxn, attr)
def cursor(self):
return CursorWrapper(self.cnxn.cursor())
class CursorWrapper(object):
def __init__(self, cursor):
self.cursor = cursor
def __getattr__(self, attr):
return getattr(self.cursor, attr)
def fetchone(self):
row = self.cursor.fetchone()
if not row:
return None
return dict((t[0], value) for t, value in zip (self.cursor.description, row))
Additionally, while not for PyODBC, check out this stackoverflow answer for links to DictCursor classes for MySQL and OurSQL if you need some inspiration for design.

SQLAlchemy versioning cares about class import order

I was following the guide here:
http://www.sqlalchemy.org/docs/orm/examples.html?highlight=versioning#versioned-objects
and have come across an issue. I have defined my relationships like:
generic_ticker = relation('MyClass', backref=backref("stuffs"))
with strings so it doesn't care about the import order of my model modules. This all works fine normally, but when I use the versioning meta I get the following error:
sqlalchemy.exc.InvalidRequestError: When initializing mapper Mapper|MyClass|stuffs, expression 'Trader' failed to locate a name ("name 'MyClass' is not defined"). If this is a class name, consider adding this relationship() to the class after both dependent classes have been defined.
I tracked down the error to:
File "/home/nick/workspace/gm3/gm3/lib/history_meta.py", line 90, in __init__
mapper = class_mapper(cls)
File "/home/nick/venv/tg2env/lib/python2.6/site-packages/sqlalchemy/orm/util.py", line 622, in class_mapper
mapper = mapper.compile()
class VersionedMeta(DeclarativeMeta):
def __init__(cls, classname, bases, dict_):
DeclarativeMeta.__init__(cls, classname, bases, dict_)
try:
mapper = class_mapper(cls)
_history_mapper(mapper)
except UnmappedClassError:
pass
I fixed the problem by putting the try: except stuff in a lambda and running them all after all the imports have happened. This works but seems a bit rubbish, any ideas of how to fix this is a better way?
Thanks!
Update
The problem is not actually about import order. The versioning example is designed such that mapper requires compilation in costructor of each versioned class. And compilation fails when related classes are not yet defined. In case of circular relations there is no way to make it working by changing definition order of mapped classes.
Update 2
As the above update states (I didn't know you could edit other people's posts on here :)) this is likely due to circular references. In which case may be someone will find my hack useful (I'm using it with turbogears) (Replace VersionedMeta and add in create_mappers global in history_meta)
create_mappers = []
class VersionedMeta(DeclarativeMeta):
def __init__(cls, classname, bases, dict_):
DeclarativeMeta.__init__(cls, classname, bases, dict_)
#I added this code in as it was crashing otherwise
def make_mapper():
try:
mapper = class_mapper(cls)
_history_mapper(mapper)
except UnmappedClassError:
pass
create_mappers.append(lambda: make_mapper())
Then you can do something like the following in your models __init__.py
# Import your model modules here.
from myproj.lib.history_meta import create_mappers
from myproj.model.misc import *
from myproj.model.actor import *
from myproj.model.stuff1 import *
from myproj.model.instrument import *
from myproj.model.stuff import *
#setup the history
[func() for func in create_mappers]
That way it create the mappers only after all the classes have been defined.
Update 3
Slightly unrelated but I came across a duplicate primary key error in some circumstances (committing 2 changes to the same object in one go). My workaround has been to add a new primary auto-incrementing key. Of course you can't have more than 1 with mysql so I had to de-primary key the existing stuff used to create the history table. Check out my overall code (including a hist_id and getting rid of the foreign key constraint):
"""Stolen from the offical sqlalchemy recpies
"""
from sqlalchemy.ext.declarative import DeclarativeMeta
from sqlalchemy.orm import mapper, class_mapper, attributes, object_mapper
from sqlalchemy.orm.exc import UnmappedClassError, UnmappedColumnError
from sqlalchemy import Table, Column, ForeignKeyConstraint, Integer
from sqlalchemy.orm.interfaces import SessionExtension
from sqlalchemy.orm.properties import RelationshipProperty
from sqlalchemy.types import DateTime
import datetime
from sqlalchemy.orm.session import Session
def col_references_table(col, table):
for fk in col.foreign_keys:
if fk.references(table):
return True
return False
def _history_mapper(local_mapper):
cls = local_mapper.class_
# set the "active_history" flag
# on on column-mapped attributes so that the old version
# of the info is always loaded (currently sets it on all attributes)
for prop in local_mapper.iterate_properties:
getattr(local_mapper.class_, prop.key).impl.active_history = True
super_mapper = local_mapper.inherits
super_history_mapper = getattr(cls, '__history_mapper__', None)
polymorphic_on = None
super_fks = []
if not super_mapper or local_mapper.local_table is not super_mapper.local_table:
cols = []
for column in local_mapper.local_table.c:
if column.name == 'version':
continue
col = column.copy()
col.unique = False
#don't auto increment stuff from the normal db
if col.autoincrement:
col.autoincrement = False
#sqllite falls over with auto incrementing keys if we have a composite key
if col.primary_key:
col.primary_key = False
if super_mapper and col_references_table(column, super_mapper.local_table):
super_fks.append((col.key, list(super_history_mapper.base_mapper.local_table.primary_key)[0]))
cols.append(col)
if column is local_mapper.polymorphic_on:
polymorphic_on = col
#if super_mapper:
# super_fks.append(('version', super_history_mapper.base_mapper.local_table.c.version))
cols.append(Column('hist_id', Integer, primary_key=True, autoincrement=True))
cols.append(Column('version', Integer))
cols.append(Column('changed', DateTime, default=datetime.datetime.now))
if super_fks:
cols.append(ForeignKeyConstraint(*zip(*super_fks)))
table = Table(local_mapper.local_table.name + '_history', local_mapper.local_table.metadata,
*cols, mysql_engine='InnoDB')
else:
# single table inheritance. take any additional columns that may have
# been added and add them to the history table.
for column in local_mapper.local_table.c:
if column.key not in super_history_mapper.local_table.c:
col = column.copy()
super_history_mapper.local_table.append_column(col)
table = None
if super_history_mapper:
bases = (super_history_mapper.class_,)
else:
bases = local_mapper.base_mapper.class_.__bases__
versioned_cls = type.__new__(type, "%sHistory" % cls.__name__, bases, {})
m = mapper(
versioned_cls,
table,
inherits=super_history_mapper,
polymorphic_on=polymorphic_on,
polymorphic_identity=local_mapper.polymorphic_identity
)
cls.__history_mapper__ = m
if not super_history_mapper:
cls.version = Column('version', Integer, default=1, nullable=False)
create_mappers = []
class VersionedMeta(DeclarativeMeta):
def __init__(cls, classname, bases, dict_):
DeclarativeMeta.__init__(cls, classname, bases, dict_)
#I added this code in as it was crashing otherwise
def make_mapper():
try:
mapper = class_mapper(cls)
_history_mapper(mapper)
except UnmappedClassError:
pass
create_mappers.append(lambda: make_mapper())
def versioned_objects(iter):
for obj in iter:
if hasattr(obj, '__history_mapper__'):
yield obj
def create_version(obj, session, deleted = False):
obj_mapper = object_mapper(obj)
history_mapper = obj.__history_mapper__
history_cls = history_mapper.class_
obj_state = attributes.instance_state(obj)
attr = {}
obj_changed = False
for om, hm in zip(obj_mapper.iterate_to_root(), history_mapper.iterate_to_root()):
if hm.single:
continue
for hist_col in hm.local_table.c:
if hist_col.key == 'version' or hist_col.key == 'changed' or hist_col.key == 'hist_id':
continue
obj_col = om.local_table.c[hist_col.key]
# get the value of the
# attribute based on the MapperProperty related to the
# mapped column. this will allow usage of MapperProperties
# that have a different keyname than that of the mapped column.
try:
prop = obj_mapper.get_property_by_column(obj_col)
except UnmappedColumnError:
# in the case of single table inheritance, there may be
# columns on the mapped table intended for the subclass only.
# the "unmapped" status of the subclass column on the
# base class is a feature of the declarative module as of sqla 0.5.2.
continue
# expired object attributes and also deferred cols might not be in the
# dict. force it to load no matter what by using getattr().
if prop.key not in obj_state.dict:
getattr(obj, prop.key)
a, u, d = attributes.get_history(obj, prop.key)
if d:
attr[hist_col.key] = d[0]
obj_changed = True
elif u:
attr[hist_col.key] = u[0]
else:
# if the attribute had no value.
attr[hist_col.key] = a[0]
obj_changed = True
if not obj_changed:
# not changed, but we have relationships. OK
# check those too
for prop in obj_mapper.iterate_properties:
if isinstance(prop, RelationshipProperty) and \
attributes.get_history(obj, prop.key).has_changes():
obj_changed = True
break
if not obj_changed and not deleted:
return
attr['version'] = obj.version
hist = history_cls()
for key, value in attr.iteritems():
setattr(hist, key, value)
obj.version += 1
session.add(hist)
class VersionedListener(SessionExtension):
def before_flush(self, session, flush_context, instances):
for obj in versioned_objects(session.dirty):
create_version(obj, session)
for obj in versioned_objects(session.deleted):
create_version(obj, session, deleted = True)
I fixed the problem by putting the try: except stuff in a lambda and
running them all after all the imports have happened.
Great!

Implementing sub-table (view into a table): designing class relationship

I'm using Python 3, but the question isn't really tied to the specific language.
I have class Table that implements a table with a primary key. An instance of that class contains the actual data (which is very large).
I want to allow users to create a sub-table by providing a filter for the rows of the Table. I don't want to copy the table, so I was planning to keep in the sub-table just the subset of the primary keys from the parent table.
Obviously, the sub-table is just a view into the parent table; it will change if the parent table changes, will become invalid if the parent table is destroyed, and will lose some of its rows if they are deleted from the parent table. [EDIT: to clarify, if parent table is changed, I don't care what happens to the sub-table; any behavior is fine.]
How should I connect the two classes? I was thinking of:
class Subtable(Table):
def __init__(self, table, filter_function):
# ...
My assumption was that Subtable keeps the interface of Table, except slightly overrides the inherited methods just to check if the row is in. Is this a good implementation?
The problem is, I'm not sure how to initialize the Subtable instance given that I don't want to copy the table object passed to it. Is it even possible?
Also I was thinking to give class Table an instance method that returns Subtable instance; but that creates a dependency of Table on Subtable, and I guess it's better to avoid?
I'm going to use the following (I omitted many methods such as sort, which work quite well in this arrangement; also omitted error handling):
class Table:
def __init__(self, *columns, pkey = None):
self.pkey = pkey
self.__columns = columns
self.__data = {}
def __contains__(self, key):
return key in self.__data
def __iter__(self):
for key in self.__order:
yield key
def __len__(self):
return len(self.__data)
def items(self):
for key in self.__order:
yield key, self.__data[key]
def insert(self, *unnamed, **named):
if len(unnamed) > 0:
row_dict = {}
for column_id, column in enumerate(self.__columns):
row_dict[column] = unnamed[column_id]
else:
row_dict = named
key = row_dict[self.pkey]
self.__data[key] = row_dict
class Subtable(Table):
def __init__(self, table, row_filter):
self.__order = []
self.__data = {}
for key, row in table.items():
if row_filter(row):
self.__data[key] = row
Essentially, I'm copying the primary keys only, and create references to the data tied to them. If a row in the parent table is destroyed, it will still exist in the sub-table. If a row is modified in the parent table, it is also modified in the sub-table. This is fine, since my requirements was "anything goes when parent table is modified".
If you see any issues with this design, let me know please.

Expressing multiple columns in berkeley db in python?

Say I have a simple table that contains username, firstname, lastname.
How do I express this in berkeley Db?
I'm currently using bsddb as the interface.
Cheers.
You have to pick one "column" as the key (must be unique; I imagine that would be "username" in your case) -- the only way searches will ever possibly happen. The other columns can be made to be the single string value of that key by any way you like, from pickling to simple joining with a character that's guaranteed to never occur in any of the columns, such as `\0' for many kind of "readable text strings".
If you need to be able to search by different keys you'll need other, supplementary and separate bsddb databases set up as "indices" into your main table -- it's lots of work, and there's lots of literature on the subject. (Alternatively, you move to a higher-abstraction technology, such as sqlite, which handles the indexing neatly on your behalf;-).
tl,dr: To express multiple columns in an ordered key value store like berkley db you need to learn about key composition. Look up my other answers about bsddb to learn more.
There is several ways to do that using ordered key/value store.
The simplest solution is to store documents as json values with a correct key.
Now you probably want to build index over those columns to retrieve documents without having to iterate over all the hashmap to find the correct object. For that you can use a secondaryDB that will build automatically the index for you. Or you can build the index yourself.
If you don't want to deal with key packing (and it's a good idea for starting up), you can take advantage of DB.set_bt_compare which will allow you to use cpickle, json or msgpack for both keys and values while still having an order that makes sens to create indices and doing queries. This is slower method but introduce the pattern of key composition.
To fully take advantage what ordered key is, you can make use of Cursor.set_range(key) to set the position of the db at the beginning of a query.
Another pattern, is called the EAV pattern stores tuples that follow the scheme (entity, attribute, value) and then you build various index by using permutation of that tuple. I learned this pattern studing datomic.
For less ressource hungry database, you will go the "static typed" way and store as much as possible of common information in the "metadata" table and split documents (which are really RDBMS tables) into their own hashmap.
To get you started here is an example database using bsddb (but you could build it using another ordered key/value store like wiredtiger or leveldb) that implements the EAV pattern. In this implementation I swap EAV for IKV which translates to Unique identifier, Key, Value. The overal result is that you have a fully indexed schema less document database. I think it's a good compromise between efficiency and ease-of-use.
import struct
from json import dumps
from json import loads
from bsddb3.db import DB
from bsddb3.db import DBEnv
from bsddb3.db import DB_BTREE
from bsddb3.db import DB_CREATE
from bsddb3.db import DB_INIT_MPOOL
from bsddb3.db import DB_LOG_AUTO_REMOVE
def pack(*values):
def __pack(value):
if type(value) is int:
return '1' + struct.pack('>q', value)
elif type(value) is str:
return '2' + struct.pack('>q', len(value)) + value
else:
data = dumps(value, encoding='utf-8')
return '3' + struct.pack('>q', len(data)) + data
return ''.join(map(__pack, values))
def unpack(packed):
kind = packed[0]
if kind == '1':
value = struct.unpack('>q', packed[1:9])[0]
packed = packed[9:]
elif kind == '2':
size = struct.unpack('>q', packed[1:9])[0]
value = packed[9:9+size]
packed = packed[size+9:]
else:
size = struct.unpack('>q', packed[1:9])[0]
value = loads(packed[9:9+size])
packed = packed[size+9:]
if packed:
values = unpack(packed)
values.insert(0, value)
else:
values = [value]
return values
class TupleSpace(object):
"""Generic database"""
def __init__(self, path):
self.env = DBEnv()
self.env.set_cache_max(10, 0)
self.env.set_cachesize(5, 0)
flags = (
DB_CREATE |
DB_INIT_MPOOL
)
self.env.log_set_config(DB_LOG_AUTO_REMOVE, True)
self.env.set_lg_max(1024 ** 3)
self.env.open(
path,
flags,
0
)
# create vertices and edges k/v stores
def new_store(name):
flags = DB_CREATE
elements = DB(self.env)
elements.open(
name,
None,
DB_BTREE,
flags,
0,
)
return elements
self.tuples = new_store('tuples')
self.index = new_store('index')
self.txn = None
def get(self, uid):
cursor = self.tuples.cursor()
def __get():
record = cursor.set_range(pack(uid, ''))
if not record:
return
key, value = record
while True:
other, key = unpack(key)
if other == uid:
value = unpack(value)[0]
yield key, value
record = cursor.next()
if record:
key, value = record
continue
else:
break
else:
break
tuples = dict(__get())
cursor.close()
return tuples
def add(self, uid, **properties):
for key, value in properties.items():
self.tuples.put(pack(uid, key), pack(value))
self.index.put(pack(key, value, uid), '')
def delete(self, uid):
# delete item from main table and index
cursor = self.tuples.cursor()
index = self.index.cursor()
record = cursor.set_range(pack(uid, ''))
if record:
key, value = record
else:
cursor.close()
raise Exception('not found')
while True:
other, key = unpack(key)
if other == uid:
# remove tuple from main index
cursor.delete()
# remove it from index
value = unpack(value)[0]
index.set(pack(key, value, uid))
index.delete()
# continue
record = cursor.next()
if record:
key, value = record
continue
else:
break
else:
break
cursor.close()
def update(self, uid, **properties):
self.delete(uid)
self.add(uid, **properties)
def close(self):
self.index.close()
self.tuples.close()
self.env.close()
def debug(self):
for key, value in self.tuples.items():
uid, key = unpack(key)
value = unpack(value)[0]
print(uid, key, value)
def query(self, key, value=''):
"""return `(key, value, uid)` tuples that where
`key` and `value` are expressed in the arguments"""
cursor = self.index.cursor()
match = (key, value) if value else (key,)
record = cursor.set_range(pack(key, value))
if not record:
cursor.close()
return
while True:
key, _ = record
other = unpack(key)
ok = reduce(
lambda previous, x: (cmp(*x) == 0) and previous,
zip(match, other),
True
)
if ok:
yield other
record = cursor.next()
if not record:
break
else:
break
cursor.close()
db = TupleSpace('tmp')
# you can use a tuple to store a counter
db.add(0, counter=0)
# And then have a procedure doing the required work
# to alaways have a fresh uid
def make_uid():
counter = db.get(0)
counter['counter'] += 1
return counter['counter']
amirouche = make_uid()
db.add(amirouche, username="amirouche", age=30)
print(db.get(amirouche))

Categories

Resources