Is there an elegant way to do an INSERT ... ON DUPLICATE KEY UPDATE in SQLAlchemy? I mean something with a syntax similar to inserter.insert().execute(list_of_dictionaries) ?
ON DUPLICATE KEY UPDATE post version-1.2 for MySQL
This functionality is now built into SQLAlchemy for MySQL only. somada141's answer below has the best solution:
https://stackoverflow.com/a/48373874/319066
ON DUPLICATE KEY UPDATE in the SQL statement
If you want the generated SQL to actually include ON DUPLICATE KEY UPDATE, the simplest way involves using a @compiles decorator.
An example of the code (linked from a good thread on the subject on reddit) can be found on GitHub:
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.expression import Insert

@compiles(Insert)
def append_string(insert, compiler, **kw):
    s = compiler.visit_insert(insert, **kw)
    if 'append_string' in insert.kwargs:
        return s + " " + insert.kwargs['append_string']
    return s

my_connection.execute(my_table.insert(append_string='ON DUPLICATE KEY UPDATE foo=foo'), my_values)
But note that with this approach, you have to build the append_string manually. You could probably change the append_string function so that it automatically turns the insert statement into one with an ON DUPLICATE KEY UPDATE clause, but I'm not going to do that here due to laziness.
ON DUPLICATE KEY UPDATE functionality within the ORM
SQLAlchemy does not provide an interface to ON DUPLICATE KEY UPDATE or MERGE or any other similar functionality in its ORM layer. Nevertheless, it has the session.merge() function that can replicate the functionality only if the key in question is a primary key.
session.merge(ModelObject) first checks if a row with the same primary key value exists by sending a SELECT query (or by looking it up locally). If it does, it sets a flag somewhere indicating that ModelObject is in the database already, and that SQLAlchemy should use an UPDATE query. Note that merge is quite a bit more complicated than this, but it replicates the functionality well with primary keys.
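For example, here is a minimal sketch of merge() acting as a primary-key upsert. This assumes SQLAlchemy 1.4+ and a hypothetical User model; it is not taken from the original answer.

# A minimal sketch of merge() as a primary-key upsert; SQLAlchemy 1.4+ and the
# User model below are assumptions, not part of the original answer.
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class User(Base):
    __tablename__ = "users"
    id = Column(Integer, primary_key=True)
    name = Column(String(50))

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.merge(User(id=1, name="alice"))   # no row with id=1 yet: INSERT
    session.merge(User(id=1, name="alicia"))  # row found via SELECT: UPDATE
    session.commit()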
But what if you want ON DUPLICATE KEY UPDATE functionality with a non-primary key (for example, another unique key)? Unfortunately, SQLAlchemy doesn't have any such function. Instead, you have to create something that resembles Django's get_or_create(). Another StackOverflow answer covers it, and I'll just paste a modified, working version of it here for convenience.
from sqlalchemy.sql.expression import ClauseElement

def get_or_create(session, model, defaults=None, **kwargs):
    instance = session.query(model).filter_by(**kwargs).first()
    if instance:
        return instance
    else:
        # skip any SQL expressions passed as filters; they are not constructor arguments
        params = dict((k, v) for k, v in kwargs.items() if not isinstance(v, ClauseElement))
        if defaults:
            params.update(defaults)
        instance = model(**params)
        return instance
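Hypothetical usage, assuming a User model with a unique email column (the caller is still responsible for adding and committing new instances):

# Hypothetical usage of get_or_create: User and session are assumed to exist.
user = get_or_create(session, User, defaults={"name": "Alice"}, email="alice@example.com")
if user not in session:  # a freshly constructed instance is not yet persistent
    session.add(user)
session.commit()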
I should mention that since the v1.2 release, SQLAlchemy 'core' has a built-in solution to the above, which can be seen here (snippet copied below):
from sqlalchemy.dialects.mysql import insert

insert_stmt = insert(my_table).values(
    id='some_existing_id',
    data='inserted value')

on_duplicate_key_stmt = insert_stmt.on_duplicate_key_update(
    data=insert_stmt.inserted.data,
    status='U'
)

conn.execute(on_duplicate_key_stmt)
Based on phsource's answer, and for the specific use case of using MySQL and completely overriding the data for the same key without performing a DELETE statement, one can use the following @compiles-decorated insert expression:
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.expression import Insert

@compiles(Insert)
def append_string(insert, compiler, **kw):
    s = compiler.visit_insert(insert, **kw)
    if insert.kwargs.get('on_duplicate_key_update'):
        fields = s[s.find("(") + 1:s.find(")")].replace(" ", "").split(",")
        generated_directive = ["{0}=VALUES({0})".format(field) for field in fields]
        return s + " ON DUPLICATE KEY UPDATE " + ",".join(generated_directive)
    return s
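Usage would then mirror the earlier append_string example. A hedged sketch follows; depending on your SQLAlchemy version, passing an arbitrary keyword argument to insert() may require extra setup.

# Hypothetical usage of the @compiles hook above; my_table, my_connection and
# my_values are placeholders for your own table, connection and row dictionaries.
my_connection.execute(
    my_table.insert(on_duplicate_key_update=True),
    my_values,
)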
It depends on what you want. If you want to replace, pass OR REPLACE in prefixes:
def bulk_insert(self, objects, table):
    # table: your table class; objects is a list of dictionaries [{col1: val1, col2: val2}]
    for counter, row in enumerate(objects):
        inserter = table.__table__.insert(prefixes=['OR IGNORE'], values=row)
        try:
            self.db.execute(inserter)
        except Exception as E:
            print(E)
        if counter % 100 == 0:
            self.db.commit()
    self.db.commit()
The commit interval here can be adjusted to tune the speed.
My way
import typing
from datetime import datetime

from sqlalchemy.dialects import mysql


class MyRepository:
    def model(self):
        return MySqlAlchemyModel

    def upsert(self, data: typing.List[typing.Dict]):
        if not data:
            return

        model = self.model()
        if hasattr(model, 'created_at'):
            for item in data:
                item['created_at'] = datetime.now()

        stmt = mysql.insert(getattr(model, '__table__')).values(data)

        for_update = []
        for k, v in data[0].items():
            for_update.append(k)

        dup = {k: getattr(stmt.inserted, k) for k in for_update}
        stmt = stmt.on_duplicate_key_update(**dup)
        self.db.session.execute(stmt)
        self.db.session.commit()
Usage:
myrepo.upsert([
    {
        "field11": "value11",
        "field21": "value21",
        "field31": "value31",
    },
    {
        "field12": "value12",
        "field22": "value22",
        "field32": "value32",
    },
])
The other answers have this covered, but I figured I'd reference another good example for MySQL that I found in this gist. It also includes the use of LAST_INSERT_ID, which may be useful depending on your InnoDB auto-increment settings and whether your table has a unique key. I'm lifting the code here for easy reference, but please give the author a star if you find it useful.
from app import db
from sqlalchemy import func
from sqlalchemy.dialects.mysql import insert


def upsert(model, insert_dict):
    """model can be a db.Model or a table(); insert_dict should contain a primary or unique key."""
    inserted = insert(model).values(**insert_dict)
    upserted = inserted.on_duplicate_key_update(
        id=func.LAST_INSERT_ID(model.id),
        **{k: inserted.inserted[k] for k, v in insert_dict.items()})
    res = db.engine.execute(upserted)
    return res.lastrowid
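Usage might then look roughly like this, assuming a hypothetical User model with a unique email column:

# Hypothetical call: inserts the row, or updates it if the unique key already
# exists; LAST_INSERT_ID makes lastrowid point at the affected row either way.
row_id = upsert(User, {"email": "alice@example.com", "name": "Alice"})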
ORM
Use an upsert function based on on_duplicate_key_update:
from sqlalchemy.dialects.mysql import insert
from sqlalchemy.orm import Session

# imports assumed by this snippet; `engine`, `ORM_Base` and `idWorker` are defined elsewhere


class Model():
    __input_data__ = dict()

    def __init__(self, **kwargs) -> None:
        self.__input_data__ = kwargs
        self.session = Session(engine)

    def save(self):
        self.session.add(self)
        self.session.commit()

    def upsert(self, *, ignore_keys=[]):
        column_keys = self.__table__.columns.keys()
        update_data = dict()
        for key in self.__input_data__.keys():
            if key not in column_keys:
                continue
            else:
                update_data[key] = self.__input_data__[key]

        insert_stmt = insert(self.__table__).values(**update_data)

        all_ignore_keys = ['id']
        if isinstance(ignore_keys, list):
            all_ignore_keys = [*all_ignore_keys, *ignore_keys]
        else:
            all_ignore_keys.append(ignore_keys)

        update_columns = dict()
        for key in self.__input_data__.keys():
            if key not in column_keys or key in all_ignore_keys:
                continue
            else:
                update_columns[key] = insert_stmt.inserted[key]

        on_duplicate_key_stmt = insert_stmt.on_duplicate_key_update(
            **update_columns
        )
        # self.session.add(self)
        self.session.execute(on_duplicate_key_stmt)
        self.session.commit()
class ManagerAssoc(ORM_Base, Model):
    def __init__(self, **kwargs):
        self.id = idWorker.get_id()
        column_keys = self.__table__.columns.keys()
        update_data = dict()
        for key in kwargs.keys():
            if key not in column_keys:
                continue
            else:
                update_data[key] = kwargs[key]
        ORM_Base.__init__(self, **update_data)
        Model.__init__(self, **kwargs, id=self.id)
....
# you can call it as follows:
manager_assoc.upsert()
manager.upsert(ignore_keys=['manager_id'])
Got a simpler solution:
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.expression import Insert

@compiles(Insert)
def replace_string(insert, compiler, **kw):
    s = compiler.visit_insert(insert, **kw)
    s = s.replace("INSERT INTO", "REPLACE INTO")
    return s

my_connection.execute(my_table.insert(replace_string=""), my_values)
I just used plain SQL:
insert_stmt = "REPLACE INTO tablename (column1, column2) VALUES (:column_1_bind, :columnn_2_bind) "
session.execute(insert_stmt, data)
As none of these solutions seem all that elegant, a brute-force way is to query whether the row exists; if it does, delete the row and then insert, otherwise just insert. There is obviously some overhead involved, but it does not rely on modifying the raw SQL, and it works on non-ORM stuff as well.
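A rough sketch of that idea with the ORM, assuming a hypothetical User model with a unique email column (User, session and new_user are placeholders, not anything from the answer above):

# Brute-force upsert sketch: delete any existing row with the same unique key,
# then insert the new one.
existing = session.query(User).filter_by(email=new_user.email).first()
if existing:
    session.delete(existing)
    session.flush()
session.add(new_user)
session.commit()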
I have the following code to insert documents into MongoDB. The problem is that it's quite slow, since I'm unable to parallelize it with multiprocessing, and considering I have to check whether each inserted document already exists, I believe it's impossible to use bulk inserts. I'm wondering if there is a faster approach to this problem. After profiling the code below I found that check_record() and update_upstream() are the two most time-consuming functions, so optimising them would increase the overall speed. Any input on how to optimise the code below would be highly appreciated. Thank you!
import multiprocessing
import os

import pymongo
from directory import Directory
from pymongo import ASCENDING
from pymongo import DESCENDING
from pymongo import MongoClient
from storage_config import StorageConfig
from tqdm import tqdm

dir = Directory()


def DB_collections(collection_type):
    types = {'p': 'player_stats',
             't': 'team_standings',
             'f': 'fixture_stats',
             'l': 'league_standings',
             'pf': 'fixture_players_stats'}
    return types.get(collection_type)


class DB():
    def __init__(self, league, season, func=None):
        self.db_user = os.environ.get('DB_user')
        self.db_pass = os.environ.get('DB_pass')
        self.MONGODB_URL = f'mongodb+srv://{self.db_user}:{self.db_pass}@cluster0-mbqxj.mongodb.net/<dbname>?retryWrites=true&w=majority'
        self.league = league
        self.season = str(season)
        self.client = MongoClient(self.MONGODB_URL)
        self.DATABASE = self.client[self.league + self.season]
        self.pool = multiprocessing.cpu_count()
        self.playerfile = f'{self.league}_{self.season}_playerstats.json'
        self.teamfile = f'{self.league}_{self.season}_team_standings.json'
        self.fixturefile = f'{self.league}_{self.season}_fixturestats.json'
        self.leaguefile = f'{self.league}_{self.season}_league_standings.json'
        self.player_fixture = f'{self.league}_{self.season}_player_fixture.json'
        self.func = func

    def execute(self):
        if self.func is not None:
            return self.func(self)


def import_json(file):
    """Imports a json file in read mode

    Args:
        file(str): Name of file
    """
    return dir.load_json(file, StorageConfig.DB_DIR)


def load_file(file):
    try:
        loaded_file = import_json(file)
        return loaded_file
    except FileNotFoundError:
        print("Please check that", file, "exists")


def check_record(collection, index_dict):
    """Check if record exists in collection

    Args:
        index_dict (dict): key, value
    """
    return collection.find_one(index_dict)


def collection_index(collection, index, *args):
    """Checks if index exists for collection,
    and return a new index if not

    Args:
        collection (str): Name of collection in database
        index (str): Dict key to be used as an index
        args (str): Additional dict keys to create compound indexes
    """
    compound_index = tuple((arg, ASCENDING) for arg in args)
    if index not in collection.index_information():
        return collection.create_index([(index, DESCENDING), *compound_index], unique=True)


def push_upstream(collection, record):
    """Insert record in collection

    Args:
        collection (str): Name of collection in database
        record (dict): Data to be pushed in collection
    """
    return collection.insert_one(record)


def update_upstream(collection, index_dict, record):
    """Update record in collection

    Args:
        collection (str): Name of collection in database
        index_dict (dict): key, value
        record (dict): Data to be updated in collection
    """
    return collection.update_one(index_dict, {"$set": record}, upsert=True)


def executePushPlayer(db):
    playerstats = load_file(db.playerfile)
    collection_name = DB_collections('p')
    collection = db.DATABASE[collection_name]
    collection_index(collection, 'p_id')

    for player in tqdm(playerstats):
        existingPost = check_record(collection, {'p_id': player['p_id']})
        if existingPost:
            update_upstream(collection, {'p_id': player['p_id']}, player)
        else:
            push_upstream(collection, player)


if __name__ == '__main__':
    db = DB('EN_PR', '2019')
    executePushPlayer(db)
You can combine the check/insert/update logic into a single update_one() command using upsert=True, and then use the bulk operators with something like:

from pymongo import UpdateOne

updates = []
for player in tqdm(playerstats):
    updates.append(UpdateOne({'p_id': player['p_id']}, {'$set': player}, upsert=True))

collection.bulk_write(updates)
Finally, check that your index is being used with the following command in the MongoDB shell:
db.mycollection.aggregate([{ $indexStats: {} }])
And review the accesses.ops metric.
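If you would rather check from PyMongo, the same aggregation should look roughly like this:

# Roughly equivalent check from PyMongo: print how often each index is used.
for stats in collection.aggregate([{"$indexStats": {}}]):
    print(stats["name"], stats["accesses"]["ops"])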
I'd like to create a custom UUID field in peewee (over MySQL).
In python, I'm using the UUID as a hexified string, e.g.:
uuid = '110e8400-e29b-11d4-a716-446655440000'
But I want to store it in the database to a column of type BINARY(16) to save space.
MySQL has built-in HEX() and UNHEX() functions to convert back and forth between a string and binary.
So my question is how do I tell peewee to generate SQL that uses a built-in function? Here's an idea for the code I want to work:
class UUIDField(Field):
    db_field = 'binary(16)'

    def db_value(self, value):
        if value is not None:
            uuid = value.translate(None, '-')  # remove dashes
            # HERE: How do I let peewee know I want to generate
            # a SQL string of the form "UNHEX(uuid)"?

    def python_value(self, value):
        if value is not None:
            # HERE: How do I let peewee know I want to generate
            # a SQL string of the form "HEX(value)"?
            pass
Note that I'm specifically asking how to get peewee to wrap or unwrap a value in custom SQL. I realize I could probably do the value conversion entirely in python, but I'm looking for the more general-purpose SQL-based solution.
EDIT: For future reference, here is how I made it work doing the conversions in python. It doesn't answer the question though, so any ideas are appreciated!
import binascii
from peewee import *

db = MySQLDatabase(
    'database',
    fields={'binary(16)': 'BINARY(16)'}  # map the field type
)

# this does the uuid conversion in python
class UUIDField(Field):
    db_field = 'binary(16)'

    def db_value(self, value):
        if value is None: return None
        value = value.translate(None, '-')
        value = binascii.unhexlify(value)
        return value

    def python_value(self, value):
        if value is None: return None
        value = '{}-{}-{}-{}-{}'.format(
            binascii.hexlify(value[0:4]),
            binascii.hexlify(value[4:6]),
            binascii.hexlify(value[6:8]),
            binascii.hexlify(value[8:10]),
            binascii.hexlify(value[10:16])
        )
        return value
Using a SelectQuery you can invoke internal SQL functions like so:
from peewee import Field, SelectQuery, fn

# this does the uuid conversion in SQL via MySQL's UNHEX()/HEX()
class UUIDField(Field):
    db_field = 'binary(16)'

    def db_value(self, value):
        if value is None: return None
        value = value.translate(None, '-')
        query = SelectQuery(self.model_class, fn.UNHEX(value).alias('unhex'))
        result = query.first()
        value = result.unhex
        return value

    def python_value(self, value):
        if value is None: return None
        query = SelectQuery(self.model_class, fn.HEX(value).alias('hex'))
        result = query.first()
        value = '{}-{}-{}-{}-{}'.format(
            result.hex[0:8],
            result.hex[8:12],
            result.hex[12:16],
            result.hex[16:20],
            result.hex[20:32]
        )
        return value
I am trying to use a JsonProperty with the Google App Engine datastore, using this code (the indentation is right on my computer):
class JsonProperty(SerialProperty):
    """Stores a dictionary automatically encoding to JSON on set and decoding
    on get.
    """
    data_type = dict

    def __init__(self, *args, **kwds):
        self._require_parameter(kwds, 'indexed', False)
        kwds['indexed'] = True
        super(JsonProperty, self).__init__(*args, **kwds)

    def get_value_for_datastore(self, model_instance):
        """Encodes the value to JSON."""
        value = super(JsonProperty, self).get_value_for_datastore(
            model_instance)
        if value is not None:
            return db.Text(json.dumps(value))
        else:
            return db.Text("{}")

    def make_value_from_datastore(self, value):
        """Decodes the value from JSON."""
        value = super(JsonProperty, self).make_value_from_datastore(value)
        logging.error(value)
        if value is not None:
            return json.loads(value)

    def validate(self, value):
        if value is not None and not isinstance(value, (dict, list, tuple)):
            raise db.BadValueError('Property %s must be a dict, list or '
                                   'tuple.' % self.name)
But when I get the "value" in make_value_from_datastore, instead of being type db.Text, it is of type JsonProperty.
I can see the string in the datastore, and it is saving correctly.
When it is loaded, it chokes on json.loads(value), because json wants a "string or buffer"
I don't understand, and can't find any difference between my code and several examples on the internet, all of which indicate that the value of "value" should be the raw type that was saved to the datastore.
I'm guessing the value you get back is a db.Text value, rather than a string. Does this work?
def make_value_from_datastore(self, value):
    """Decodes the value from JSON."""
    value = super(JsonProperty, self).make_value_from_datastore(value)
    logging.error(value)
    if value is not None:
        return json.loads(str(value))
Say I have a simple table that contains username, firstname, lastname.
How do I express this in Berkeley DB?
I'm currently using bsddb as the interface.
Cheers.
You have to pick one "column" as the key (it must be unique; I imagine that would be "username" in your case) -- the only way searches will ever possibly happen. The other columns can be combined into the single string value for that key in any way you like, from pickling to simply joining them with a character that's guaranteed to never occur in any of the columns, such as '\0' for many kinds of "readable text strings".
If you need to be able to search by different keys you'll need other, supplementary and separate bsddb databases set up as "indices" into your main table -- it's lots of work, and there's lots of literature on the subject. (Alternatively, you move to a higher-abstraction technology, such as sqlite, which handles the indexing neatly on your behalf;-).
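As a minimal sketch of that approach (a '\0'-joined value plus a hand-maintained lastname index in a second database; the bsddb3 package and the local .db file names are assumptions here, not part of the answer above):

# Minimal sketch: username is the key; the other columns are joined with b"\0".
# A second database acts as a hand-maintained index from lastname to username.
from bsddb3 import db

users = db.DB()
users.open("users.db", None, db.DB_HASH, db.DB_CREATE)

by_lastname = db.DB()
by_lastname.set_flags(db.DB_DUP)  # allow several usernames per lastname
by_lastname.open("users_by_lastname.db", None, db.DB_BTREE, db.DB_CREATE)

def put_user(username, firstname, lastname):
    users.put(username.encode(), b"\0".join([firstname.encode(), lastname.encode()]))
    by_lastname.put(lastname.encode(), username.encode())

def get_user(username):
    firstname, lastname = users.get(username.encode()).split(b"\0")
    return username, firstname.decode(), lastname.decode()

put_user("jdoe", "John", "Doe")
print(get_user("jdoe"))         # ('jdoe', 'John', 'Doe')
print(by_lastname.get(b"Doe"))  # b'jdoe'

users.close()
by_lastname.close()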
tl;dr: To express multiple columns in an ordered key-value store like Berkeley DB you need to learn about key composition. Look up my other answers about bsddb to learn more.
There are several ways to do that using an ordered key/value store.
The simplest solution is to store documents as JSON values under a suitable key.
Now you probably want to build indices over those columns to retrieve documents without having to iterate over the whole hashmap to find the correct object. For that you can use a secondaryDB that will automatically build the index for you, or you can build the index yourself.
If you don't want to deal with key packing (and that's a good idea when starting out), you can take advantage of DB.set_bt_compare, which will allow you to use cpickle, json or msgpack for both keys and values while still having an order that makes sense for creating indices and doing queries. This is a slower method but it introduces the pattern of key composition.
To take full advantage of ordered keys, you can use Cursor.set_range(key) to position the cursor at the beginning of a query.
Another pattern, called the EAV pattern, stores tuples that follow the scheme (entity, attribute, value), and you then build various indices by using permutations of that tuple. I learned this pattern studying Datomic.
For a less resource-hungry database, you would go the "statically typed" way and store as much common information as possible in a "metadata" table, splitting documents (which are really RDBMS tables) into their own hashmaps.
To get you started, here is an example database using bsddb (but you could build it using another ordered key/value store like wiredtiger or leveldb) that implements the EAV pattern. In this implementation I swap EAV for IKV, which translates to unique Identifier, Key, Value. The overall result is that you have a fully indexed, schemaless document database. I think it's a good compromise between efficiency and ease of use.
import struct

from json import dumps
from json import loads

from bsddb3.db import DB
from bsddb3.db import DBEnv
from bsddb3.db import DB_BTREE
from bsddb3.db import DB_CREATE
from bsddb3.db import DB_INIT_MPOOL
from bsddb3.db import DB_LOG_AUTO_REMOVE


def pack(*values):
    def __pack(value):
        if type(value) is int:
            return '1' + struct.pack('>q', value)
        elif type(value) is str:
            return '2' + struct.pack('>q', len(value)) + value
        else:
            data = dumps(value, encoding='utf-8')
            return '3' + struct.pack('>q', len(data)) + data
    return ''.join(map(__pack, values))


def unpack(packed):
    kind = packed[0]
    if kind == '1':
        value = struct.unpack('>q', packed[1:9])[0]
        packed = packed[9:]
    elif kind == '2':
        size = struct.unpack('>q', packed[1:9])[0]
        value = packed[9:9+size]
        packed = packed[size+9:]
    else:
        size = struct.unpack('>q', packed[1:9])[0]
        value = loads(packed[9:9+size])
        packed = packed[size+9:]
    if packed:
        values = unpack(packed)
        values.insert(0, value)
    else:
        values = [value]
    return values


class TupleSpace(object):
    """Generic database"""

    def __init__(self, path):
        self.env = DBEnv()
        self.env.set_cache_max(10, 0)
        self.env.set_cachesize(5, 0)
        flags = (
            DB_CREATE |
            DB_INIT_MPOOL
        )
        self.env.log_set_config(DB_LOG_AUTO_REMOVE, True)
        self.env.set_lg_max(1024 ** 3)
        self.env.open(
            path,
            flags,
            0
        )

        # create vertices and edges k/v stores
        def new_store(name):
            flags = DB_CREATE
            elements = DB(self.env)
            elements.open(
                name,
                None,
                DB_BTREE,
                flags,
                0,
            )
            return elements

        self.tuples = new_store('tuples')
        self.index = new_store('index')
        self.txn = None

    def get(self, uid):
        cursor = self.tuples.cursor()

        def __get():
            record = cursor.set_range(pack(uid, ''))
            if not record:
                return

            key, value = record
            while True:
                other, key = unpack(key)
                if other == uid:
                    value = unpack(value)[0]
                    yield key, value
                    record = cursor.next()
                    if record:
                        key, value = record
                        continue
                    else:
                        break
                else:
                    break

        tuples = dict(__get())
        cursor.close()
        return tuples

    def add(self, uid, **properties):
        for key, value in properties.items():
            self.tuples.put(pack(uid, key), pack(value))
            self.index.put(pack(key, value, uid), '')

    def delete(self, uid):
        # delete item from main table and index
        cursor = self.tuples.cursor()
        index = self.index.cursor()
        record = cursor.set_range(pack(uid, ''))
        if record:
            key, value = record
        else:
            cursor.close()
            raise Exception('not found')
        while True:
            other, key = unpack(key)
            if other == uid:
                # remove tuple from main index
                cursor.delete()

                # remove it from index
                value = unpack(value)[0]
                index.set(pack(key, value, uid))
                index.delete()

                # continue
                record = cursor.next()
                if record:
                    key, value = record
                    continue
                else:
                    break
            else:
                break
        cursor.close()

    def update(self, uid, **properties):
        self.delete(uid)
        self.add(uid, **properties)

    def close(self):
        self.index.close()
        self.tuples.close()
        self.env.close()

    def debug(self):
        for key, value in self.tuples.items():
            uid, key = unpack(key)
            value = unpack(value)[0]
            print(uid, key, value)

    def query(self, key, value=''):
        """return `(key, value, uid)` tuples where
        `key` and `value` are expressed in the arguments"""
        cursor = self.index.cursor()
        match = (key, value) if value else (key,)

        record = cursor.set_range(pack(key, value))
        if not record:
            cursor.close()
            return

        while True:
            key, _ = record
            other = unpack(key)
            ok = reduce(
                lambda previous, x: (cmp(*x) == 0) and previous,
                zip(match, other),
                True
            )
            if ok:
                yield other
                record = cursor.next()
                if not record:
                    break
            else:
                break
        cursor.close()


db = TupleSpace('tmp')

# you can use a tuple to store a counter
db.add(0, counter=0)


# And then have a procedure doing the required work
# to always have a fresh uid
def make_uid():
    counter = db.get(0)
    counter['counter'] += 1
    return counter['counter']


amirouche = make_uid()
db.add(amirouche, username="amirouche", age=30)

print(db.get(amirouche))