I have a query in SQLAlchemy that is structurally the same as the MWE below, which I am using to query an SQLite DB.
#!/usr/bin/env python3
from timeit import timeit
import os.path
from tqdm import tqdm
from pprint import pprint
from datetime import date, timedelta
from random import choice, randint
from sqlalchemy import (
    Column, Integer, Date, String, ForeignKey, sql, orm, create_engine)
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.declarative import declarative_base

SAVE_INTERVAL = 365
START_DATE = date(2015, 1, 1)
DATE_RANGE = SAVE_INTERVAL * 5
NUM_AS = 10000
NAME_RANGE = NUM_AS // 4
NUM_BS = NUM_AS // 4

Base = declarative_base()


class A(Base):
    __tablename__ = 'a'

    id = Column(Integer, primary_key=True)
    name = Column(String)
    save_date = Column(Date)

    bs = relationship('B', back_populates='a')

    def __init__(self, name, save_date):
        self.name = name
        self.save_date = save_date

    def __repr__(self):
        return 'A({}, {})'.format(self.name,
                                  self.save_date.strftime('%Y/%m/%d'))


class B(Base):
    __tablename__ = 'b'

    id = Column(Integer, primary_key=True)
    a_id = Column(Integer, ForeignKey('a.id'))

    # Relationships
    a = relationship('A', back_populates='bs')

    def __init__(self, a):
        self.a = a


db_path = os.path.expanduser('~/test-query{}.db'.format(NUM_AS))
engine = create_engine('sqlite:///' + db_path)
Session = sessionmaker(bind=engine)
session = Session()

if not os.path.exists(db_path):
    Base.metadata.create_all(engine)
    print("Generating test data")
    # Create dummy As and Bs data
    a_list = []
    for _ in tqdm(range(NUM_AS), "Generating As"):
        a = A('name{:03}'.format(randint(1, NAME_RANGE)),
              START_DATE + timedelta(days=randint(0, DATE_RANGE)))
        a_list.append(a)
        session.add(a)
    session.commit()
    for _ in tqdm(range(NUM_BS), "Generating Bs"):
        session.add(B(choice(a_list)))
    session.commit()

# Create query for sessions that still need to be reported
A_alias = orm.aliased(A)
query = (
    session.query(A)
    .filter(
        # Filter out rows of the same name that have newer versions
        ~(session.query(A_alias.id)
          .filter(
              A_alias.name == A.name,
              A_alias.save_date > A.save_date).exists()),
        # Filter out rows whose other versions of the same name are
        # referenced in table B and were saved less than SAVE_INTERVAL
        # days apart
        ~(session.query(A_alias.id)
          .join(B)  # Only select sessions with a report
          .filter(
              A_alias.name == A.name,
              (sql.func.abs(
                  sql.func.julianday(A.save_date) -
                  sql.func.julianday(A_alias.save_date)) <=
               SAVE_INTERVAL)).exists())))
print(query)


def count():
    return query.count()  # return the value so it can be printed below


print("Calculating exec time...")
print('{} s'.format(timeit(count, number=1)))
pprint("Number of returned records {}".format(count()))
It works fine, but it is a bit slow on the production database (around 8,000 rows), making the web page sluggish.
So I am wondering whether there is a way to speed it up, either by altering the SQL or the way the ORM objects are generated?
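One thing worth trying before restructuring the query (a sketch, not benchmarked against the schema above; the index name is my own): both correlated EXISTS subqueries filter on name and compare save_date, so a composite index on those two columns lets SQLite satisfy them without repeatedly scanning table a.

from sqlalchemy import Index

# Hypothetical composite index on the columns the correlated
# subqueries filter and compare on.
idx = Index('ix_a_name_save_date',
            A.__table__.c.name, A.__table__.c.save_date)
idx.create(engine)  # emits CREATE INDEX against the SQLite file

Running ANALYZE afterwards can also help SQLite's planner choose the index.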
How can I query a JSON key containing a list? Here's my code / what I've tried:
import sqlalchemy
from sqlalchemy import Column, Integer, JSON
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Session = sessionmaker()
Base = declarative_base()


class Track(Base):  # noqa: WPS230
    __tablename__ = "track"

    id = Column(Integer, primary_key=True)
    fields = Column(JSON(none_as_null=True), default="{}")

    def __init__(self, id):
        self.id = id
        self.fields = {}


engine = sqlalchemy.create_engine("sqlite:///:memory:")
Session.configure(bind=engine)
Base.metadata.create_all(engine)  # creates tables
session = Session()

track1 = Track(id=1)
track2 = Track(id=2)
track1.fields["list"] = ["wow"]
track2.fields["list"] = ["wow", "more", "items"]
session.add(track1)
session.add(track2)
session.commit()

# None of these match the way I expect:
session.query(Track).filter(Track.fields["list"].as_string() == "wow").one()
session.query(Track).filter(Track.fields["list"].as_string() == "[wow]").one()
session.query(Track).filter(
    Track.fields["list"].as_json() == ["wow", "more", "items"]
).one()
I've also tried contains() instead of ==, but that seems to match substrings of elements as well, which I don't want.
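(An aside on why the string comparisons above misbehave: the JSON column stores serialized text, so the extracted "list" element compares as its JSON text. Matching the exact serialization can work, but it is brittle; this sketch assumes SQLite's json_extract output, which has no spaces.)

session.query(Track).filter(
    Track.fields["list"].as_string() == '["wow"]'
).one()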
I managed to get the behavior I was after by utilizing json_each. To filter against the entire list, I just need to create a new json_each function for each element I want to test against.
#!/usr/bin/env python3
import sqlalchemy
from sqlalchemy import func
from sqlalchemy import Column, Integer, JSON
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Session = sessionmaker()
Base = declarative_base()


class Track(Base):
    __tablename__ = "track"

    id = Column(Integer, primary_key=True)
    fields = Column(JSON, default="{}")

    def __init__(self, id):
        self.id = id
        self.fields = {}


engine = sqlalchemy.create_engine("sqlite:///:memory:")
Session.configure(bind=engine)
Base.metadata.create_all(engine)
session = Session()

track = Track(id=1)
track.fields["list"] = ["a", "list"]
session.add(track)
session.commit()

fields_func1 = func.json_each(Track.fields, "$.list").table_valued(
    "value", joins_implicitly=True
)
fields_func2 = func.json_each(Track.fields, "$.list").table_valued(
    "value", joins_implicitly=True
)

session.query(Track).filter(fields_func1.c.value == "list").one()
session.query(Track).filter(fields_func1.c.value == "a").one()
session.query(Track).filter(
    fields_func1.c.value == "a", fields_func2.c.value == "list"
).one()
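As a quick sanity check (hypothetical value, using the setup above), a substring of an element should not match, raising NoResultFound:

from sqlalchemy.orm.exc import NoResultFound

try:
    session.query(Track).filter(fields_func1.c.value == "li").one()
except NoResultFound:
    print("substring 'li' does not match a whole element")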
How can I parse a big file (50-100 GB) into my database using SQLAlchemy? Let's say I have two tables.
import collections
import re

import Bio.SeqIO
import sqlalchemy
from sqlalchemy import ForeignKey, UniqueConstraint
from sqlalchemy import Column, Float, Integer, String, Text, DateTime
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.sql.expression import ClauseElement

Base = declarative_base()


class Protein_sequence(Base):
    __tablename__ = 'protein_sequence'

    prot_seq_id = Column(Integer, primary_key=True)
    prot_seq = Column(Text, unique=True)

    protein_annotation = relationship('Protein', back_populates='protein_sequence')


class Protein(Base):
    __tablename__ = 'protein_annotation'

    prot_id = Column(Integer, primary_key=True)
    prot_seq_id = Column(Integer, ForeignKey('protein_sequence.prot_seq_id'))
    prot_acc = Column(Text, unique=True)
    prot_name = Column(Text)

    protein_sequence = relationship('Protein_sequence', back_populates='protein_annotation')


def parse_fasta(path, prot_db='unknown', taxon_name=None, taxon_id=None):
    """Parse a fasta file (UniProt or NCBInr)."""
    for record in Bio.SeqIO.parse(path, 'fasta'):
        prot = collections.OrderedDict()  # fresh dict for each record
        prot['seq'] = str(record.seq)
        desc = record.description
        gi_num = re.findall(r'^gi\|([0-9]+)(?:\||\s|$)', desc)
        if gi_num:
            prot['prot_gi'] = gi_num[0]
            desc = re.sub(r'^gi\|([0-9]+)(?:\||\s|$)', '', desc)
        prot_db = re.findall(r'^([^|]+)\|', desc)
        if prot_db:
            prot_db = prot_db[0]
        prot['prot_db'] = prot_db
        prot_acc = re.findall(r'^[^|]+\|([^ ]+)', desc)[0]
        prot['prot_acc'] = prot_acc
        prot['prot_name'] = re.findall(r'^[^ ]+ (.+)', desc)[0]
        yield prot


def prot_db_from_fasta():
    """Create tables in an SQLite database from an input fasta file."""
    db = 'sqlite:///proteomic.db'
    engine = sqlalchemy.create_engine(db)
    Base.metadata.create_all(engine)
    Session = sqlalchemy.orm.sessionmaker(bind=engine)
    session = Session()
    conn = engine.connect()
    p = 'prot.fasta'
    for prot in parse_fasta(p):
        # prot is a dictionary storing info about the protein
        protein_sequence = Protein_sequence(prot_seq=prot['seq'])
        session.add(protein_sequence)
        try:
            session.commit()
        except:
            session.rollback()
        # choose only columns in table protein_annotation
        cols = [c.name for c in Protein.__table__.columns]
        annotation = {key: prot[key] for key in cols if key in prot}
        annotation['prot_seq_id'] = protein_sequence.prot_seq_id
        protein_annotation = Protein(**annotation)
        session.add(protein_annotation)
        session.commit()
    conn.close()


# run function to insert data into database
prot_db_from_fasta()
The problem is that I need the sequence ID (for the annotation table) and at the same time need to insert the sequence into the database unless it is already there. Using SQLAlchemy Core will not help; the problem is that I commit the session in every loop iteration, and that is very slow. Running this script on a 70 MB file takes 17 seconds; using sqlite3 directly instead of SQLAlchemy takes only 0.3 seconds.
I know it is better to insert the data in one big transaction, but then how do I get back the sequence IDs to use for my protein annotations?
Here is an example of the fasta file:
>gi|115646|sp|P02662.2|CASA1_BOVIN Alpha-S1-casein
MKLLILTCLVAVALARPKHPIKHQGLPQEVLNENLLRFFVAPFPEVFGKEKVNELSKDIGSESTEDQAME
DIKQMEAESISSSEEIVPNSVEQKHIQKEDVPSERYLGYLEQLLRLKKYKVPQLEIVPNSAEERLHSMKE
GIHAQQKEPMIGVNQELAYFYPELFRQFYQLDAYPSGAWYYVPLGTQYTDAPSFSDIPNPIGSENSEKTT
MPLW
>gi|115654|sp|P02663.2|CASA2_BOVIN Alpha-S2-casein
MKFFIFTCLLAVALAKNTMEHVSSSEESIISQETYKQEKNMAINPSKENLCSTFCKEVVRNANEEEYSIG
SSSEESAEVATEEVKITVDDKHYQKALNEINQFYQKFPQYLQYLYQGPIVLNPWDQVKRNAVPITPTLNR
EQLSTSEENSKKTVDMESTEVFTKKTKLTEEEKNRLNFLKKISQRYQKFALPQYLKTVYQHQKAMKPWIQ
PKTKVIPYVRYL
So what is good practice for inserting this data into the database?
Picture of my proteomic database.
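For what it's worth, here is a sketch of the one-big-transaction approach (the dedupe cache and the function name are my own, built on the models above): session.flush() sends the pending INSERT and populates prot_seq_id without committing, and a dict keyed by sequence text avoids re-inserting a sequence that was already seen.

def prot_db_from_fasta_batched(path='prot.fasta'):
    engine = sqlalchemy.create_engine('sqlite:///proteomic.db')
    Base.metadata.create_all(engine)
    session = sqlalchemy.orm.sessionmaker(bind=engine)()

    seq_ids = {}  # sequence text -> prot_seq_id
    cols = [c.name for c in Protein.__table__.columns]
    for prot in parse_fasta(path):
        seq = prot['seq']
        if seq not in seq_ids:
            protein_sequence = Protein_sequence(prot_seq=seq)
            session.add(protein_sequence)
            session.flush()  # emits the INSERT and fills in the PK; no commit
            seq_ids[seq] = protein_sequence.prot_seq_id
        annotation = {key: prot[key] for key in cols if key in prot}
        annotation['prot_seq_id'] = seq_ids[seq]
        session.add(Protein(**annotation))
    session.commit()  # one transaction for the whole file

For a 50-100 GB file the in-memory cache would grow large, so it may need to be bounded or combined with periodic commits.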
I am using this example to upload a CSV file into a SQLite database. This is my code:
from numpy import genfromtxt
from time import time
from datetime import datetime
from sqlalchemy import Column, Integer, Float, Date, String, VARCHAR
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker


def Load_Data(file_name):
    data = genfromtxt(file_name, delimiter=',')  # skiprows=1, converters={0: lambda s: str(s)})
    return data.tolist()


Base = declarative_base()


class cdb1(Base):
    # Tell SQLAlchemy what the table name is and if there's any
    # table-specific arguments it should know about
    __tablename__ = 'cdb1'
    __table_args__ = {'sqlite_autoincrement': True}

    # Tell SQLAlchemy the name of each column and its attributes:
    id = Column(Integer, primary_key=True, nullable=False)
    name = Column(VARCHAR(40))
    shack = Column(VARCHAR)
    db = Column(Integer)
    payments = Column(Integer)
    status = Column(VARCHAR)


if __name__ == "__main__":
    t = time()
    print 'creating database'
    # Create the database
    engine = create_engine('sqlite:///cdb.db')
    Base.metadata.create_all(engine)

    # Create the session
    session = sessionmaker()
    session.configure(bind=engine)
    s = session()

    try:
        file_name = 'client_db.csv'
        data = Load_Data(file_name)
        for i in data:
            record = cdb1(**{
                'name': i[0],
                'shack': i[1],
                'db': i[2],
                'payments': i[3],
                'status': i[4]
            })
            s.add(record)  # Add all the records
        s.commit()  # Attempt to commit all the records
    except:
        s.rollback()  # Rollback the changes on error
        print 'error in reading'
    finally:
        s.close()  # Close the connection
        print "Time elapsed: " + str(time() - t) + " s."  # 0.091s
and these are the first few rows of the CSV file:
Name,Shack,DB,Payments,Status
Loyiso Dwala,I156,13542,37,LightsOnly ON
Attwell Fayo,I157,13077,32,LightsON
David Mbhele,G25,13155,33,LightsON
The DB is created OK, but only some of the data is captured in the attributes: the 'payments' and 'db' columns are populated correctly, but everything else comes out as NULL.
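A likely culprit (an assumption based on numpy's defaults, not a tested diagnosis): genfromtxt defaults to dtype=float, so the string columns (Name, Shack, Status) come back as nan, which is stored as NULL, and the header row is never skipped. Something along these lines keeps the strings:

def Load_Data(file_name):
    # dtype=None infers a type per column; names=True consumes the
    # header row instead of treating it as data.
    data = genfromtxt(file_name, delimiter=',', dtype=None, names=True)
    return data.tolist()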
UPDATED CORRECT CODE (using pandas dataframe):
from numpy import genfromtxt
from time import time
from datetime import datetime
from sqlalchemy import Column, Integer, Float, Date, String, VARCHAR
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import csv
import pandas as pd

# def Load_Data(file_name):
#     data = csv.reader(file_name, delimiter=',')  # skiprows=1, converters={0: lambda s: str(s)})
#     return data.tolist()

Base = declarative_base()


class cdb1(Base):
    # Tell SQLAlchemy what the table name is and if there's any
    # table-specific arguments it should know about
    __tablename__ = 'cdb1'
    __table_args__ = {'sqlite_autoincrement': True}

    # Tell SQLAlchemy the name of each column and its attributes:
    id = Column(Integer, primary_key=True, nullable=False)
    Name = Column(VARCHAR(40))
    Shack = Column(VARCHAR)
    DB = Column(Integer)
    Payments = Column(Integer)
    Status = Column(VARCHAR)


engine = create_engine('sqlite:///cdb.db')
Base.metadata.create_all(engine)

file_name = 'client_db.csv'
df = pd.read_csv(file_name)
df.to_sql(con=engine, index_label='id', name=cdb1.__tablename__, if_exists='replace')
Are you familiar with pandas DataFrames? They are really simple to use (and debug):
pandas.read_csv(file_name)
In [5]: pandas.read_csv('/tmp/csvt.csv')
Out[5]:
           Name Shack     DB  Payments         Status
0  Loyiso Dwala  I156  13542        37  LightsOnly ON
1  Attwell Fayo  I157  13077        32       LightsON
2  David Mbhele   G25  13155        33       LightsON
For inserting the DataFrame's data into a table, you can simply use pandas.DataFrame.to_sql.
So your main code will end up looking something like this:
engine = create_engine('sqlite:///cdb.db')
Base.metadata.create_all(engine)
file_name = 'client_db.csv'
df = pandas.read_csv(file_name)
df.to_sql(con=engine, index_label='id', name=cdb1.__tablename__, if_exists='replace')
You should read further in the documentation link I added and set the function parameters to suit your purpose (especially look at if_exists, index, index_label, and dtype).
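For example (a sketch; the dtype mapping is my own guess at the target schema), keeping explicit control over the created column types:

df.to_sql(
    name=cdb1.__tablename__,
    con=engine,
    if_exists='replace',  # drop and recreate the table from the DataFrame
    index=True,
    index_label='id',     # write the DataFrame index as the 'id' column
    dtype={'Name': VARCHAR(40), 'Shack': VARCHAR(10), 'Status': VARCHAR(20)},
)

Note that if_exists='replace' discards the table created by Base.metadata.create_all, including options like sqlite_autoincrement; use if_exists='append' if you want to keep the ORM-defined schema.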
I can't seem to find any good documentation on this. I have a list of users and order amounts, and I want to display the users with the top 10 order amount totals. I've been having trouble creating a query that sufficiently extracts this data in SQLAlchemy. Is there a better way to approach this?
customers, amount = DBSession.query(Order.customer, func.sum(Order.amount).label('totalamount')).\
    group_by(Order.customer).\
    order_by(func.desc(totalamount)).\
    limit(10)

for a, b in zip(customers, amount):
    print a.name, str(amount)
from sqlalchemy import *
from sqlalchemy.orm import *
from sqlalchemy.ext.declarative import declarative_base
import random

Base = declarative_base()


class Customer(Base):
    __tablename__ = 'customer'

    id = Column(Integer, primary_key=True)
    name = Column(Unicode)
    orders = relationship("Order", backref="customer")


class Order(Base):
    __tablename__ = "order"

    id = Column(Integer, primary_key=True)
    customer_id = Column(Integer, ForeignKey('customer.id'))
    amount = Column(Integer)


e = create_engine("sqlite://", echo=True)
Base.metadata.create_all(e)

session = Session(e)
session.add_all([
    Customer(name="c%d" % i, orders=[
        Order(amount=random.randint(10, 100))
        for j in xrange(random.randint(0, 5))
    ])
    for i in xrange(100)
])

amount_sum = func.sum(Order.amount).label('totalamount')

amount = session.query(Order.customer_id, amount_sum).\
    group_by(Order.customer_id).\
    order_by(amount_sum.desc()).\
    limit(10).\
    subquery()

for a, b in session.query(Customer, amount.c.totalamount).\
        join(amount, amount.c.customer_id == Customer.id):
    print a.name, b
Some guidelines on the pattern are at http://www.sqlalchemy.org/docs/orm/tutorial.html#using-subqueries, but overall, start in SQL first.
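In that spirit, the ORM query above corresponds to roughly this raw SQL (a sketch against the tables created above; "order" needs quoting because it is a reserved word):

from sqlalchemy import text

top10 = text("""
    SELECT customer.name, totals.totalamount
    FROM customer
    JOIN (SELECT customer_id, SUM(amount) AS totalamount
          FROM "order"
          GROUP BY customer_id
          ORDER BY totalamount DESC
          LIMIT 10) AS totals
      ON totals.customer_id = customer.id
""")
for name, total in session.execute(top10):
    print(name, total)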
I'm new to SQLAlchemy and relational databases, and I'm trying to set up a model for an annotated lexicon. I want to support an arbitrary number of key-value annotations for the words which can be added or removed at runtime. Since there will be a lot of repetition in the names of the keys, I don't want to use this solution directly, although the code is similar.
My design has word objects and property objects. The words and properties are stored in separate tables with a property_values table that links the two. Here's the code:
from sqlalchemy import Column, Integer, String, Table, create_engine
from sqlalchemy import MetaData, ForeignKey
from sqlalchemy.orm import relation, mapper, sessionmaker
from sqlalchemy.ext.declarative import declarative_base

engine = create_engine('sqlite:///test.db', echo=True)
meta = MetaData(bind=engine)

property_values = Table('property_values', meta,
    Column('word_id', Integer, ForeignKey('words.id')),
    Column('property_id', Integer, ForeignKey('properties.id')),
    Column('value', String(20))
)

words = Table('words', meta,
    Column('id', Integer, primary_key=True),
    Column('name', String(20)),
    Column('freq', Integer)
)

properties = Table('properties', meta,
    Column('id', Integer, primary_key=True),
    Column('name', String(20), nullable=False, unique=True)
)

meta.create_all()


class Word(object):
    def __init__(self, name, freq=1):
        self.name = name
        self.freq = freq


class Property(object):
    def __init__(self, name):
        self.name = name


mapper(Property, properties)
Now I'd like to be able to do the following:
Session = sessionmaker(bind=engine)
s = Session()
word = Word('foo', 42)
word['bar'] = 'yes' # or word.bar = 'yes' ?
s.add(word)
s.commit()
Ideally this should add 1|foo|42 to the words table, add 1|bar to the properties table, and add 1|1|yes to the property_values table. However, I don't have the right mappings and relations in place to make this happen. I get the sense from reading the documentation at http://www.sqlalchemy.org/docs/05/mappers.html#association-pattern that I want to use an association proxy or something of that sort here, but the syntax is unclear to me. I experimented with this:
mapper(Word, words, properties={
    'properties': relation(Property, secondary=property_values)
})
but this mapper only fills in the foreign key values, and I need to fill in the other value as well. Any assistance would be greatly appreciated.
Simply use Dictionary-Based Collections mapping, an out-of-the-box solution to your question. An extract from the link:
from sqlalchemy.orm.collections import column_mapped_collection, attribute_mapped_collection, mapped_collection

mapper(Item, items_table, properties={
    # key by column
    'notes': relation(Note, collection_class=column_mapped_collection(notes_table.c.keyword)),
    # or named attribute
    'notes2': relation(Note, collection_class=attribute_mapped_collection('keyword')),
    # or any callable
    'notes3': relation(Note, collection_class=mapped_collection(lambda entity: entity.a + entity.b))
})

# ...
item = Item()
item.notes['color'] = Note('color', 'blue')
print item.notes['color']
Or try the solution for Inserting data in Many to Many relationship in SQLAlchemy. Obviously you have to replace the list logic with the dict one.
Also, ask the question author to post his final code with associationproxy, which he mentioned he used in the end.
There is a very similar question with a slight interface difference, but it's easy to fix by defining __getitem__, __setitem__, and __delitem__ methods.
A comment for Brent, above:
You can use session.flush() instead of commit() to get an id on your model instances. flush() will execute the necessary SQL but will not commit, so you can roll back later if needed.
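A minimal illustration (hypothetical property name, using the Property model and session s from the solution below):

p = Property('hypothetical')
s.add(p)
s.flush()      # emits the INSERT; p.id is now populated
print(p.id)
s.rollback()   # still possible, since nothing was committed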
I ended up combining Denis's and van's posts to form the solution:
from sqlalchemy import Column, Integer, String, Table, create_engine
from sqlalchemy import MetaData, ForeignKey
from sqlalchemy.orm import relation, mapper, sessionmaker
from sqlalchemy.orm.collections import attribute_mapped_collection
from sqlalchemy.ext.associationproxy import association_proxy
from sqlalchemy.ext.declarative import declarative_base

meta = MetaData()
Base = declarative_base(metadata=meta, name='Base')


class PropertyValue(Base):
    __tablename__ = 'property_values'

    WordID = Column(Integer, ForeignKey('words.id'), primary_key=True)
    PropID = Column(Integer, ForeignKey('properties.id'), primary_key=True)
    Value = Column(String(20))


def _property_for_name(prop_name):
    return s.query(Property).filter_by(name=prop_name).first()


def _create_propval(prop_name, prop_val):
    p = _property_for_name(prop_name)
    if not p:
        p = Property(prop_name)
        s.add(p)
        s.commit()
    return PropertyValue(PropID=p.id, Value=prop_val)


class Word(Base):
    __tablename__ = 'words'

    id = Column(Integer, primary_key=True)
    string = Column(String(20), nullable=False)
    freq = Column(Integer)

    _props = relation(PropertyValue, collection_class=attribute_mapped_collection('PropID'), cascade='all, delete-orphan')
    props = association_proxy('_props', 'Value', creator=_create_propval)

    def __init__(self, string, freq=1):
        self.string = string
        self.freq = freq

    def __getitem__(self, prop):
        p = _property_for_name(prop)
        if p:
            return self.props[p.id]
        else:
            return None

    def __setitem__(self, prop, val):
        self.props[prop] = val

    def __delitem__(self, prop):
        p = _property_for_name(prop)
        if p:
            del self.props[prop]


class Property(Base):
    __tablename__ = 'properties'

    id = Column(Integer, primary_key=True)
    name = Column(String(20), nullable=False, unique=True)

    def __init__(self, name):
        self.name = name


engine = create_engine('sqlite:///test.db', echo=False)
Session = sessionmaker(bind=engine)
s = Session()
meta.create_all(engine)
The test code is as follows:
word = Word('foo', 42)
word['bar'] = "yes"
word['baz'] = "certainly"
s.add(word)
word2 = Word('quux', 20)
word2['bar'] = "nope"
word2['groink'] = "nope"
s.add(word2)
word2['groink'] = "uh-uh"
del word2['bar']
s.commit()
word = s.query(Word).filter_by(string="foo").first()
print word.freq, word['baz']
# prints 42 certainly
The contents of the database are:
$ sqlite3 test.db "select * from property_values"
1|2|certainly
1|1|yes
2|3|uh-uh
$ sqlite3 test.db "select * from words"
1|foo|42
2|quux|20
$ sqlite3 test.db "select * from properties"
1|bar
2|baz
3|groink