How do I parse a big file (50-100 GB) into my database using SQLAlchemy? Let's say I have two tables.
import collections
import re
import Bio.SeqIO
import sqlalchemy
from sqlalchemy import ForeignKey, UniqueConstraint
from sqlalchemy import Column, Float, Integer, String, Text, DateTime
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.sql.expression import ClauseElement
Base = declarative_base()
class Protein_sequence(Base):
    __tablename__ = 'protein_sequence'
    prot_seq_id = Column(Integer, primary_key=True)
    prot_seq = Column(Text, unique=True)
    protein_annotation = relationship('Protein', back_populates='protein_sequence')
class Protein(Base):
    __tablename__ = 'protein_annotation'
    prot_id = Column(Integer, primary_key=True)
    prot_seq_id = Column(Integer, ForeignKey('protein_sequence.prot_seq_id'))
    prot_acc = Column(Text, unique=True)
    prot_name = Column(Text)
    protein_sequence = relationship('Protein_sequence', back_populates='protein_annotation')
def parse_fasta(path, prot_db='unknown', taxon_name=None, taxon_id=None):
    """Parse a fasta file (UniProt or NCBInr), yielding one dict per record."""
    for record in Bio.SeqIO.parse(path, 'fasta'):
        prot = collections.OrderedDict()  # fresh dict per record, so each yield is independent
        prot['seq'] = str(record.seq)
        desc = record.description
        gi_num = re.findall(r'^gi\|([0-9]+)(?:\||\s|$)', desc)
        if gi_num:
            prot['prot_gi'] = gi_num[0]
            desc = re.sub(r'^gi\|[0-9]+(?:\||\s|$)', '', desc)
        db = re.findall(r'^([^|]+)\|', desc)
        if db:  # keep the prot_db argument as the fallback instead of an empty list
            prot_db = db[0]
        prot['prot_db'] = prot_db
        prot['prot_acc'] = re.findall(r'^[^|]+\|([^ ]+)', desc)[0]
        prot['prot_name'] = re.findall(r'^[^ ]+ (.+)', desc)[0]
        yield prot
def prot_db_from_fasta():
    """Create the tables in an SQLite database and fill them from a fasta file."""
    engine = sqlalchemy.create_engine('sqlite:///proteomic.db')
    Base.metadata.create_all(engine)
    Session = sqlalchemy.orm.sessionmaker(bind=engine)
    session = Session()
    for prot in parse_fasta('prot.fasta'):
        # prot is a dictionary storing info about one protein
        protein_sequence = Protein_sequence(prot_seq=prot['seq'])
        session.add(protein_sequence)
        try:
            session.commit()
        except sqlalchemy.exc.IntegrityError:
            # the sequence is already there: roll back and fetch the existing row
            session.rollback()
            protein_sequence = (session.query(Protein_sequence)
                                .filter_by(prot_seq=prot['seq']).one())
        # choose only columns in table protein_annotation
        cols = [c.name for c in Protein.__table__.columns]
        annotation = {key: prot[key] for key in cols if key in prot}
        annotation['prot_seq_id'] = protein_sequence.prot_seq_id
        protein_annotation = Protein(**annotation)
        session.add(protein_annotation)
        session.commit()

# run the function to insert data into the database
prot_db_from_fasta()
The problem is that I need the sequence ID (for the annotation table) and at the same time must insert the sequence into the database unless it is already there. Using SQLAlchemy Core will not help here; the real problem is that I commit the session in every loop iteration, which is very slow. Running this script on a 70 MB file takes 17 s, while plain sqlite3 takes only 0.3 s.
I know it is better to insert the data in one big transaction, but how do I do that? I would not get the sequence IDs back to use for my protein annotations.
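Here is a minimal sketch of the direction I have in mind, reusing the models above (the batch size and the in-memory sequence cache are my own assumptions; for a 50-100 GB file the cache itself would not fit in RAM, which is part of the problem):

def prot_db_from_fasta_bulk(path='prot.fasta', batch_size=10000):
    """Sketch: one transaction, flush() to obtain IDs, a single commit at the end."""
    engine = sqlalchemy.create_engine('sqlite:///proteomic.db')
    Base.metadata.create_all(engine)
    session = sqlalchemy.orm.sessionmaker(bind=engine)()
    seq_ids = {}  # sequence string -> prot_seq_id; assumes it fits in memory
    for n, prot in enumerate(parse_fasta(path)):
        seq = prot['seq']
        if seq not in seq_ids:
            row = Protein_sequence(prot_seq=seq)
            session.add(row)
            session.flush()  # emits the INSERT and assigns row.prot_seq_id without committing
            seq_ids[seq] = row.prot_seq_id
        cols = [c.name for c in Protein.__table__.columns]
        annotation = {key: prot[key] for key in cols if key in prot}
        annotation['prot_seq_id'] = seq_ids[seq]
        session.add(Protein(**annotation))
        if n % batch_size == 0:
            session.flush()  # push pending rows to the database in batches
    session.commit()  # single commit for the whole file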
Here is an example of the fasta file.
>gi|115646|sp|P02662.2|CASA1_BOVIN Alpha-S1-casein
MKLLILTCLVAVALARPKHPIKHQGLPQEVLNENLLRFFVAPFPEVFGKEKVNELSKDIGSESTEDQAME
DIKQMEAESISSSEEIVPNSVEQKHIQKEDVPSERYLGYLEQLLRLKKYKVPQLEIVPNSAEERLHSMKE
GIHAQQKEPMIGVNQELAYFYPELFRQFYQLDAYPSGAWYYVPLGTQYTDAPSFSDIPNPIGSENSEKTT
MPLW
>gi|115654|sp|P02663.2|CASA2_BOVIN Alpha-S2-casein
MKFFIFTCLLAVALAKNTMEHVSSSEESIISQETYKQEKNMAINPSKENLCSTFCKEVVRNANEEEYSIG
SSSEESAEVATEEVKITVDDKHYQKALNEINQFYQKFPQYLQYLYQGPIVLNPWDQVKRNAVPITPTLNR
EQLSTSEENSKKTVDMESTEVFTKKTKLTEEEKNRLNFLKKISQRYQKFALPQYLKTVYQHQKAMKPWIQ
PKTKVIPYVRYL
So what is good practice for inserting data into a database?
Picture of my proteomic database.
Related
For the back-end, you will find an XML file attached, for which you need:
To create a parser in nodejs/php/python to read the XML
To create a MySQL database (schema) that stores the XML data
To use an ORM to communicate with the database
The script should handle insert/update/delete
No need for a frontend (a CLI is enough)
I tried to solve this in Python, but I am stuck at the function I need to create to store the data from the XML in the database table; my rough sketch of it follows the code below.
# import xml element tree
import xml.etree.ElementTree as ET
# import mysql connector
import mysql.connector
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
# give the connection parameters
# user name is root
# password is 2634687
# server is localhost
database = mysql.connector.connect(user='root', password='2634687', host='localhost')
# reading xml file , file name is dataset.xml
tree = ET.parse('dataset.xml')
# creating the cursor object
c = database.cursor()
# c.execute("CREATE DATABASE testingdb")
# print("testdb Data base is created")
# Connect to a MySQL database
engine = create_engine('mysql+pymysql://root:2634687@localhost/testingdb', echo=True)
# Define the Product model
Base = declarative_base()
class Product(Base):
    __tablename__ = 'Product'
    productId = Column(Integer, primary_key=True)
    cedi = Column(String(100))
    childWeightFrom = Column(String(100))
    childWeightTo = Column(Integer)
    color_code = Column(Integer)
    color_description = Column(String(100))
    countryImages = Column(String(100))
    defaultSku = Column(String(100))
    preferredEan = Column(Integer)
    sapAssortmentLevel = Column(String(100))
    sapPrice = Column(Integer)
    season = Column(String(100))
    showOnLineSku = Column(String(100))
    size_code = Column(String(100))
    size_description = Column(String(100))
    skuID = Column(Integer)
    skuName = Column(String(100))
    stateOfArticle = Column(String(100))
    umSAPprice = Column(String(10))
    volume = Column(Integer)
    weight = Column(Integer)
# Create the users table
Base.metadata.create_all(engine)
# Create a session to interact with the database
Session = sessionmaker(bind=engine)
session = Session()
# Insert a new product
new_product = Product(cedi='CD01')
session.add(new_product)
session.commit()
# Update the product (a new variable name, so the Product class is not rebound)
product = session.query(Product).filter_by(cedi='CD01').first()
product.childWeightFrom = 31
session.commit()
# Delete the product
session.delete(product)
session.commit()
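For reference, this is the kind of function I am stuck on. A rough sketch of what I think it should do (the <product> element and its children named after the Product columns are assumptions about the XML layout, and type coercion is left to the driver):

def load_products(xml_path, session):
    """Insert or update one Product row per <product> element of dataset.xml."""
    columns = {c.name for c in Product.__table__.columns}
    root = ET.parse(xml_path).getroot()
    for elem in root.iter('product'):
        # assume each child element is named after a Product column
        fields = {child.tag: child.text for child in elem if child.tag in columns}
        pk = int(fields.pop('productId'))
        product = session.query(Product).get(pk)
        if product is None:  # insert new rows
            product = Product(productId=pk)
            session.add(product)
        for name, value in fields.items():  # update existing ones
            setattr(product, name, value)
    session.commit()

Deletes would presumably follow the same pattern, calling session.delete(product) for IDs that no longer appear in the file.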
I have a query in SQLAlchemy that is structurally the same as the MWE below, which I am using to query an SQLite DB.
#!/usr/bin/env python3
from timeit import timeit
import os.path
from tqdm import tqdm
from pprint import pprint
from datetime import date, timedelta
from random import choice, randint
from sqlalchemy import (
    Column, Integer, Date, String, ForeignKey, sql, orm, create_engine)
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.declarative import declarative_base
SAVE_INTERVAL = 365
START_DATE = date(2015, 1, 1)
DATE_RANGE = SAVE_INTERVAL * 5
NUM_AS = 10000
NAME_RANGE = NUM_AS // 4
NUM_BS = NUM_AS // 4
Base = declarative_base()
class A(Base):
    __tablename__ = 'a'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    save_date = Column(Date)

    def __init__(self, name, save_date):
        self.name = name
        self.save_date = save_date

    bs = relationship('B', back_populates='a')

    def __repr__(self):
        return 'A({}, {})'.format(self.name,
                                  self.save_date.strftime('%Y/%m/%d'))

class B(Base):
    __tablename__ = 'b'
    id = Column(Integer, primary_key=True)
    a_id = Column(Integer, ForeignKey('a.id'))
    # Relationships
    a = relationship('A', back_populates='bs')

    def __init__(self, a):
        self.a = a
db_path = os.path.expanduser('~/test-query{}.db'.format(NUM_AS))
engine = create_engine('sqlite:///' + db_path)
Session = sessionmaker(bind=engine)
session = Session()
if not os.path.exists(db_path):
    Base.metadata.create_all(engine)
    print("Generating test data")
    # Create dummy As and Bs data
    a_list = []
    for _ in tqdm(range(NUM_AS), "Generating As"):
        a = A('name{:03}'.format(randint(1, NAME_RANGE)),
              START_DATE + timedelta(days=randint(0, DATE_RANGE)))
        a_list.append(a)
        session.add(a)
    session.commit()
    for _ in tqdm(range(NUM_BS), "Generating Bs"):
        session.add(B(choice(a_list)))
    session.commit()
# Create query for sessions that still need to be reported
A_alias = orm.aliased(A)
query = (
    session.query(A)
    .filter(
        # Filter out rows of the same name that have newer versions
        ~(session.query(A_alias.id)
          .filter(
              A_alias.name == A.name,
              A_alias.save_date > A.save_date).exists()),
        # Filter out rows that have older versions referenced in
        # table B that were saved less than SAVE_INTERVAL days earlier
        ~(session.query(A_alias.id)
          .join(B)  # Only select sessions with a report
          .filter(
              A_alias.name == A.name,
              (sql.func.abs(
                  sql.func.julianday(A.save_date) -
                  sql.func.julianday(A_alias.save_date)) <=
               SAVE_INTERVAL)).exists())))
print(query)

def count():
    return query.count()  # return the result so it can be printed below

print("Calculating exec time...")
print('{} s'.format(timeit(count, number=1)))
pprint("Number of returned records {}".format(count()))
It works fine, but it is a bit slow on the production database (around 8,000 rows), making the web page sluggish.
So I am wondering whether there is a way to speed it up, either by altering the SQL or by changing how the ORM objects are generated?
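One thing I have tried in this direction (not verified against the production schema) is adding indexes for the columns the two EXISTS subqueries filter on, so SQLite does not have to scan table a for every outer row:

from sqlalchemy import Index

# Composite index covering the name/save_date predicates of both subqueries,
# plus an index for the join from B back to A.
Index('ix_a_name_save_date', A.__table__.c.name, A.__table__.c.save_date)
Index('ix_b_a_id', B.__table__.c.a_id)

Declared before Base.metadata.create_all(engine), these are created together with the tables; on an existing database each index object's .create(engine) would be called instead. The julianday() arithmetic still cannot use an index, but the A_alias.name == A.name predicates should.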
Here is an absurd problem with SQLAlchemy that seems easy! First, this is my config file for connecting to a MySQL database:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
engine = create_engine('mysql://root:@localhost:3306/digi')
and then I am trying to create a table called 'sale-history':
from config import *
from sqlalchemy import *
class Sale(Base):
    __tablename__ = 'sale-history'
    order_id = column(Integer, primary_key=True)
    customer_id = column(Integer)
    item_id = column(Integer)  # foreign key with product list
    cartFinalize_dateTime = column(DATETIME)
    amount_ordrered = column(Integer)
    city_name = column(String(191))
    quantity_ordered = column(Integer)

    def __repr__(self):
        return "<Sale(city_name='%s')>" % (self.city_name)

Sale.__table__
Base.metadata.create_all(engine)
Now, what I wonder is that
Sale.__table__
and
Base.metadata.create_all(engine)
are not known to my code. More accurately, they are not among the suggestions shown by the PyCharm editor. Debugging the code does not throw any error (returns 0). What should I do to create the tables?
I appreciate your consideration so much!
The code is using column to define columns in the table but it should be using Column (note the upper-case "C").
A few tips/comments:
PyCharm may provide better support if you avoid the from module import * idiom. You can alias module names if they are too long to type, for example import sqlalchemy as sa.
You can see the SQL generated by the engine by passing echo=True to create_engine.
Table names with hyphens need to be quoted with backticks to be valid. SQLAlchemy does this automatically, but other applications may not, so using underscores instead may be more convenient.
The final code might look like this:
config
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
engine = create_engine('mysql://root:@localhost:3306/test', echo=True)
Model
import sqlalchemy as sa
import config

class Sale(config.Base):
    __tablename__ = 'sale-history'
    order_id = sa.Column(sa.Integer, primary_key=True)
    customer_id = sa.Column(sa.Integer)
    item_id = sa.Column(sa.Integer)  # foreign key with product list
    cartFinalize_dateTime = sa.Column(sa.DATETIME)
    amount_ordrered = sa.Column(sa.Integer)
    city_name = sa.Column(sa.String(191))
    quantity_ordered = sa.Column(sa.Integer)

    def __repr__(self):
        return "<Sale(city_name='%s')>" % (self.city_name)

config.Base.metadata.create_all(config.engine)
I am using this example to upload a CSV file into an SQLite database. This is my code:
from numpy import genfromtxt
from time import time
from datetime import datetime
from sqlalchemy import Column, Integer, Float, Date, String, VARCHAR
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
def Load_Data(file_name):
    data = genfromtxt(file_name, delimiter=',')  # skiprows=1, converters={0: lambda s: str(s)})
    return data.tolist()
Base = declarative_base()
class cdb1(Base):
    # Tell SQLAlchemy what the table name is and if there's any table-specific arguments it should know about
    __tablename__ = 'cdb1'
    __table_args__ = {'sqlite_autoincrement': True}
    # Tell SQLAlchemy the name of each column and its attributes:
    id = Column(Integer, primary_key=True, nullable=False)
    name = Column(VARCHAR(40))
    shack = Column(VARCHAR)
    db = Column(Integer)
    payments = Column(Integer)
    status = Column(VARCHAR)
if __name__ == "__main__":
    t = time()
    print 'creating database'
    # Create the database
    engine = create_engine('sqlite:///cdb.db')
    Base.metadata.create_all(engine)
    # Create the session
    session = sessionmaker()
    session.configure(bind=engine)
    s = session()
    try:
        file_name = 'client_db.csv'
        data = Load_Data(file_name)
        for i in data:
            record = cdb1(**{
                'name': i[0],
                'shack': i[1],
                'db': i[2],
                'payments': i[3],
                'status': i[4]
            })
            s.add(record)  # Add all the records
        s.commit()  # Attempt to commit all the records
    except:
        s.rollback()  # Rollback the changes on error
        print 'error in reading'
    finally:
        s.close()  # Close the connection
    print "Time elapsed: " + str(time() - t) + " s."  # 0.091s
and these are the first few rows of the CSV file:
Name,Shack,DB,Payments,Status
Loyiso Dwala,I156,13542,37,LightsOnly ON
Attwell Fayo,I157,13077,32,LightsON
David Mbhele,G25,13155,33,LightsON
The DB is created OK, but only some of the data is captured into the attributes: the 'payments' and 'db' columns are populated correctly, but everything else comes out as NULL.
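For reference, the NULLs come from numpy rather than SQLAlchemy: genfromtxt defaults to dtype=float, so every non-numeric field parses as nan. A version of the loader that should keep the strings (dtype=None asks numpy to infer a type per column; the encoding argument requires NumPy 1.14+):

def Load_Data(file_name):
    # dtype=None: infer each column's type instead of forcing float
    # names=True: consume the header row instead of parsing it as data
    # encoding='utf-8': return str values rather than bytes
    data = genfromtxt(file_name, delimiter=',', dtype=None,
                      names=True, encoding='utf-8')
    return data.tolist()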
UPDATED CORRECT CODE (using pandas dataframe):
from numpy import genfromtxt
from time import time
from datetime import datetime
from sqlalchemy import Column, Integer, Float, Date, String, VARCHAR
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import csv
import pandas as pd
# def Load_Data(file_name):
#     data = csv.reader(file_name, delimiter=',')  # skiprows=1, converters={0: lambda s: str(s)})
#     return data.tolist()
Base = declarative_base()
class cdb1(Base):
    # Tell SQLAlchemy what the table name is and if there's any table-specific arguments it should know about
    __tablename__ = 'cdb1'
    __table_args__ = {'sqlite_autoincrement': True}
    # Tell SQLAlchemy the name of each column and its attributes:
    id = Column(Integer, primary_key=True, nullable=False)
    Name = Column(VARCHAR(40))
    Shack = Column(VARCHAR)
    DB = Column(Integer)
    Payments = Column(Integer)
    Status = Column(VARCHAR)
engine = create_engine('sqlite:///cdb.db')
Base.metadata.create_all(engine)
file_name = 'client_db.csv'
df = pd.read_csv(file_name)
df.to_sql(con=engine, index_label='id', name=cdb1.__tablename__, if_exists='replace')
Are you familiar with the pandas DataFrame? It is really simple to use (and debug):
pandas.read_csv(file_name)
In [5]: pandas.read_csv('/tmp/csvt.csv')
Out[5]:
Name Shack DB Payments Status
0 Loyiso Dwala I156 13542 37 LightsOnly ON
1 Attwell Fayo I157 13077 32 LightsON
2 David Mbhele G25 13155 33 LightsON
For inserting the DataFrames data into a table, you can simply use pandas.DataFrame.to_sql
So your main code will end up looking something like this:
engine = create_engine('sqlite:///cdb.db')
Base.metadata.create_all(engine)
file_name = 'client_db.csv'
df = pandas.read_csv(file_name)
df.to_sql(con=engine, index_label='id', name=cdb1.__tablename__, if_exists='replace')
You should read further in the documentation link I added and set the function parameters to suit your purpose (especially look at if_exists, index, index_label and dtype).
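For instance, a call with those parameters spelled out might look like this (the values are only illustrative, reusing the engine and model above):

df.to_sql(
    name=cdb1.__tablename__,
    con=engine,
    if_exists='append',           # keep the table created by the model; 'replace' drops and recreates it
    index=True,                   # write the DataFrame index...
    index_label='id',             # ...as the 'id' primary-key column
    dtype={'Name': VARCHAR(40)},  # override the SQL type pandas would otherwise infer
)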
I'm new to SQLAlchemy and relational databases, and I'm trying to set up a model for an annotated lexicon. I want to support an arbitrary number of key-value annotations for the words which can be added or removed at runtime. Since there will be a lot of repetition in the names of the keys, I don't want to use this solution directly, although the code is similar.
My design has word objects and property objects. The words and properties are stored in separate tables with a property_values table that links the two. Here's the code:
from sqlalchemy import Column, Integer, String, Table, create_engine
from sqlalchemy import MetaData, ForeignKey
from sqlalchemy.orm import relation, mapper, sessionmaker
from sqlalchemy.ext.declarative import declarative_base
engine = create_engine('sqlite:///test.db', echo=True)
meta = MetaData(bind=engine)
property_values = Table('property_values', meta,
    Column('word_id', Integer, ForeignKey('words.id')),
    Column('property_id', Integer, ForeignKey('properties.id')),
    Column('value', String(20))
)
words = Table('words', meta,
    Column('id', Integer, primary_key=True),
    Column('name', String(20)),
    Column('freq', Integer)
)
properties = Table('properties', meta,
    Column('id', Integer, primary_key=True),
    Column('name', String(20), nullable=False, unique=True)
)
meta.create_all()
class Word(object):
    def __init__(self, name, freq=1):
        self.name = name
        self.freq = freq

class Property(object):
    def __init__(self, name):
        self.name = name
mapper(Property, properties)
Now I'd like to be able to do the following:
Session = sessionmaker(bind=engine)
s = Session()
word = Word('foo', 42)
word['bar'] = 'yes' # or word.bar = 'yes' ?
s.add(word)
s.commit()
Ideally this should add 1|foo|42 to the words table, add 1|bar to the properties table, and add 1|1|yes to the property_values table. However, I don't have the right mappings and relations in place to make this happen. I get the sense from reading the documentation at http://www.sqlalchemy.org/docs/05/mappers.html#association-pattern that I want to use an association proxy or something of that sort here, but the syntax is unclear to me. I experimented with this:
mapper(Word, words, properties={
    'properties': relation(Property, secondary=property_values)
})
but this mapper only fills in the foreign key values, and I need to fill in the other value as well. Any assistance would be greatly appreciated.
Simply use the Dictionary-Based Collections mapping - an out-of-the-box solution to your question. Extract from the link:
from sqlalchemy.orm.collections import column_mapped_collection, attribute_mapped_collection, mapped_collection

mapper(Item, items_table, properties={
    # key by column
    'notes': relation(Note, collection_class=column_mapped_collection(notes_table.c.keyword)),
    # or named attribute
    'notes2': relation(Note, collection_class=attribute_mapped_collection('keyword')),
    # or any callable
    'notes3': relation(Note, collection_class=mapped_collection(lambda entity: entity.a + entity.b))
})
# ...
item = Item()
item.notes['color'] = Note('color', 'blue')
print item.notes['color']
Or try the solution for Inserting data in Many to Many relationship in SQLAlchemy; obviously you have to replace the list logic with the dict one.
I would ask the question's author to post his final code with the associationproxy, which he mentioned he used in the end.
There is a very similar question with a slight interface difference, but it is easy to fix by defining __getitem__, __setitem__ and __delitem__ methods.
Comment for Brent, above: you can use session.flush() instead of commit() to get an id on your model instances. flush() will execute the necessary SQL but will not commit, so you can roll back later if needed.
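A minimal illustration, using the Word model and session s from the solution below:

word = Word('example', 1)
s.add(word)
s.flush()       # emits the INSERT and assigns word.id; the transaction stays open
print(word.id)  # the id is already populated here
s.rollback()    # undoes the INSERT; nothing was committed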
I ended up combining Denis and van's posts together to form the solution:
from sqlalchemy import Column, Integer, String, Table, create_engine
from sqlalchemy import MetaData, ForeignKey
from sqlalchemy.orm import relation, mapper, sessionmaker
from sqlalchemy.orm.collections import attribute_mapped_collection
from sqlalchemy.ext.associationproxy import association_proxy
from sqlalchemy.ext.declarative import declarative_base
meta = MetaData()
Base = declarative_base(metadata=meta, name='Base')
class PropertyValue(Base):
    __tablename__ = 'property_values'
    WordID = Column(Integer, ForeignKey('words.id'), primary_key=True)
    PropID = Column(Integer, ForeignKey('properties.id'), primary_key=True)
    Value = Column(String(20))
def _property_for_name(prop_name):
    return s.query(Property).filter_by(name=prop_name).first()

def _create_propval(prop_name, prop_val):
    p = _property_for_name(prop_name)
    if not p:
        p = Property(prop_name)
        s.add(p)
        s.commit()
    return PropertyValue(PropID=p.id, Value=prop_val)
class Word(Base):
    __tablename__ = 'words'
    id = Column(Integer, primary_key=True)
    string = Column(String(20), nullable=False)
    freq = Column(Integer)
    _props = relation(PropertyValue, collection_class=attribute_mapped_collection('PropID'), cascade='all, delete-orphan')
    props = association_proxy('_props', 'Value', creator=_create_propval)

    def __init__(self, string, freq=1):
        self.string = string
        self.freq = freq

    def __getitem__(self, prop):
        p = _property_for_name(prop)
        if p:
            return self.props[p.id]
        else:
            return None

    def __setitem__(self, prop, val):
        self.props[prop] = val

    def __delitem__(self, prop):
        p = _property_for_name(prop)
        if p:
            del self.props[prop]
class Property(Base):
    __tablename__ = 'properties'
    id = Column(Integer, primary_key=True)
    name = Column(String(20), nullable=False, unique=True)

    def __init__(self, name):
        self.name = name
engine = create_engine('sqlite:///test.db', echo=False)
Session = sessionmaker(bind=engine)
s = Session()
meta.create_all(engine)
The test code is as follows:
word = Word('foo', 42)
word['bar'] = "yes"
word['baz'] = "certainly"
s.add(word)
word2 = Word('quux', 20)
word2['bar'] = "nope"
word2['groink'] = "nope"
s.add(word2)
word2['groink'] = "uh-uh"
del word2['bar']
s.commit()
word = s.query(Word).filter_by(string="foo").first()
print word.freq, word['baz']
# prints 42 certainly
The contents of the databases are:
$ sqlite3 test.db "select * from property_values"
1|2|certainly
1|1|yes
2|3|uh-uh
$ sqlite3 test.db "select * from words"
1|foo|42
2|quux|20
$ sqlite3 test.db "select * from properties"
1|bar
2|baz
3|groink