SQLAlchemy - copy schema and data of subquery to another database - python

I am trying to copy data from a subquery from postgres (from_engine) to a sqlite database. I can achieve this for copying a table with the following commands:
smeta = MetaData(bind=from_engine)
table = Table(table_name, smeta, autoload=True)
table.metadata.create_all(to_engine)
However, I am not sure how to achieve the same for a subquery statement.
-Sandeep
Edit:
Follow up on the answer. Once I have created the table I want to create a subquery stmt as follows:
table = Table("newtable", dest_metadata, *columns)
stmt = dest_session.query(table).subquery();
However, the last stmt fails with an error:
cursor.execute(statement, parameters)
sqlalchemy.exc.ProgrammingError: (ProgrammingError) relation "newtable" does not exist
LINE 3: FROM newtable) AS anon_1

One way that works at least in some cases:
Use column_descriptions of a query object to get some information about the columns in the result set.
With that information you can build the schema to create the new table in the other database.
Run the query in the source database and insert the results into the new table.
First, some setup for the example:
from sqlalchemy import create_engine, MetaData
from sqlalchemy import Column, Integer, String, Table
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
# Engine to the source database to query the data from
# (postgresql in the question; in-memory sqlite here so the
# example is self-contained)
source_engine = create_engine('sqlite:///:memory:', echo=True)
SourceSession = sessionmaker(source_engine)
# Engine to the database to store the results in
# (sqlite)
dest_engine = create_engine('sqlite:///:memory:', echo=True)
DestSession = sessionmaker(dest_engine)
# Create a toy table and fill it with some data
Base = declarative_base()
class Pet(Base):
    __tablename__ = 'pets'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    race = Column(String)
Base.metadata.create_all(source_engine)
sourceSession = SourceSession()
sourceSession.add(Pet(name="Fido", race="cat"))
sourceSession.add(Pet(name="Ceasar", race="cat"))
sourceSession.add(Pet(name="Rex", race="dog"))
sourceSession.commit()
Now to the interesting bit:
# This is the query we want to persist in a new table:
query = sourceSession.query(Pet.name, Pet.race).filter_by(race='cat')
# Build the schema for the new table
# based on the columns that will be returned
# by the query:
metadata = MetaData(bind=dest_engine)
columns = [Column(desc['name'], desc['type']) for desc in query.column_descriptions]
column_names = [desc['name'] for desc in query.column_descriptions]
table = Table("newtable", metadata, *columns)
# Create the new table in the destination database
table.create(dest_engine)
# Finally execute the query
destSession = DestSession()
for row in query:
    destSession.execute(table.insert(row))
destSession.commit()
There are more efficient ways to do the last loop, but bulk inserts are a topic of their own; a hedged sketch follows.
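For instance, a minimal bulk-insert sketch reusing table, column_names, query, and dest_engine from above (an assumption-laden sketch, not the author's code): passing a list of parameter dictionaries to execute() lets the driver use executemany.
# A sketch: one executemany round trip instead of one insert per row.
rows = [dict(zip(column_names, row)) for row in query]
if rows:  # skip the call entirely if the query returned nothing
    with dest_engine.begin() as conn:
        conn.execute(table.insert(), rows)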

You can also go through a pandas DataFrame. For example, one approach would use pandas.read_sql(query, source.connection) and df.to_sql(table_name, con=destination.connection).
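A minimal sketch of that pandas route, assuming the source_engine/dest_engine pair and the query from the answer above (the table name "newtable" is illustrative):
import pandas as pd
# A sketch: let pandas read the query result and create/fill the
# destination table in one call each.
df = pd.read_sql(query.statement, source_engine)
df.to_sql("newtable", con=dest_engine, index=False, if_exists="replace")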

Related

Creating additional column while importing csv via df.to_sql in sqlalchemy framework

I have to import CSV data into SQL using SQLAlchemy.
The csv has two columns (x, y), but I need to add a third column (delta_y) to the SQL database to store processed data.
Using the following code, it reads the csv into the SQL database but does not create the actual empty column in the database. Is there a smooth way to inherit what is mapped out in the class?
import pandas as pd
from sqlalchemy import Column, Integer, Float, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine, update
engine = create_engine('sqlite:///hausarbeit_db.sqlite3', echo=True)
Base = declarative_base()
class Test(Base):
    __tablename__ = "test"
    id = Column(Integer, primary_key=True)
    x = Column(Float)
    y = Column(Float)
    delta_y = Column(Float)
Base.metadata.create_all(engine)
file_name = 'Beispiel-Datensaetze//test.csv'
df = pd.read_csv(file_name)
df.to_sql('test', con=engine, index_label="id", if_exists='replace')
TEST = Base.metadata.tables['test']
I'm also happy to hear any other hints or tips around the code above.
Thanks!
Can't you add a new empty column in the data frame after reading from the csv?
import numpy as np
df["delta_y"] = np.nan
# or
df["delta_y"] = ""

How to create index for a SQLite3 database using SQLAlchemy?

I have multiple SQLite3 databases for which the models are not available.
def index_db(name, tempdb):
    print(f'{name.ljust(padding)} Indexing file: {tempdb}')
    if tempdb.endswith('primary.sqlite'):
        conn = sqlite3.connect(tempdb)
        conn.execute('CREATE INDEX packageSource ON packages (rpm_sourcerpm)')
        conn.commit()
        conn.close()
How can I perform the same operation using SQLAlchemy?
I can come up with two ways to add that index through SQLAlchemy:
if you do not reflect, execute the SQL statement directly
if you reflect your table/model, add an index to it
Firstly, let's create the table to work on.
import sqlite3
con = sqlite3.connect("/tmp/73526761.db")
con.execute("CREATE TABLE t73526761 (id INT PRIMARY KEY, name VARCHAR)")
con.commit()
con.close()
Then, without reflecting, you can execute your raw SQL with the following.
import sqlalchemy as sa
engine = sa.create_engine("sqlite:////tmp/73526761.db", future=True)
with engine.begin() as con:
    con.execute(sa.text("CREATE INDEX t73526761_name_idx ON t73526761 (name)"))
    # no explicit commit needed: engine.begin() commits on exit
Or if you reflect the table only (SQLAlchemy core):
import sqlalchemy as sa
metadata_obj = sa.MetaData()
engine = sa.create_engine("sqlite:////tmp/73526761.db", future=True)
t73526761 = sa.Table("t73526761", metadata_obj, autoload_with=engine)
t73526761_name_idx = sa.Index("t73526761_name_idx", t73526761.c.name)
t73526761_name_idx.create(bind=engine) # emits CREATE INDEX t73526761_name_idx ON t73526761 (name)
Or if you reflect the model (SQLAlchemy orm):
import sqlalchemy as sa
from sqlalchemy import orm
Base = orm.declarative_base()
engine = sa.create_engine("sqlite:////tmp/73526761.db", future=True)
class K73526761(Base):
    __table__ = sa.Table("t73526761", Base.metadata, autoload_with=engine)
t73526761_name_idx = sa.Index("t73526761_name_idx", K73526761.name)
t73526761_name_idx.create(bind=engine) # emits CREATE INDEX t73526761_name_idx ON t73526761 (name)
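Either way, you can check afterwards that the index exists; a small sketch using the runtime inspection API:
import sqlalchemy as sa
# A sketch: list the indexes SQLAlchemy sees on the table.
insp = sa.inspect(engine)
print(insp.get_indexes("t73526761"))
# e.g. [{'name': 't73526761_name_idx', 'column_names': ['name'], ...}]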

Copy one database to another using SQLAlchemy

I'm trying to copy a database using SQLAlchemy. The first attempt was:
from sqlalchemy import create_engine, MetaData
from sqlalchemy.orm import sessionmaker
from urls import engine_urls
engine1 = create_engine(engine_urls[0])
engine2 = create_engine(engine_urls[1])
metadata = MetaData()
metadata.reflect(engine1)
tables = metadata.tables
metadata.create_all(engine2)
Session1 = sessionmaker(bind=engine1)
from sqlalchemy import insert
with Session1.begin() as session:
    for key in tables:
        table_object = tables[key]
        for row in session.query(table_object):
            s = insert(table_object).\
                values(**dict(zip(row.keys(), row)))
            engine2.execute(s)
But this code does not work since the order in which inserts are done is arbitrary and this violates FK constraints. For example, inserting a child before a parent will cause such a violation. How could I achieve this task? Is there a part of the framework that would do this easily? I can't find it.
Here is what I use. Works well.
from sqlalchemy import create_engine, MetaData, event
from sqlalchemy.sql import sqltypes
# Requires SQLALCHEMY 1.4+
src_engine = create_engine("sqlite:///mydb.sqlite")
src_metadata = MetaData(bind=src_engine)
exclude_tables = ('sqlite_master', 'sqlite_sequence', 'sqlite_temp_master')
tgt_engine = create_engine("postgresql+psycopg2://@localhost/ngas")
tgt_metadata = MetaData(bind=tgt_engine)
@event.listens_for(src_metadata, "column_reflect")
def genericize_datatypes(inspector, tablename, column_dict):
    column_dict["type"] = column_dict["type"].as_generic(allow_nulltype=True)
tgt_conn = tgt_engine.connect()
tgt_metadata.reflect()
# drop all tables in target database
for table in reversed(tgt_metadata.sorted_tables):
    if table.name not in exclude_tables:
        print('dropping table =', table.name)
        table.drop()
# # Delete all data in target database
# for table in reversed(tgt_metadata.sorted_tables):
#     table.delete()
tgt_metadata.clear()
tgt_metadata.reflect()
src_metadata.reflect()
# create all tables in target database
for table in src_metadata.sorted_tables:
    if table.name not in exclude_tables:
        table.create(bind=tgt_engine)
# refresh metadata before you can copy data
tgt_metadata.clear()
tgt_metadata.reflect()
# Copy all data from src to target
for table in tgt_metadata.sorted_tables:
    src_table = src_metadata.tables[table.name]
    stmt = table.insert()
    for index, row in enumerate(src_table.select().execute()):
        print("table =", table.name, "Inserting row", index)
        stmt.execute(row._asdict())
If anyone has difficulties running the proposed routine because stmt.execute(row._asdict()) raises an error in version 1.4, here is an alternative that worked for me:
# Copy all data from src to target
for table in tgt_metadata.sorted_tables:
    src_table = src_metadata.tables[table.name]
    for index, row in enumerate(src_table.select().execute()):
        print("table =", table.name, "Inserting row", index, '>>', dict(row))
        stmt = table.insert().values(row._asdict())
        tgt_conn.execute(stmt)
tgt_conn.commit()
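For larger databases, a hedged variant that avoids one round trip per row (a sketch reusing src_metadata/tgt_metadata from above and assuming 1.4+ future-style engines): passing all rows as a list of dictionaries lets the driver use executemany.
# A sketch: one executemany call per table instead of per-row inserts.
with src_engine.connect() as src_conn, tgt_engine.begin() as tgt_conn:
    for table in tgt_metadata.sorted_tables:
        src_table = src_metadata.tables[table.name]
        rows = [row._asdict() for row in src_conn.execute(src_table.select())]
        if rows:  # skip empty tables
            tgt_conn.execute(table.insert(), rows)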

SQLAlchemy ORM conversion to pandas DataFrame

Is there a solution converting a SQLAlchemy <Query object> to a pandas DataFrame?
Pandas has the capability to use pandas.read_sql but this requires use of raw SQL. I have two reasons for wanting to avoid it:
I already have everything using the ORM (a good reason in and of itself) and
I'm using python lists as part of the query, e.g.:
db.session.query(Item).filter(Item.symbol.in_(add_symbols)), where Item is my model class and add_symbols is a list. This is the equivalent of SQL's SELECT ... FROM ... WHERE ... IN.
Is anything like this possible?
Below should work in most cases:
df = pd.read_sql(query.statement, query.session.bind)
See pandas.read_sql documentation for more information on the parameters.
Just to make this clearer for novice pandas programmers, here is a concrete example:
pd.read_sql(session.query(Complaint).filter(Complaint.id == 2).statement, session.bind)
Here we select a complaint from the complaints table (the SQLAlchemy model is Complaint) with id = 2.
For completeness' sake: as an alternative to the pandas function read_sql_query(), you can also use the pandas DataFrame function from_records() to convert a structured or record ndarray to a DataFrame.
This comes in handy if you have, for example, already executed the query in SQLAlchemy and have the results available:
import pandas as pd
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import scoped_session, sessionmaker
SQLALCHEMY_DATABASE_URI = 'postgresql://postgres:postgres@localhost:5432/my_database'
engine = create_engine(SQLALCHEMY_DATABASE_URI, pool_pre_ping=True, echo=False)
db = scoped_session(sessionmaker(autocommit=False, autoflush=False, bind=engine))
Base = declarative_base(bind=engine)
class Currency(Base):
    """The `Currency`-table"""
    __tablename__ = "currency"
    __table_args__ = {"schema": "data"}
    id = Column(Integer, primary_key=True, nullable=False)
    name = Column(String(64), nullable=False)
# Defining the SQLAlchemy-query
currency_query = db.query(Currency).with_entities(Currency.id, Currency.name)
# Getting all the entries via SQLAlchemy
currencies = currency_query.all()
# We provide also the (alternate) column names and set the index here,
# renaming the column `id` to `currency__id`
df_from_records = pd.DataFrame.from_records(
    currencies, index='currency__id', columns=['currency__id', 'name'])
print(df_from_records.head(5))
# Or getting the entries via Pandas instead of SQLAlchemy using the
# aforementioned function `read_sql_query()`. We can set the index-columns here as well
df_from_query = pd.read_sql_query(currency_query.statement, db.bind, index_col='id')
# Renaming the index-column(s) from `id` to `currency__id` needs another statement
df_from_query.index.rename(name='currency__id', inplace=True)
print(df_from_query.head(5))
The selected solution didn't work for me, as I kept getting the error
AttributeError: 'AnnotatedSelect' object has no attribute 'lower'
I found the following worked:
df = pd.read_sql_query(query.statement, engine)
If you want to compile a query with parameters and dialect specific arguments, use something like this:
c = query.statement.compile(query.session.bind)
df = pandas.read_sql(c.string, query.session.bind, params=c.params)
import pandas as pd
from sqlalchemy import Column, Date, Integer, String, create_engine, select
from sqlalchemy.dialects.postgresql import DOUBLE_PRECISION
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
engine = create_engine('postgresql://postgres:postgres@localhost:5432/DB', echo=False)
Base = declarative_base(bind=engine)
Session = sessionmaker(bind=engine)
session = Session()
conn = session.bind
class DailyTrendsTable(Base):
    __tablename__ = 'trends'
    __table_args__ = ({"schema": 'mf_analysis'})
    company_code = Column(DOUBLE_PRECISION, primary_key=True)  # postgresql dialect type
    rt_bullish_trending = Column(Integer)
    rt_bearish_trending = Column(Integer)
    rt_bullish_non_trending = Column(Integer)
    rt_bearish_non_trending = Column(Integer)
    gen_date = Column(Date, primary_key=True)
df_query = select([DailyTrendsTable])
df_data = pd.read_sql(df_query, con=conn)
Using the SQLAlchemy 2.0 syntax (also available in 1.4 with the flag future=True), pd.read_sql is not implemented yet and raises:
NotImplementedError: This method is not implemented for SQLAlchemy 2.0.
This is an open issue that won't be solved until pandas 2.0; you can find some information about it here and here.
I didn't find any satisfactory workaround, but some people seem to be using two configurations of the engine, one with the flag future=False:
engine2 = create_engine(URL_string, echo=False, future=False)
This solution would be fine if you query with strings, but when using the ORM, the best I could do is a custom function. It is yet to be optimized, but it works:
Conditions = session.query(ExampleTable)
def df_from_sql(query):
    return pd.DataFrame([i.__dict__ for i in query]).drop(columns='_sa_instance_state')
df = df_from_sql(Conditions)
This solution in any case would be provisional till pd.read_sql has implemented the new syntax.
When you're using the ORM it's as simple as this:
pd.DataFrame([r._asdict() for r in query.all()])
Good alternative to pd.read_sql when you don't want to expose sql and sessions to the business logic code.
Found it here: https://stackoverflow.com/a/52208023/1635525
This answer provides a reproducible example using a SQLAlchemy select statement and returning a pandas data frame. It is based on an in-memory SQLite database so that anyone can reproduce it without installing a database engine.
import pandas
from sqlalchemy import create_engine
from sqlalchemy import MetaData, Table, Column, Text
from sqlalchemy.orm import Session
Define table metadata and create a table
engine = create_engine('sqlite://')
meta = MetaData()
meta.bind = engine
user_table = Table('user', meta,
                   Column("name", Text),
                   Column("full_name", Text))
user_table.create()
Insert some data into the user table
stmt = user_table.insert().values(name='Bob', full_name='Sponge Bob')
with Session(engine) as session:
    result = session.execute(stmt)
    session.commit()
Read the result of a select statement into a pandas data frame
# Select data into a pandas data frame
stmt = user_table.select().where(user_table.c.name == 'Bob')
df = pandas.read_sql_query(stmt, engine)
df
Out:
name full_name
0 Bob Sponge Bob
If you use a raw SQL query:
def generate_df_from_sqlquery(query):
    from pandas import DataFrame
    query = db.session.execute(query)
    df = DataFrame(query.fetchall())
    if len(df) > 0:
        df.columns = query.keys()
    else:
        columns = query.keys()
        df = DataFrame(columns=columns)
    return df
profile_df = generate_df_from_sqlquery(profile_query)
Simple example using the CursorResult.keys() method to get the column names.
import sqlalchemy as sa
import pandas as pd
engine = sa.create_engine(...)
with engine.connect() as conn:
    result = conn.execute(sa.text("SELECT * FROM foo;"))
    df = pd.DataFrame(result.all(), columns=result.keys())
https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Result.keys
Adding to the answers that use read_sql, like @van's: when my query involved a join, SQLAlchemy seemed to implicitly add aliased columns from the joined tables, like id_1 and id_2, in case the joined tables and the primary table both had an id column, for example. Using .all() removes these implicit columns before returning results, but read_sql will include them.
The solution for that case, for me, was to be explicit in my selects. So I replaced
query = session.query(model)
with
query = session.query(model.col_1, model.col_2)
or for select all
query = session.query(*model.__table__.columns.values())
then
df = pd.read_sql(query.statement, query.session.bind)

creating a temporary table from a query using sqlalchemy orm

I can create a temporary table this way:
session.execute("CREATE TABLE temptable SELECT existingtable.id, "
                "existingtable.column2 FROM existingtable WHERE existingtable.id<100000")
but the new table is unreadable because it says it has no primary key. existingtable.id is the primary key of existingtable, so I expected it to get the same treatment in the temp table.
However, I would rather find some ORM way of doing this anyway. Given:
temp_table = Table('temptable', metadata,
                   Column('id', Integer, primary_key=True),
                   Column('column2', Integer),
                   useexisting=True)
class TempTable(object):
    pass
mapper(TempTable, temp_table)
temp_table.create(bind=session.bind, checkfirst=True)
if session.query(TempTable).delete():  # make sure it's empty
    session.commit()
How can I populate temp_table with some selected contents of existingtable without doing 100000 session.query.add(TempTable(...)) commands? Or is there a way of creating the table from a query similar to the plain SQL version above?
It's not exactly ORM, but to create the table initially, I'd clone the table structure (see cloneTable in the example below). For copying the data, I then would use the InsertFromSelect example.
Edit: Since version 0.8.3, SQLAlchemy supports Insert.from_select() out of the box. Hence the InsertFromSelect class and the respective visitor in the example below can be replaced directly and are no longer needed. I leave the original example unchanged for historic reasons; a sketch of the modern equivalent follows the example.
Here is a working example
from sqlalchemy import Table
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.expression import UpdateBase
class InsertFromSelect(UpdateBase):
    def __init__(self, table, select):
        self.table = table
        self.select = select

@compiles(InsertFromSelect)
def visit_insert_from_select(element, compiler, **kw):
    return "INSERT INTO %s %s" % (
        compiler.process(element.table, asfrom=True),
        compiler.process(element.select)
    )

def cloneTable(name, table, metadata):
    cols = [c.copy() for c in table.columns]
    constraints = [c.copy() for c in table.constraints]
    return Table(name, metadata, *(cols + constraints))
# test data
from sqlalchemy import MetaData, Column, Integer
from sqlalchemy.engine import create_engine
e = create_engine('sqlite://')
m = MetaData(e)
t = Table('t', m, Column('id', Integer, primary_key=True),
          Column('number', Integer))
t.create()
e.execute(t.insert().values(id=1, number=3))
e.execute(t.insert().values(id=9, number=-3))
# create temp table
temp = cloneTable('temp', t, m)
temp.create()
# copy data
ins = InsertFromSelect(temp, t.select().where(t.c.id>5))
e.execute(ins)
# print result
for r in e.execute(temp.select()):
    print(r)
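And, as mentioned in the edit above, a minimal sketch of the same copy using the built-in Insert.from_select() (SQLAlchemy 0.8.3+), reusing t, temp, and e from the example:
# A sketch: the built-in equivalent of the custom InsertFromSelect.
ins = temp.insert().from_select(['id', 'number'],
                                t.select().where(t.c.id > 5))
e.execute(ins)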
