Dynamic table creation with SQLAlchemy ORM - Python

I am new to the SQLAlchemy ORM. I am building an AWS S3 ingestion program that should ingest any CSV file from an S3 bucket into Postgres through the ORM. I read the first row of the CSV file and store the result in a list (columns_names). The code gives an error:
could not assemble any primary key columns for mapped table.
The table is only created in the database after declaring a PRIMARY KEY column. Is a primary key mandatory for creating a table via the ORM? Also, how do I dynamically create columns from the list columns_names?
Here is my code:
import boto
import boto3
import botocore
import os
from datetime import datetime
import s3fs
import pandas as pd
import configparser
import re
from sqlalchemy import create_engine
from sqlalchemy import MetaData, Table, Column, Integer, String
from sqlalchemy.orm.session import sessionmaker
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
config = configparser.ConfigParser(allow_no_value=True)
config.read('IngestionConfig.config')
table_name = config.get('db-settings','table_name')
S3Bucket = config.get('AWS-settings','BucketName')
S3Key = config.get('AWS-settings','filename')
s3_client = boto3.client('s3')
response = s3_client.get_object(Bucket = S3Bucket, Key= S3Key)
file = response["Body"]
filedata = file.read()
contents = filedata.decode('utf-8')
first_line = contents.split('\n',1)[0]
col_names = re.sub(r"\s+", '_', first_line).replace('"', r'')
columns_names= []
columns_names = col_names.split(',')
postgresql_db = create_engine('postgresql://ayan.putatunda@localhost/postgres', echo=True)
Base = declarative_base()
class test(Base):
    __tablename__ = table_name
    for name in columns_names:
        name = Column(String)
Base.metadata.create_all(postgresql_db)

The SQLAlchemy ORM does require a primary key, because its design needs a way to identify the row corresponding to each object, so it is not possible to map a table without a primary key in the ORM.
You can dynamically create tables by first creating a dictionary with your table information:
col_lst = ['col_1', 'col_2', 'col_3']
attr_dict = {'__tablename__': 'myTableName'}
for col in col_lst:
    attr_dict[col] = Column(Integer)
Next, using the built-in type function, create the table class from SQLAlchemy's declarative_base:
Base = declarative_base()
MyTableClass = type('MyTableClass', (Base,), attr_dict)
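Putting both pieces together, here is a minimal sketch (using an in-memory SQLite engine and hypothetical column names standing in for the CSV header) that adds a surrogate primary key, builds the remaining columns dynamically, and creates the table:
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

columns_names = ['first_name', 'last_name', 'city']  # hypothetical CSV header
table_name = 'my_dynamic_table'                      # hypothetical table name

engine = create_engine('sqlite://', echo=True)
Base = declarative_base()

# Surrogate primary key so the ORM can identify each row
attr_dict = {
    '__tablename__': table_name,
    'id': Column(Integer, primary_key=True, autoincrement=True),
}
for name in columns_names:
    attr_dict[name] = Column(String)

DynamicTable = type('DynamicTable', (Base,), attr_dict)
Base.metadata.create_all(engine)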

Related

Creating additional column while importing csv via df.to_sql in sqlalchemy framework

I have to import CSV data into SQL using SQLAlchemy.
The CSV has two columns (x, y), but I need to add a third column (delta_y) in the SQL database to store processed data.
With the following code the CSV is read into the SQL database, but the actual empty column is not created in the database. Is there a smooth way to inherit what is mapped out in the class?
import pandas as pd
from sqlalchemy import Column, Integer, Float, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine, update

engine = create_engine('sqlite:///hausarbeit_db.sqlite3', echo=True)
Base = declarative_base()

class Test(Base):
    __tablename__ = "test"
    id = Column(Integer, primary_key=True)
    x = Column(Float)
    y = Column(Float)
    delta_y = Column(Float)

Base.metadata.create_all(engine)

file_name = 'Beispiel-Datensaetze//test.csv'
df = pd.read_csv(file_name)
df.to_sql('test', con=engine, index_label="id", if_exists='replace')
TEST = Base.metadata.tables['test']
I'm also happy to hear any other hints or tips around the code above.
Thanks!
Can't you add a new empty column to the DataFrame after reading it from the CSV?
df["delta_y"] = np.nan
# or
df["delta_y"] = ""

Search and Update datetime column with given time in mssql+pyodbc and sqlalchemy

I have been using MSSQL with pyodbc and updating a DateTime column by following the thread below:
How to update datetime field in MSSQL using python pyodbc module
Now I am also trying to incorporate SQLAlchemy into my application stack. I am aware of the following answer, but it does not serve my purpose.
Datetime not updating on insert using SQLAlchemy on MSSQL
To elaborate on the problem:
from sqlalchemy import Column, String, DateTime, Date
from sqlalchemy.ext.declarative import declarative_base
import sqlalchemy
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from datetime import datetime as datetimemodule, date as datemodule
import traceback

BASE = declarative_base()

class REPR:
    def make_repr(self, **kwrgs):
        argument_string = ', '.join([f'{a}={b}' for a, b in kwrgs.items()])
        display_string = f'{self.__class__.__name__}({argument_string})'
        return '*****' + display_string + '*****'

    def __str__(self):
        return self.__repr__()

class Recog_EntryLog(BASE, REPR):
    # In the database the corresponding table has no primary key.
    # As suggested in the documentation, the largest candidate key is marked
    # as the primary key here.
    __tablename__ = 'test'
    id = Column(String(100), primary_key=True)
    entrytime = Column(DateTime, primary_key=True)  # I want to update this field with a custom time,
                                                    # not with the time the row is updated.
    entrydate = Column(Date, primary_key=True)

    def __repr__(self):
        return self.make_repr(id=self.id, entrytime=self.entrytime, entrydate=self.entrydate)

def get_connection(dbname):
    connection_info = f'mssql+pyodbc://remote:prashant@127.0.0.1:1433/{dbname}?driver=ODBC+Driver+17+for+SQL+Server'
    engine = create_engine(connection_info, echo=True)
    session = sessionmaker(bind=engine)()
    return session, engine

def close_connection(connection_info: sqlalchemy.orm.session.Session,
                     engine: sqlalchemy.engine.base.Engine):
    connection_info.close()
    engine.dispose()

def search_and_update(connected_session: sqlalchemy.orm.session.Session,
                      id,
                      _time: datetimemodule,
                      _date: datemodule):
    res = connected_session.query(Recog_EntryLog).filter_by(id=id, entrydate=_date).order_by(Recog_EntryLog.entrytime.desc()).first()
    print(res)
    try:
        if res is None:
            connected_session.add(Recog_EntryLog(id=id, entrytime=_time, entrydate=_date))
        else:
            query = connected_session.query(Recog_EntryLog).filter_by(id=id, entrytime=res.entrytime, entrydate=_date)
            query.update({Recog_EntryLog.entrytime: _time})
        connected_session.commit()
    except:
        traceback.print_exc()
        connected_session.rollback()

if __name__ == '__main__':
    connected_session, connected_engine = get_connection('authentication')
    # Will create a new entry on the first run / will update the old entry afterwards
    search_and_update(connected_session, '1234', datetimemodule.now(), datemodule.today())
    # should update datetime
    search_and_update(connected_session, '1234', datetimemodule.now(), datemodule.today())
    # should update datetime
    search_and_update(connected_session, '1234', datetimemodule.now(), datemodule.today())
    close_connection(connected_session, connected_engine)
I was expecting that the last two (red) values would be updated with the first two (blue) values. The generated SQL does not seem to follow the SQL from the first linked thread.
Can someone help me write such a function properly using SQLAlchemy?

Create sqlite database from Access

I would like to create an SQLite database from an Access database backend.
Because of a 64-bit/32-bit mismatch, pyodbc doesn't work, so I exported some Excel files.
A fast solution to start with would be:
import os
import pandas as pd
from sqlalchemy import create_engine

# load all files in folder
folder = "...some start folder"
files = {file.split('.')[0]: os.path.join(folder, file)
         for file in os.listdir(folder) if file.endswith('.xlsx')}
list_dfs = {name: pd.read_excel(file) for name, file in files.items()}

# initialize a sqlite database
engine = create_engine('sqlite:///sql.db', echo=False)

# write the tables to sql
for key, frame in list_dfs.items():
    frame.to_sql(key, con=engine, if_exists='append', index=False, index_label='ID')
I could add some dtypes in frame.to_sql within a dict.
I am struggling to build the relations between the tables.
SQLAlchemy seems to be a good solution, but is it possible to restructure an existing database?
regards
inco
@PowerStat, thanks for the correction.
My current solution seems to work so far:
import os
import pandas as pd
from sqlalchemy import (create_engine, MetaData, Table, Column, Integer,
                        String, ForeignKey, DateTime, Float)

#%% load all excel filenames into two dictionaries main/id
folder = "some_path"
ID_files = {file.split('.')[0]: os.path.join(folder, file)
            for file in os.listdir(folder) if "ID_" in str(file) and file.endswith('.xlsx')}
main_files = {file.split('.')[0]: os.path.join(folder, file)
              for file in os.listdir(folder) if not "ID_" in str(file) and file.endswith('.xlsx')}

# create a dict of dataframes
ID_dfs = {name: pd.read_excel(file) for name, file in ID_files.items()}
main_dfs = {name: pd.read_excel(file) for name, file in main_files.items()}

#%% initialize meta data for all tables
engine = create_engine('sqlite:///sql.db', echo=False)
meta = MetaData()
for key, frame in ID_dfs.items():
    table = Table(key, meta,
                  Column('ID', Integer, primary_key=True),
                  Column('Title', String, unique=True))
table = Table('table_of_things', meta,
              Column('ID', Integer, primary_key=True),
              Column('Book_ID', Integer, ForeignKey('ID_Book.ID')),
              Column('Article_ID', Integer, ForeignKey('ID_Article.ID')))
meta.create_all(engine)

# add the DataFrames to the sql database
for key, frame in ID_dfs.items():
    frame.to_sql(key, engine, if_exists='append', index=False, index_label='ID')
main_dfs['table_of_things'].to_sql('table_of_things', engine,
                                   if_exists='append', index=False, index_label='ID')
With a second script, I load the database and make a query:
from sqlalchemy.ext.automap import automap_base, generate_relationship
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

def _gen_relationship(base, direction, return_fn,
                      attrname, local_cls, referred_cls, **kw):
    return generate_relationship(base, direction, return_fn, attrname, local_cls, referred_cls, **kw)

Base = automap_base()
engine = create_engine("sqlite:///sql.db")

# reflect the tables
Base.prepare(engine, reflect=True, generate_relationship=_gen_relationship)

# table to class variable
tob = Base.classes['table_of_things']
session = Session(engine)
for inst in session.query(tob).order_by(tob.ID):
    print(inst.Book_ID, inst.Article_ID)
The output gives me the ID values but not the Title values. How can I use the one-to-many relationship correctly?
Answer:
The relationships are defined adequately!
The instance object (inst) already contains id_book and id_article:
for inst in session.query(tob).order_by(tob.ID):
    print(inst.id_book.Title, inst.id_article.Title)

SQLAlchemy - How to select certain rows?

How do I simulate a SELECT in SQLAlchemy? I would like to create a function which takes a couple of parameters and returns the rows containing those values, but I can't get the SELECT to work.
The only way I found is shown below, but I can't find metadata in the SQLAlchemy module.
EDIT: I figured out that BoundMetaData is deprecated and MetaData is appropriate, but now it says that Select has no len().
# -*- coding: utf-8 -*-
import sqlalchemy
from sqlalchemy import Column, Table
from sqlalchemy import UniqueConstraint
from sqlalchemy import create_engine
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.declarative import declarative_base

engine = create_engine('sqlite:///db.db', echo=False)
Base = declarative_base()
s = sqlalchemy.orm.Session(engine)

class Flight(Base):
    __tablename__ = 'flights'
    id = Column(sqlalchemy.Integer, primary_key=True)
    destination_from = Column(sqlalchemy.String)
    destination_to = Column(sqlalchemy.String)
    creation_date = Column(sqlalchemy.Date)
    start_date = Column(sqlalchemy.Date)
    return_date = Column(sqlalchemy.Date)
    price = Column(sqlalchemy.Float)
    filename = Column(sqlalchemy.String)
    bought_days_before = Column(sqlalchemy.Integer)
    __table_args__ = (
        UniqueConstraint('creation_date', 'destination_from', 'destination_to', 'start_date', 'return_date', 'price'),
    )

Base.metadata.create_all(engine)

def insert_into_flights(**kwargs):
    s.add(Flight(**kwargs))
    try:
        s.commit()
    except IntegrityError as e:
        s.rollback()

def get_prices(date_from, days, bought_days_before, destination, min=True, avg=False):
    flights = Table('flights', metadata, autoload=True)
    print(len(flights.select()))
s.query(Flight).filter(Flight.id==34).all()
This is an example of selecting the Flight with id 34.
See SQLAlchemy docs
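Building on that, a minimal sketch (reusing the Flight model and session s from the question, with hypothetical airport codes) of a function that takes a couple of parameters and returns the matching rows:
def find_flights(destination_from, destination_to):
    # Return all Flight rows matching both values
    return (s.query(Flight)
             .filter(Flight.destination_from == destination_from,
                     Flight.destination_to == destination_to)
             .all())

for flight in find_flights('VIE', 'LHR'):
    print(flight.id, flight.price)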

SQLAlchemy ORM conversion to pandas DataFrame

Is there a solution for converting a SQLAlchemy <Query object> to a pandas DataFrame?
Pandas has the capability to use pandas.read_sql, but this requires use of raw SQL. I have two reasons for wanting to avoid it:
I already have everything using the ORM (a good reason in and of itself), and
I'm using Python lists as part of the query, e.g.:
db.session.query(Item).filter(Item.symbol.in_(add_symbols)), where Item is my model class and add_symbols is a list. This is the equivalent of SQL's SELECT ... FROM ... WHERE ... IN.
Is anything like this possible?
Below should work in most cases:
df = pd.read_sql(query.statement, query.session.bind)
See pandas.read_sql documentation for more information on the parameters.
Just to make this more clear for novice pandas programmers, here is a concrete example:
pd.read_sql(session.query(Complaint).filter(Complaint.id == 2).statement, session.bind)
Here we select a complaint from the complaints table (the SQLAlchemy model is Complaint) with id = 2.
For completeness' sake: as an alternative to the pandas function read_sql_query(), you can also use the pandas DataFrame function from_records() to convert a structured or record ndarray to a DataFrame.
This comes in handy if you have, for example, already executed the query in SQLAlchemy and have the results available:
import pandas as pd
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import scoped_session, sessionmaker

SQLALCHEMY_DATABASE_URI = 'postgresql://postgres:postgres@localhost:5432/my_database'
engine = create_engine(SQLALCHEMY_DATABASE_URI, pool_pre_ping=True, echo=False)
db = scoped_session(sessionmaker(autocommit=False, autoflush=False, bind=engine))
Base = declarative_base(bind=engine)

class Currency(Base):
    """The `Currency`-table"""
    __tablename__ = "currency"
    __table_args__ = {"schema": "data"}

    id = Column(Integer, primary_key=True, nullable=False)
    name = Column(String(64), nullable=False)

# Defining the SQLAlchemy query
currency_query = db.query(Currency).with_entities(Currency.id, Currency.name)

# Getting all the entries via SQLAlchemy
currencies = currency_query.all()

# We also provide the (alternate) column names and set the index here,
# renaming the column `id` to `currency__id`
df_from_records = pd.DataFrame.from_records(currencies,
                                            index='currency__id',
                                            columns=['currency__id', 'name'])
print(df_from_records.head(5))

# Or getting the entries via pandas instead of SQLAlchemy using the
# aforementioned function `read_sql_query()`. We can set the index column(s) here as well
df_from_query = pd.read_sql_query(currency_query.statement, db.bind, index_col='id')
# Renaming the index column(s) from `id` to `currency__id` needs another statement
df_from_query.index.rename(name='currency__id', inplace=True)
print(df_from_query.head(5))
The selected solution didn't work for me, as I kept getting the error
AttributeError: 'AnnotatedSelect' object has no attribute 'lower'
I found the following worked:
df = pd.read_sql_query(query.statement, engine)
If you want to compile a query with parameters and dialect-specific arguments, use something like this:
c = query.statement.compile(query.session.bind)
df = pandas.read_sql(c.string, query.session.bind, params=c.params)
import pandas as pd
from sqlalchemy import Column, Integer, Date, create_engine, select
from sqlalchemy.dialects.postgresql import DOUBLE_PRECISION
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

engine = create_engine('postgresql://postgres:postgres@localhost:5432/DB', echo=False)
Base = declarative_base(bind=engine)
Session = sessionmaker(bind=engine)
session = Session()
conn = session.bind

class DailyTrendsTable(Base):
    __tablename__ = 'trends'
    __table_args__ = ({"schema": 'mf_analysis'})

    company_code = Column(DOUBLE_PRECISION, primary_key=True)
    rt_bullish_trending = Column(Integer)
    rt_bearish_trending = Column(Integer)
    rt_bullish_non_trending = Column(Integer)
    rt_bearish_non_trending = Column(Integer)
    gen_date = Column(Date, primary_key=True)

df_query = select([DailyTrendsTable])
df_data = pd.read_sql(df_query, con=conn)
Using the 2.0 SQLAlchemy syntax (also available in 1.4 with the flag future=True), it looks like pd.read_sql is not implemented yet and will raise:
NotImplementedError: This method is not implemented for SQLAlchemy 2.0.
This is an open issue that won't be solved until pandas 2.0; you can find some information about it here and here.
I didn't find any satisfactory workaround, but some people seem to be using two configurations of the engine, one with the flag future=False:
engine2 = create_engine(URL_string, echo=False, future=False)
This solution would be fine if you query with strings, but when using the ORM, the best I could do is a custom function, yet to be optimized, but it works:
Conditions = session.query(ExampleTable)

def df_from_sql(query):
    return pd.DataFrame([i.__dict__ for i in query]).drop(columns='_sa_instance_state')

df = df_from_sql(Conditions)
This solution would in any case be provisional until pd.read_sql implements the new syntax.
When you're using the ORM it's as simple as this:
pd.DataFrame([r._asdict() for r in query.all()])
A good alternative to pd.read_sql when you don't want to expose SQL and sessions to the business logic code.
Found it here: https://stackoverflow.com/a/52208023/1635525
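Note that ._asdict() is available on the Row objects returned when you query specific columns rather than whole entities; a minimal sketch, assuming a hypothetical Item model with id and name columns and an existing session:
import pandas as pd

# Querying columns (not the whole entity) yields Row objects that support ._asdict()
query = session.query(Item.id, Item.name)
df = pd.DataFrame([r._asdict() for r in query.all()])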
This answer provides a reproducible example using a SQLAlchemy select statement and returning a pandas DataFrame. It is based on an in-memory SQLite database so that anyone can reproduce it without installing a database engine.
import pandas
from sqlalchemy import create_engine
from sqlalchemy import MetaData, Table, Column, Text
from sqlalchemy.orm import Session
Define table metadata and create a table
engine = create_engine('sqlite://')
meta = MetaData()
meta.bind = engine
user_table = Table('user', meta,
Column("name", Text),
Column("full_name", Text))
user_table.create()
Insert some data into the user table
stmt = user_table.insert().values(name='Bob', full_name='Sponge Bob')
with Session(engine) as session:
    result = session.execute(stmt)
    session.commit()
Read the result of a select statement into a pandas data frame
# Select data into a pandas data frame
stmt = user_table.select().where(user_table.c.name == 'Bob')
df = pandas.read_sql_query(stmt, engine)
df
Out:
  name   full_name
0  Bob  Sponge Bob
If you use a raw SQL query:
def generate_df_from_sqlquery(query):
    from pandas import DataFrame
    query = db.session.execute(query)
    df = DataFrame(query.fetchall())
    if len(df) > 0:
        df.columns = query.keys()
    else:
        columns = query.keys()
        df = DataFrame(columns=columns)
    return df

profile_df = generate_df_from_sqlquery(profile_query)
A simple example using the CursorResult.keys() method to get the column names:
import sqlalchemy as sa
import pandas as pd

engine = sa.create_engine(...)

with engine.connect() as conn:
    result = conn.execute(sa.text("SELECT * FROM foo;"))
    df = pd.DataFrame(result.all(), columns=result.keys())
https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Result.keys
Adding to the answers using read_sql, like @van's: when my query involved a join, SQLAlchemy seemed to implicitly add aliased columns from the joined tables, such as id_1 or id_2, in case the joined tables and the primary table both had an id column, for example. Using .all() removes these implicit columns before returning results, but read_sql will include them.
The solution for that case, for me, was to be explicit in my selects. So I replaced
query = session.query(model)
with
query = session.query(model.col_1, model.col_2)
or for select all
query = session.query(*model.__table__.columns.values())
then
df = pd.read_sql(query.statement, query.session.bind)
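As a hedged sketch of that fix (with hypothetical Parent and Child models that both have an id column), selecting the columns explicitly keeps read_sql from adding an aliased id_1 column for the joined table:
import pandas as pd

query = (session.query(Parent.id, Parent.name, Child.value)
                .join(Child, Child.parent_id == Parent.id))
df = pd.read_sql(query.statement, query.session.bind)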
