SqlAlchemy / Python Datatypes - python

I am struggling to get my python and sql datatype speaking to each other. I'm not sure what I am missing here.
My Code:
import pandas as pd
import sqlite3
from sqlalchemy import create_engine
from sqlalchemy import MetaData
from sqlalchemy import Table
from sqlalchemy import Column
from sqlalchemy import Integer, String, Float, DATE, DECIMAL
def db_conn():
global conn
global engine
db_uri = "sqlite:///db.sqlite"
engine = create_engine(db_uri)
conn = engine.connect()
meta_db = MetaData(engine)
print("Connected to Database at: " + str(datetime.today()))
return conn
db_conn()
# create table
meta = MetaData(engine)
table = Table('MarketData2', meta,
Column('DT', Integer, primary_key=True),
Column('Currency', String),
Column('Signal', String),
Column('Value', Float))
meta.create_all()
def update_MarketData_db(dt, currency, sig, val):
# insert multiple data
conn.execute(table.insert(),[
{'DT': str(dt),
'Currency': str(currency),
'Signal': str(sig),
'Value': float(val)}])
update_MarketData_db('2014-05-04', 'test', 'test_blac', 420.0)
As you can see, I explicitly declare each datatype in the function. Also I have tried the Float, FLOAT, Decimal and DECIMAL classes in sqlalchemy and that hasnt worked either.
The traceback:
IntegrityError: (sqlite3.IntegrityError) datatype mismatch [SQL: 'INSERT INTO "MarketData2" ("DT", "Currency", "Signal", "Value") VALUES (?, ?, ?, ?)'] [parameters: ('2014-05-04', 'test', 'test_blac', 420.0)] (Background on this error at: http://sqlalche.me/e/gkpj)

Related

How to create index for a SQLite3 database using SQLAlchemy?

I have multiple SQLite3 databases for which the models are not available.
def index_db(name, tempdb):
print(f'{name.ljust(padding)} Indexing file: {tempdb}')
if tempdb.endswith('primary.sqlite'):
conn = sqlite3.connect(tempdb)
conn.execute('CREATE INDEX packageSource ON packages (rpm_sourcerpm)')
conn.commit()
conn.close()
How can I perform the same operation using SQLAlchemy?
I can come up with two ways to add that index through SQLAlchemy:
if you do not reflect, execute the SQL statement directly
if you reflect you table/model, add an index to it
Firstly, let's create the table to work on.
import sqlite3
con = sqlite3.connect("/tmp/73526761.db")
con.execute("CREATE TABLE t73526761 (id INT PRIMARY KEY, name VARCHAR)")
con.commit()
con.close()
Then, without reflecting, you can execute your raw SQL with the following.
import sqlalchemy as sa
engine = sa.create_engine("sqlite:////tmp/73526761.db", future=True)
with engine.begin() as con:
con.execute(sa.text("CREATE INDEX t73526761_name_idx ON t73526761 (name)"))
con.commit()
Or if you reflect the table only (SQLAlchemy core):
import sqlalchemy as sa
metadata_obj = sa.MetaData()
engine = sa.create_engine("sqlite:////tmp/73526761.db", future=True)
t73526761 = sa.Table("t73526761", metadata_obj, autoload_with=engine)
t73526761_name_idx = sa.Index("t73526761_name_idx", t73526761.c.name)
t73526761_name_idx.create(bind=engine) # emits CREATE INDEX t73526761_name_idx ON t73526761 (name)
Or if you reflect the model (SQLAlchemy orm):
import sqlalchemy as sa
from sqlalchemy import orm
Base = orm.declarative_base()
engine = sa.create_engine("sqlite:////tmp/73526761.db", future=True)
class K73526761(Base):
__table__ = sa.Table("t73526761", Base.metadata, autoload_with=engine)
t73526761_name_idx = sa.Index("t73526761_name_idx", K73526761.name)
t73526761_name_idx.create(bind=engine) # emits CREATE INDEX t73526761_name_idx ON t73526761 (name)

Invalid Parameter Type (numpy.int64) using pandas df.to_sql with sqlalchemy and pyodbc with mssql

I have a script where I need to append the contents of a dataframe to a SQL database table I created. I need to do this many times to several tables with several dataframes as sources.
I am using Pandas with a sqlalchemy engine on a pyodbc connection to an MSSQL database
To ensure that I am only appending the data from the dataframe which has a corresponding column in the database, I have an "append data to sql" function:
def append_data_to_sql(db_connection, new_rows: pd.DataFrame, table_name: str) -> bool:
# Get column names for the destination table
query = 'SELECT column_name, data_type ' \
'FROM information_schema.columns ' \
'WHERE table_name=?'
result = db_connection.execute(query, table_name).fetchall()
columns_in_sql = pd.DataFrame(data=result, columns=['COLUMN_NAME', 'DATA_TYPE'])
new_table = pd.DataFrame(columns=list(columns_in_sql['COLUMN_NAME']))
new_rows.columns = new_rows.columns.str.lower()
new_table.columns = new_table.columns.str.lower()
# Only keep the columns that are in destination and if there is no
# column in the data to be appended then create an empty column
for column in new_table.columns:
if column in new_rows.columns:
new_table[column] = new_rows[column]
else:
new_table[column] = pd.NA
try:
new_table.to_sql(table_name, db_connection, if_exists='append', index=False)
except sqlalchemy.exc.DBAPIError as e:
logging.exception(f'Error while appending to {table_name}: {e}', exc_info=True)
return True
return False
The context data I'm passing to my function is:
new_rows = pd.DataFrame.from_records({
'system': 'the_system_name',
'data_update_time': Timestamp('2022-03-02 10:00:48.958701'),
'first_available_data_point': None,
'last_available_data_point': None,
'line_name': 'the_line_name',
'server': 'the_server_name',
'day_start_hours': 0.0,
'bu': 'the_bu_name',
'number_of_constraints': 3
})
columns_in_sql = pd.DataFrame.(data= [
('system', 'varchar'),
('data_update_time', 'datetime'),
('first_available_data_point', 'datetime'),
('last_available_data_point', 'datetime'),
('line_name', 'varchar'),
('server', 'varchar'),
('day_start_hours', 'numeric'),
('bu', 'varchar'),
('number_of_constraints', 'int')
], columns=['COLUMN_NAME', 'DATA_TYPE'])
The error that I am getting is:
sqlalchemy.exc.ProgrammingError: (pyodbc.ProgrammingError) ('Invalid parameter type. param-index=8 param-type=numpy.int64', 'HY105')
[SQL: INSERT INTO my_table (system, data_update_time, first_available_data_point, last_available_data_point, line_name, server, day_start_hours, bu, number_of_constraints) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)]
[parameters: ('the_system_name', Timestamp('2022-03-02 10:00:48.958701'), None, None, 'the_line_name', 'the_server_name', 0.0, 'the_bu_name', 3)]
(Background on this error at: https://sqlalche.me/e/14/f405)
The issue is clearly that the '3' on the end is the wrong kind of integer for the SQL database, and I found a similar issue which addresses this exact issue, but with a direct executemany() call through pyodbc.
The problem is that I'm trying to use SqlAlchemy through Pandas, so I'm not actually creating the insert statement myself.
I've tried specifying the dtype of each column by adding:
from sqlalchemy import types
sql_dtypes = {'varchar': types.String(),
'int': types.SmallInteger(),
'datetime': types.DateTime(),
'date': types.Date(),
'nvarchar': types.String(),
'numeric': types.Numeric(),
'float': types.Float(),
'real': types.Float(),
'bool': types.Boolean(),
}
for index, row in columns_in_sql.iterrows():
new_dtypes[row['COLUMN_NAME']] = sql_dtypes[row['DATA_TYPE']]
and adding the dtype arg to to_sql:
new_table.to_sql(table_name, db_connection, if_exists='append', index=False, dtype=new_dtypes)
I then tried all the different Integer Types on the SqlAlchemy docs page, Integer(), BigInteger(), SmallInteger(), with the same error.
I'm hoping I can find a solution for this here before I re-write the function to do all the things pandas and sqlalchemy should (I think) be taking care of already.

pandas DataFrame upsert to SQLite

All I want is a simple Upsert from the DataFrame to SQLite. However, since pd.to_sql() does not have Upsert, I had to implement it with SQLAlchemy instead.
SQLite:
CREATE TABLE test (col1 INTEGER, col2 text, col3 REAL, PRIMARY KEY(col1, col2));
python:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import Table
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.ext.automap import automap_base
def test_upsert():
df = pd.DataFrame({'col1':1, 'col2':'a', 'col3':1.5}, index=[0])
sql_url = 'sqlite:///testDB.db'
table = 'test'
engine = create_engine(sql_url)
with engine.connect() as conn:
base = automap_base()
base.prepare(engine, reflect=True)
target_table = Table(table, base.metadata, autoload=True, autoload_with=engine)
stmt = insert(target_table).values(df.to_dict(orient='records'))
update_dict = {c.name: c for c in stmt.excluded if not c.primary_key}
conn.execute(stmt.on_conflict_do_update(constraint=f'{table}_pkey', set_=update_dict))
The script above works with Postgres previously but it keeps giving me the error when used with SQLite.
sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near "ON": syntax error
[SQL: INSERT INTO test (col1, col2, col3) VALUES (?, ?, ?) ON CONFLICT (test_pkey) DO UPDATE SET col3 = excluded.col3]
[parameters: (1, 'a', 1.5)]
(Background on this error at: http://sqlalche.me/e/14/e3q8)
I'm not sure what I did wrong, or if there's any better solution since it seems like a very common operation.
Any help is appreciated.

How can I Insert the value of CURRENT TIMESTAMP using SQLAlchemy's connection.execute

I want to insert multiple rows using connection.execute, and one of the columns must be set to the result of the database's CURRENT_TIMESTAMP function.
For example, given this table:
import sqlalchemy as sa
metadata = sa.MetaData()
foo = sa.Table('foo', metadata,
sa.Column('id', sa.Integer, primary_key=True),
sa.Column('ts', sa.TIMESTAMP))
# I'm using Sqlite for this example, but this question
# is database-agnostic.
engine = sa.create_engine('sqlite://', echo=True)
metadata.create_all(engine)
I can insert a single row like this:
conn = engine.connect()
with conn.begin():
ins = foo.insert().values(ts=sa.func.current_timestamp())
conn.execute(ins)
However when I try to insert multiple rows:
with conn.begin():
ins = foo.insert()
conn.execute(ins, [{'ts': sa.func.current_timestamp()}])
a TypeError is raised:
sqlalchemy.exc.StatementError: (builtins.TypeError) SQLite DateTime type only accepts Python datetime and date objects as input.
[SQL: INSERT INTO foo (ts) VALUES (?)]
[parameters: [{'ts': <sqlalchemy.sql.functions.current_timestamp at 0x7f3607e21070; current_timestamp>}]
Replacing the function with the string "CURRENT_TIMESTAMP" results in a similar error.
Is there a way to get the database to set the column to CURRENT_TIMESTAMP using connection.execute?
I'm aware that I can work around this by querying for the value of CURRENT_TIMESTAMP within the same transaction and using that value in the INSERT values, or executing and UPDATE after the INSERT. I'm specifically asking whether this can be done in connection.execute's *multiparams argument.
It's a hack for sure, but this appears to work for SQLite at least:
from datetime import datetime
from pprint import pprint
import sqlalchemy as sa
engine = sa.create_engine("sqlite:///:memory:")
metadata = sa.MetaData()
foo = sa.Table(
"foo",
metadata,
sa.Column("id", sa.Integer, primary_key=True, autoincrement=True),
sa.Column("ts", sa.TIMESTAMP),
sa.Column("txt", sa.String(50)),
)
foo.create(engine)
with engine.begin() as conn:
ins_query = str(foo.insert().compile()).replace(
" :ts, ", " CURRENT_TIMESTAMP, "
)
print(ins_query)
# INSERT INTO foo (id, ts, txt) VALUES (:id, CURRENT_TIMESTAMP, :txt)
data = [{"id": None, "txt": "Alfa"}, {"id": None, "txt": "Bravo"}]
conn.execute(sa.text(ins_query), data)
print(datetime.now())
# 2021-03-06 17:41:35.743452
# (local time here is UTC-07:00)
results = conn.execute(sa.text("SELECT * FROM foo")).fetchall()
pprint(results, width=60)
"""
[(1, '2021-03-07 00:41:35', 'Alfa'),
(2, '2021-03-07 00:41:35', 'Bravo')]
"""

SQLAlchemy: How to cast string value to date in multiple insert statement?

I have a list of dict objects
data = [
{'id': 1, 'dt':'2002-01-02' },
{'id': 2, 'dt':'2014-01-15' },
{'id': 3, 'dt':'2005-10-20' }
]
and a table in sqlite created using sqlalchemy as follows
engine = create_engine(config.SQLALCHEMY_DATABASE_URI)
metadata = MetaData()
tbl1 = Table('t1', metadata,
Column('the_id', Integer, primary_key=True),
Column('the_dt', Date))
metadata.create_all(bind=engine)
stmt_insert = tbl1.insert().values(the_id=bindparam('id'), the_dt=bindparam('dt', type_=Date)
with engine.connect() as conn:
conn.execute(stmt_insert, data)
This gives me the following error:
sqlalchemy.exc.StatementError: (builtins.TypeError) SQLite Date type
only accepts Python date objects as input.
What do I assign to the "type_" parameter to make this binding work ?
You need to convert your dt strings to date objects, for instance:
import datetime
for item in data:
item['dt'] = datetime.datetime.strptime(item['dt'], "%Y-%m-%d").date()
If you don't need the ORM part of SQLAlchemy (no class/table mapping). The easiest way is to tell SQLAlchemy that you use String instead of Date, like this:
tbl1 = Table('t1', metadata,
Column('the_id', Integer, primary_key=True),
Column('the_dt', String))
It will work because your date string use ISO 8601 format ("%Y-%m-%d").
But the best practices are:
read records from CSV
convert data to Python objects (int, date, etc.)
insert data in database.
The conversion can be done in the constructor of the class which is mapped to the table.
EDIT A kind of "solution"
from sqlalchemy import Table, Column, Integer, MetaData, Date, String
from sqlalchemy import create_engine
from sqlalchemy.sql import select
engine = create_engine('sqlite:///:memory:', echo=True)
metadata = MetaData()
I've created a "fake" table with a String type instead of a Date:
fake_tbl1 = Table('t1', metadata,
Column('id', Integer, primary_key=True),
Column('dt', String))
metadata.create_all(bind=engine)
And insert the data as-is:
data = [
{'id': 1, 'dt': '2002-01-02'},
{'id': 2, 'dt': '2014-01-15'},
{'id': 3, 'dt': '2005-10-20'}
]
stmt_insert = fake_tbl1.insert()
with engine.connect() as conn:
conn.execute(stmt_insert, data)
Then I redefine the same table with the required Date field:
tbl2 = Table('t1', metadata,
Column('id', Integer, primary_key=True),
Column('dt', Date),
extend_existing=True)
Here is a rows selection:
stmt_select = select([tbl2])
with engine.connect() as conn:
result = conn.execute(stmt_select)
for row in result:
print(row)
You'll get:
(1, datetime.date(2002, 1, 2))
(2, datetime.date(2014, 1, 15))
(3, datetime.date(2005, 10, 20))
This works for your simple case, but I won't recommend this for a generic solution.

Categories

Resources