Snowflake: SQL compilation error: error line invalid identifier '"dateutc"' - python

I'm moving data from Postgres to Snowflake. Originally it worked, however I've added:
df_postgres["dateutc"]= pd.to_datetime(df_postgres["dateutc"])
because the date format was loading into Snowflake incorrectly, and now I see this error:
SQL compilation error: error line 1 at position 87 invalid identifier
'"dateutc"'
Here is my code:
from sqlalchemy import create_engine
import pandas as pd
import glob
import os
from config import postgres_user, postgres_pass, host,port, postgres_db, snow_user, snow_pass,snow_account,snow_warehouse
from snowflake.connector.pandas_tools import pd_writer
from snowflake.sqlalchemy import URL
from sqlalchemy.dialects import registry
registry.register('snowflake', 'snowflake.sqlalchemy', 'dialect')
engine = create_engine(f'postgresql://{postgres_user}:{postgres_pass}@{host}:{port}/{postgres_db}')
conn = engine.connect()
#reads query
df_postgres = pd.read_sql("SELECT * FROM rok.my_table", conn)
#dropping these columns
drop_cols=['RPM', 'RPT']
df_postgres.drop(drop_cols, inplace=True, axis=1)
#changed columns to lowercase
df_postgres.columns = df_postgres.columns.str.lower()
df_postgres["dateutc"]= pd.to_datetime(df_postgres["dateutc"])
print(df_postgres.dateutc.dtype)
sf_conn = create_engine(URL(
    account = snow_account,
    user = snow_user,
    password = snow_pass,
    database = 'test',
    schema = 'my_schema',
    warehouse = 'test',
    role = 'test',
))
df_postgres.to_sql(name='my_table',
                   index=False,
                   con=sf_conn,
                   if_exists='append',
                   chunksize=300,
                   method=pd_writer)

Moving Ilja's answer from comment to answer for completeness:
Snowflake is case sensitive.
When writing "unquoted" SQL, Snowflake will convert table names and fields to uppercase.
This usually works, until someone decides to start quoting their identifiers in SQL.
pd_writer adds quotes to identifiers.
Hence when you have df_postgres["dateutc"], it remains lowercase when it's transformed into a fully quoted query.
Writing df_postgres["DATEUTC"] in Python should fix the issue.
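Applied to the code in the question, a minimal sketch of that fix (assuming the rest of the script stays the same) is to upper-case the columns just before writing, so the quoted identifiers that pd_writer emits match the upper-cased columns of the unquoted Snowflake table:
# Upper-case every column so pd_writer's quoted identifiers ("DATEUTC", ...)
# match the columns Snowflake created for an unquoted table definition.
df_postgres.columns = df_postgres.columns.str.upper()
df_postgres["DATEUTC"] = pd.to_datetime(df_postgres["DATEUTC"])
df_postgres.to_sql(name='my_table',
                   index=False,
                   con=sf_conn,
                   if_exists='append',
                   chunksize=300,
                   method=pd_writer)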

Related

Inserting a pandas dataframe into snowflake table using sqlalchemy

I'm trying to insert a pandas DataFrame into a Snowflake table using SQLAlchemy
My dataframe looks like
df =
FRUITS VEGETABLES
0 apple potato
1 banana onion
2 mango beans
My code:
import pandas as pd
import sqlalchemy
from snowflake.connector.pandas_tools import pd_writer
from sqlalchemy import create_engine
from snowflake.sqlalchemy import URL
# account details
account_identifier = 'account_identifier'
user = 'user_login_name'
password = 'password'
database_name = 'database_name'
schema_name = 'schema_name'
conn_string = f"snowflake://{user}:{password}@{account_identifier}/{database_name}/{schema_name}"
engine = create_engine(conn_string)
table_name = 'my_table'
if_exists = 'append'
if __name__ == '__main__':
    df = pd.read_csv('my.csv')
    with engine.connect() as con:
        df.to_sql(name=table_name.lower(), con=con, if_exists=if_exists, index=False, method=pd_writer)
I'm getting an error:
snowflake.connector.errors.ProgrammingError: SQL compilation error: error line 1 at position 79
invalid identifier '"FRUITS"'
I don't understand why this is giving an error even though my table schema has only two columns
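This is most likely the same quoting issue covered in the answer above: pd_writer emits fully quoted identifiers, so the DataFrame column names must match the case of the table's columns exactly. Since the DataFrame columns here are already upper-case and Snowflake still rejects "FRUITS", the table was probably created with quoted, lower-case column names; a sketch of the usual fix (an assumption, since the table DDL isn't shown) is to make the DataFrame columns match the table's exact case:
df = pd.read_csv('my.csv')
# pd_writer quotes the column identifiers, so their case must match the table
# exactly. If the table was created with quoted lower-case columns:
df.columns = df.columns.str.lower()
# (for a table created with unquoted, i.e. upper-cased, columns use .str.upper())
with engine.connect() as con:
    df.to_sql(name=table_name.lower(), con=con, if_exists=if_exists, index=False, method=pd_writer)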

Trouble With SQL Query in Python

Hello, I'm getting an error: near "join": syntax error. Is there an obvious issue with this that I'm not picking up on? I've changed names in the query, but I've already gone over it and checked for spelling errors.
import pandas as pd
import sqlite3 as sql
path1 = r'C:\file.xlsx'
path2 = r'C:\file2.xlsx'
tenants = pd.read_excel(path1, sheet_name='1')
buildings = pd.read_excel(path2)
db = sql.connect('temp.db')
tenants.to_sql('tenantsdb', db)
buildings.to_sql('buildingsdb', db)
Query = pd.read_sql("select t.*, b.distance from tenantsdb t where city = 'city' join buildingsdb b on t.Address = b.Street_Address;", db)
db.close()
In SQL, the order of clauses is SELECT, FROM, JOIN, WHERE. You have JOIN in the wrong place.
Query = pd.read_sql("""
select t.*, b.distance
from tenantsdb t
join buildingsdb b on t.Address = b.Street_Address
where city = 'city';""", db)

JSON response into database

OK, I have tried several kinds of solutions recommended by others on this site and other sites. However, I can't get it to work the way I would like.
I get an XML response which I normalize and then save to a CSV. This first part works fine.
Instead of saving it to a CSV, I would like to save it into an existing table in an Access database. In the second part below:
I would like to use an existing table instead of creating a new one.
The result is not separated by ";" into different columns; everything ends up in one unseparated column.
response = requests.get(u,headers=h).json()
dp = pd.json_normalize(response,'Units')
response_list.append(dp)
export = pd.concat(response_list)
export.to_csv(r'C:\Users\username\Documents\Python Scripts\Test\Test2_'+str(now)+'.csv', index=False, sep=';',encoding='utf-8')
access_path = r"C:\Users\username\Documents\Python Scripts\Test\Test_db.accdb"
conn = pyodbc.connect("DRIVER={{Microsoft Access Driver (*.mdb, *.accdb)}};DBQ={};" \
.format(access_path))
strSQL = "SELECT * INTO projects2 FROM [text;HDR=Yes;FMT=sep(;);" + \
"Database=C:\\Users\\username\\Documents\\Python Scripts\\Test].Testdata.csv;"
cur = conn.cursor()
cur.execute(strSQL)
conn.commit()
conn.close()
If you already have the data in a well-formed pandas DataFrame then you don't really need to dump it to a CSV file; you can use the sqlalchemy-access dialect to push the data directly into an Access table using pandas' to_sql() method:
from pprint import pprint
import urllib
import pandas as pd
import sqlalchemy as sa
connection_string = (
    r"DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};"
    r"DBQ=C:\Users\Public\Database1.accdb;"
    r"ExtendedAnsiSQL=1;"
)
connection_uri = f"access+pyodbc:///?odbc_connect={urllib.parse.quote_plus(connection_string)}"
engine = sa.create_engine(connection_uri)
with engine.begin() as conn:
    # existing data in table
    pprint(
        conn.execute(sa.text("SELECT * FROM user_table")).fetchall(), width=30
    )
"""
[('gord', 'gord@example.com'),
 ('jennifer', 'jennifer@example.com')]
"""
# DataFrame to insert
df = pd.DataFrame(
    [
        ("newdev", "newdev@example.com"),
        ("newerdev", "newerdev@example.com"),
    ],
    columns=["username", "email"],
)
df.to_sql("user_table", engine, index=False, if_exists="append")
with engine.begin() as conn:
    # updated table
    pprint(
        conn.execute(sa.text("SELECT * FROM user_table")).fetchall(), width=30
    )
"""
[('gord', 'gord@example.com'),
 ('jennifer', 'jennifer@example.com'),
 ('newdev', 'newdev@example.com'),
 ('newerdev', 'newerdev@example.com')]
"""
(Disclosure: I am currently the maintainer of the sqlalchemy-access dialect.)
Solved with the following code
SE_export_Tuple = list(zip(SE_export.Name,SE_export.URL,SE_export.ImageUrl,......,SE_export.ID))
print(SE_export_Tuple)
access_path = r"C:\Users\username\Documents\Python Scripts\Test\Test_db.accdb"
conn = pyodbc.connect("DRIVER={{Microsoft Access Driver (*.mdb, *.accdb)}};DBQ={};" \
.format(access_path))
cursor = conn.cursor()
mySql_insert_query="INSERT INTO Temp_table (UnitName,URL,ImageUrl,.......,ID) VALUES (?,?,?,......,?)"
cursor.executemany(mySql_insert_query,SE_export_Tuple)
conn.commit()
conn.close()
However, when I add many fields I get an error at "executemany", saying:
cursor.executemany(mySql_insert_query,SE_export_Tuple)
Error: ('HY004', '[HY004] [Microsoft][ODBC Microsoft Access Driver]Invalid SQL data type (67) (SQLBindParameter)')
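A common cause of this HY004 error with the Access ODBC driver (an assumption here, since the failing values aren't shown in the post) is binding values the driver can't handle, such as NaN/NaT or numpy scalars taken straight from the DataFrame. A minimal sketch that converts the rows to plain Python objects and None before executemany:
# Convert NaN/NaT to None and numpy scalars to plain Python objects,
# which the Access ODBC driver can bind as parameters.
records = SE_export.astype(object).where(pd.notnull(SE_export), None)
SE_export_Tuple = list(records.itertuples(index=False, name=None))
cursor.executemany(mySql_insert_query, SE_export_Tuple)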

How to raise exception when column name too long in Postgres + SQLalchemy?

I have a script in python that uploads files to a Postgres database server. These files are then converted to SQL tables. For this, I'm using the SQLalchemy library.
The problem arises when the column names are too long. I don't want Postgres to truncate the column names automatically when they exceed the maximum length (if I recall correctly, it's 63 in Postgres). The tables end up having columns with unintelligible names, and I would rather have the script cancel the upload.
The obvious solution is to just "hardcode" the maximum length in my script and raise an exception if someone tries to upload a table with "too long" column names. Nevertheless, I think this should be configurable in SQLAlchemy, since it raises an exception when, for example, the table name is already in use in the database.
Extract from my script to upload table:
from sqlalchemy import (
    create_engine,
)
import pandas as pd
DB_CONFIG_DICT = {
'user': "user",
'host': "urlforhost.com",
'port': 5432,
'password': "password"
}
DB_CONN_FORMAT = "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"
DB_CONN_URI_DEFAULT = (DB_CONN_FORMAT.format( database='sandbox', **DB_CONFIG_DICT))
engine = create_engine(DB_CONN_URI_DEFAULT)
path = "file.csv"
table_name = "table_name"
df = pd.read_csv(path, decimal=r".")
df.columns = [c.lower() for c in df.columns] #postgres doesn't like capitals or spaces
df.to_sql(table_name, engine)
I hope this can help you.
def check_column_name(name):
    if len(name) > 63:
        raise ValueError("column name (%s) is too long" % name)

df.columns = [c.lower() for c in df.columns]
# map() is lazy in Python 3, so loop explicitly to make the check actually run
for name in df.columns:
    check_column_name(name)
df.to_sql(table_name, engine)

SQLAlchemy ORM conversion to pandas DataFrame

Is there a solution converting a SQLAlchemy <Query object> to a pandas DataFrame?
Pandas has the capability to use pandas.read_sql but this requires use of raw SQL. I have two reasons for wanting to avoid it:
I already have everything using the ORM (a good reason in and of itself) and
I'm using python lists as part of the query, e.g.:
db.session.query(Item).filter(Item.symbol.in_(add_symbols)), where Item is my model class and add_symbols is a list. This is the equivalent of SQL's SELECT ... FROM ... WHERE ... IN.
Is anything possible?
Below should work in most cases:
df = pd.read_sql(query.statement, query.session.bind)
See pandas.read_sql documentation for more information on the parameters.
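For the list-based filter from the question, that looks roughly like this (a sketch assuming the Item model and add_symbols list from the question):
# Build the ORM query with the IN filter, then hand its SELECT statement
# and bind to pandas.
query = db.session.query(Item).filter(Item.symbol.in_(add_symbols))
df = pd.read_sql(query.statement, query.session.bind)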
Just to make this more clear for novice pandas programmers, here is a concrete example,
pd.read_sql(session.query(Complaint).filter(Complaint.id == 2).statement,session.bind)
Here we select a complaint from complaints table (sqlalchemy model is Complaint) with id = 2
For completeness' sake: as an alternative to the pandas function read_sql_query(), you can also use the pandas DataFrame function from_records() to convert a structured or record ndarray to a DataFrame.
This comes in handy if, for example, you have already executed the query in SQLAlchemy and have the results available:
import pandas as pd
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import scoped_session, sessionmaker

SQLALCHEMY_DATABASE_URI = 'postgresql://postgres:postgres@localhost:5432/my_database'
engine = create_engine(SQLALCHEMY_DATABASE_URI, pool_pre_ping=True, echo=False)
db = scoped_session(sessionmaker(autocommit=False, autoflush=False, bind=engine))
Base = declarative_base(bind=engine)

class Currency(Base):
    """The `Currency`-table"""
    __tablename__ = "currency"
    __table_args__ = {"schema": "data"}

    id = Column(Integer, primary_key=True, nullable=False)
    name = Column(String(64), nullable=False)

# Defining the SQLAlchemy-query
currency_query = db.query(Currency).with_entities(Currency.id, Currency.name)

# Getting all the entries via SQLAlchemy
currencies = currency_query.all()

# We provide also the (alternate) column names and set the index here,
# renaming the column `id` to `currency__id`
df_from_records = pd.DataFrame.from_records(currencies,
                                            index='currency__id',
                                            columns=['currency__id', 'name'])
print(df_from_records.head(5))

# Or getting the entries via Pandas instead of SQLAlchemy using the
# aforementioned function `read_sql_query()`. We can set the index-columns here as well
df_from_query = pd.read_sql_query(currency_query.statement, db.bind, index_col='id')
# Renaming the index-column(s) from `id` to `currency__id` needs another statement
df_from_query.index.rename(name='currency__id', inplace=True)
print(df_from_query.head(5))
The selected solution didn't work for me, as I kept getting the error
AttributeError: 'AnnotatedSelect' object has no attribute 'lower'
I found the following worked:
df = pd.read_sql_query(query.statement, engine)
If you want to compile a query with parameters and dialect specific arguments, use something like this:
c = query.statement.compile(query.session.bind)
df = pandas.read_sql(c.string, query.session.bind, params=c.params)
import pandas as pd
from sqlalchemy import Column, Integer, Date, create_engine, select
from sqlalchemy.dialects.postgresql import DOUBLE_PRECISION
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

engine = create_engine('postgresql://postgres:postgres@localhost:5432/DB', echo=False)
Base = declarative_base(bind=engine)
Session = sessionmaker(bind=engine)
session = Session()
conn = session.bind

class DailyTrendsTable(Base):
    __tablename__ = 'trends'
    __table_args__ = ({"schema": 'mf_analysis'})

    company_code = Column(DOUBLE_PRECISION, primary_key=True)
    rt_bullish_trending = Column(Integer)
    rt_bearish_trending = Column(Integer)
    rt_bullish_non_trending = Column(Integer)
    rt_bearish_non_trending = Column(Integer)
    gen_date = Column(Date, primary_key=True)

df_query = select([DailyTrendsTable])
df_data = pd.read_sql(df_query, con=conn)
Using the 2.0 SQLAlchemy syntax (also available in 1.4 with the flag future=True), it looks like pd.read_sql is not implemented yet, and it will raise:
NotImplementedError: This method is not implemented for SQLAlchemy 2.0.
This is an open issue that won't be solved until pandas 2.0; you can find some information about it here and here.
I didn't find any satisfactory workaround, but some people seem to be using two configurations of the engine, one with the flag future=False:
engine2 = create_engine(URL_string, echo=False, future=False)
This solution would be OK if you query with raw strings, but when using the ORM, the best I could do is a custom function (yet to be optimized), but it works:
Conditions = session.query(ExampleTable)

def df_from_sql(query):
    return pd.DataFrame([i.__dict__ for i in query]).drop(columns='_sa_instance_state')

df = df_from_sql(Conditions)
This solution in any case would be provisional till pd.read_sql has implemented the new syntax.
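For reference, once pandas 2.0 is in place, the call should roughly look like the sketch below (an assumption based on the open issue above; ExampleTable is the same model as in the previous snippet):
import pandas as pd
import sqlalchemy as sa

# With pandas >= 2.0, a 2.0-style engine/connection can be passed directly.
with engine.connect() as conn:
    df = pd.read_sql(sa.select(ExampleTable), conn)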
When you're using the ORM it's as simple as this:
pd.DataFrame([r._asdict() for r in query.all()])
A good alternative to pd.read_sql when you don't want to expose SQL and sessions to the business-logic code.
Found it here: https://stackoverflow.com/a/52208023/1635525
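Note that _asdict() is available on the Row objects you get when querying specific columns (or using with_entities); a plain session.query(Model) returns model instances, which don't have it. A short sketch of the working case, where price is a hypothetical column alongside the question's Item.symbol:
# Rows from a column-level query support _asdict(); `price` is hypothetical.
rows = db.session.query(Item.symbol, Item.price).all()
df = pd.DataFrame([r._asdict() for r in rows])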
This answer provides a reproducible example using a SQLAlchemy select statement and returning a pandas DataFrame. It is based on an in-memory SQLite database so that anyone can reproduce it without installing a database engine.
import pandas
from sqlalchemy import create_engine
from sqlalchemy import MetaData, Table, Column, Text
from sqlalchemy.orm import Session
Define table metadata and create a table
engine = create_engine('sqlite://')
meta = MetaData()
meta.bind = engine
user_table = Table('user', meta,
                   Column("name", Text),
                   Column("full_name", Text))
user_table.create()
Insert some data into the user table
stmt = user_table.insert().values(name='Bob', full_name='Sponge Bob')
with Session(engine) as session:
    result = session.execute(stmt)
    session.commit()
Read the result of a select statement into a pandas data frame
# Select data into a pandas data frame
stmt = user_table.select().where(user_table.c.name == 'Bob')
df = pandas.read_sql_query(stmt, engine)
df
Out:
name full_name
0 Bob Sponge Bob
If you use a raw SQL query:
def generate_df_from_sqlquery(query):
    from pandas import DataFrame
    result = db.session.execute(query)
    df = DataFrame(result.fetchall())
    if len(df) > 0:
        df.columns = result.keys()
    else:
        df = DataFrame(columns=result.keys())
    return df
profile_df = generate_df_from_sqlquery(profile_query)
Simple example using the CursorResult.keys() method to get the column names.
import sqlalchemy as sa
import pandas as pd
engine = sa.create_engine(...)
with engine.connect() as conn:
    result = conn.execute(sa.text("SELECT * FROM foo;"))
    df = pd.DataFrame(result.all(), columns=result.keys())
https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Result.keys
Adding to the answers that use read_sql like @van's: when my query involved a join, SQLAlchemy seemed to implicitly add aliased columns from the joined tables, like id_1 and id_2, in case the joined tables and the primary table both had an id column, for example. Using .all() removes these implicit columns before returning results, but read_sql will include them.
The solution for that case, for me, was to be explicit in my selects. So I replaced
query = session.query(model)
with
query = session.query(model.col_1, model.col_2)
or for select all
query = session.query(*model.__table__.columns.values())
then
df = pd.read_sql(query.statement, query.session.bind)
