Invalid Identifier programming error in snowflake - python

I am running the following query in snowflake and it runs fine.
set id ='TEST_TABLE1';
set time_s = '2021-03-31 06:52:51+00:00';
merge into TEST_STATUS using (select column1 AS TABLENAME,
column2 AS LASTUPDATED from values ($id,$time_s)) tt
on TEST_STATUS.TABLE_NAME = tt.TABLENAME
when matched then update set TEST_STATUS.LAST_UPDATED = tt.LASTUPDATED
when not matched then insert (TABLE_NAME, LAST_UPDATED) values (tt.TABLENAME, tt.LASTUPDATED)
But when I try to run it via python code as following:
# Question code: builds the SET and MERGE statements as f-strings.
# NOTE(review): the issues below are what produce the "invalid identifier" error.
self.table = 'TEST_TABLE1'
self.timestamp='2021-03-31 06:52:51+00:00';
# BUG: the value is not quoted, so Snowflake parses TEST_TABLE1 as an
# identifier instead of a string literal -> "invalid identifier 'TEST_TABLE1'".
cmd = f"set id ={self.table};"
# BUG: references bare `timestamp` (undefined; should be self.timestamp) and
# embeds the literal text str(...) into the SQL instead of calling Python's str().
cmd2 = f"set time_s = str({timestamp});"
# NOTE(review): the working SQL above merges into TEST_STATUS, but this merges
# into {self.table} while the UPDATE clause uses {self.status_tbl} -- the
# merge target and the table named in the UPDATE clause must be the same.
merge_cmd = f"merge into {self.table} using (select column1 AS TABLENAME, column2 AS LASTUPDATED from " \
f"values ($id,$time_s)) tt on {self.table}.TABLE_NAME = tt.TABLENAME when " \
f"matched then update set {self.status_tbl}.LAST_UPDATED = tt.LASTUPDATED when not matched then " \
f"insert (TABLE_NAME, LAST_UPDATED) values (tt.TABLENAME, tt.LASTUPDATED) "
self.snowflake_client.run(cmd)
self.snowflake_client.run(cmd2)
self.snowflake_client.run(merge_cmd)
I am getting exception as:
snowflake.connector.errors.ProgrammingError: 000904 (42000): SQL compilation error: error line 1 at position 14
invalid identifier 'TEST_TABLE1'

You need to add single quotation marks around the string values when assigning them to the session variables:
# Fixed version: quote the *values* when setting the session variables, and
# reference the variables as $id / $time_s (unquoted) inside the MERGE.
self.table = 'TEST_TABLE1'
self.timestamp = '2021-03-31 06:52:51+00:00'
# String values must be wrapped in single quotes, otherwise Snowflake treats
# them as identifiers ("invalid identifier 'TEST_TABLE1'").
cmd = f"set id = '{self.table}';"
# BUG FIX: was  f"set time_s = 'str({timestamp})';"  -- `timestamp` is an
# undefined name (should be self.timestamp) and the text "str(...)" would
# have been sent to Snowflake literally.
cmd2 = f"set time_s = '{self.timestamp}';"
# BUG FIX: $id / $time_s must NOT be quoted here -- '$id' is the literal
# string "$id", not a session-variable reference.  Also merge into the
# status table (TEST_STATUS in the working SQL), the same table named in
# the UPDATE clause, not the data table itself.
merge_cmd = f"merge into {self.status_tbl} using (select column1 AS TABLENAME, column2 AS LASTUPDATED from " \
            f"values ($id, $time_s)) tt on {self.status_tbl}.TABLE_NAME = tt.TABLENAME when " \
            f"matched then update set {self.status_tbl}.LAST_UPDATED = tt.LASTUPDATED when not matched then " \
            f"insert (TABLE_NAME, LAST_UPDATED) values (tt.TABLENAME, tt.LASTUPDATED) "
self.snowflake_client.run(cmd)
self.snowflake_client.run(cmd2)
self.snowflake_client.run(merge_cmd)

Related

How to write a function that runs certain SQL on certain columns in a PySpark dataframe?

I wrote some code and have this as output. The left side is basically the columns of a dataframe that I'm working with, and the right side is the SQL query that needs to be run on that particular column.
Now I want to write a function that runs the queries on the right on the columns on the left and display the output.
The first picture is basically the values of the 'Column' and 'Query' columns of another dataframe. I used .collect() methods to retrieve those values.
This seemed like a simple problem but I'm still stuck at it. Any idea how to do it?
Using a subset of your data
# Build a small demo dataframe: each row pairs a column name with the SQL
# query template (containing {col} / {table} placeholders) to run on it.
data_ls = [
('maxpulse', 'select count(*) from {table} where {col} is null'),
('duration', 'select round((count(distinct {col}) / count({col})) * 100) from {table}')
]
data_sdf = spark.sparkContext.parallelize(data_ls).toDF(['column', 'query'])
# +--------+-----------------------------------------------------------------------+
# |column |query |
# +--------+-----------------------------------------------------------------------+
# |maxpulse|select count(*) from {table} where {col} is null |
# |duration|select round((count(distinct {col}) / count({col})) * 100) from {table}|
# +--------+-----------------------------------------------------------------------+
Approach 1: Using UDF
def createQuery(query_string, column_name=None, table_name=None):
    """Substitute the {col} / {table} placeholders in *query_string*.

    Any part that is not supplied (left as None) survives as a literal
    placeholder so a later pass can fill it in; if neither is supplied
    the template is returned untouched.
    """
    if column_name is None and table_name is None:
        return query_string
    # Re-emit the placeholder itself for whichever argument is missing.
    col_arg = '{col}' if column_name is None else column_name
    table_arg = '{table}' if table_name is None else table_name
    return query_string.format(col=col_arg, table=table_arg)
# Register the helper as a Spark UDF and apply it row-wise: only the column
# name is substituted here, so {table} survives for a later replacement pass.
createQueryUDF = func.udf(createQuery, StringType())
data_sdf. \
withColumn('final_query', createQueryUDF('query', 'column')). \
select('final_query'). \
show(truncate=False)
# +-----------------------------------------------------------------------------+
# |final_query |
# +-----------------------------------------------------------------------------+
# |select count(*) from {table} where maxpulse is null |
# |select round((count(distinct duration) / count(duration)) * 100) from {table}|
# +-----------------------------------------------------------------------------+
Approach 2: Using regexp_replace() sql function
# Alternative without a UDF: do the placeholder substitution in Spark SQL
# itself with regexp_replace ("{col}" -> the value of the `column` field).
data_sdf. \
withColumn('final_query', func.expr('regexp_replace(query, "[\{]col[\}]", column)')). \
select('final_query'). \
show(truncate=False)
# +-----------------------------------------------------------------------------+
# |final_query |
# +-----------------------------------------------------------------------------+
# |select count(*) from {table} where maxpulse is null |
# |select round((count(distinct duration) / count(duration)) * 100) from {table}|
# +-----------------------------------------------------------------------------+
Similar approach can be used to replace '{table}' with a table name. The final queries from the final_query field can then be collected (using .collect()) and used further to run sql queries.
# Replace {col}, then pull the finished query strings back to the driver
# as a plain Python list.
query_list = data_sdf. \
withColumn('final_query', func.expr('regexp_replace(query, "[\{]col[\}]", column)')). \
select('final_query'). \
rdd.map(lambda x: x.final_query). \
collect()
# ['select count(*) from {table} where maxpulse is null',
# 'select round((count(distinct duration) / count(duration)) * 100) from {table}']
# run the queries by iterating over the list
for query in query_list:
    # NOTE(review): spark.sql() is lazy and its returned DataFrame is
    # discarded here; collect()/show() it if the results are needed.
    spark.sql(query)
You can put column names and queries to a dictionary:
# Map each column name to its query template, then format one query per entry.
dct = {'column_name': 'SELECT * FROM table WHERE {col} IS NULL'}
for k, v in dct.items():
    q = v.format(col = k)
    # spark.sql(q)
    print(q)
Output:
SELECT * FROM table WHERE column_name IS NULL

SQLAlchemy not execute a query

I'm having a problem with SQLAlchemy when I try to execute a query. My script had been working fine, and every query execution worked, until now. Here is the code:
# FIX: create the engine/connection once, outside the loop -- the original
# rebuilt them on every iteration, which is wasteful and leaks connections.
engine = sqla.create_engine(URL_ORACLE)
connection = engine.connect()
for i in listaUnificacion:
    usu = "'AUTO'"
    incabuniper = "'S'"
    sCodPersonaPr, sPers = i[0], i[1]
    seq_query = sqla.Sequence('SEQ_PERUNI')
    pnCodSecPerUni = connection.execute(seq_query)
    # NOTE(review): string-formatting values into SQL is open to injection;
    # prefer bound parameters (sqla.text with :name placeholders).
    query = "INSERT INTO TABLE1(SEC, CD, CDUNIF, DATE, USU, INCABUNIPER) VALUES({0}, {1}, {2}, SYSDATE, {3}, {4})".format(pnCodSecPerUni, sCodPersonaPr, sPers, str(usu), str(incabuniper))
    query = sqla.text(query)
    print(query)
    # FIX: run the unit of work in an explicit transaction.  SQLAlchemy 1.x
    # only autocommits statements it recognises as DML; an anonymous PL/SQL
    # block (DECLARE ... BEGIN ... END) is not detected as such, so without
    # an explicit commit it appears to "not work" while raising no error.
    trans = connection.begin()
    connection.execute(query)
    query = "UPDATE TABLE2 SET type = 'M' WHERE cd = {}".format(sPers)
    connection.execute(query)
    query_uni = "DECLARE\
    res varchar2(100);\
    errorm varchar2(1000);\
    BEGIN\
    res := USER.FNC({0},{1},{2},'AUTO',errorm);\
    END;".format(pnCodSecPerUni, sCodPersonaPr, sPers)
    # BUG FIX: the original passed the undefined name `query_unifica` here,
    # which would raise NameError (or reuse a stale variable) instead of
    # wrapping the block just built above.
    query_uni = sqla.text(query_uni)
    connection.execute(query_uni)
    trans.commit()
connection.close()
When I try to execute query_unifica, it doesn't work but it doesn't show any error. I put here the execution with some prints:
PARES
(11005202, 11002071)
INSERT INTO TABLE1(SEC, CD, CDUNIF,, DATE, USU, INCABUNIPER)
VALUES(1628226, 11005202, 11002071, SYSDATE, 'AUTO', 'S') --> WORKS FINE
UPDATE TABLE2 SET type = 'M' WHERE cd = 11002071 --> works fine
DECLARE res varchar2(100); errorm
varchar2(1000); BEGIN res :=
USER.FNC(1628226,11005202,11002071,'AUTO',errorm); END; --
> DOESN'T WORK!!!

Speeding up insertion of point data from netcdf

I've got this netcdf of weather data (one of thousands that require postgresql ingestion). I'm currently capable of inserting each band into a postgis-enabled table at a rate of about 20-23 seconds per band. (for monthly data, there is also daily data that i have yet to test.)
I've heard of different ways of speeding this up using COPY FROM, removing the gid, using ssds, etc... but I'm new to python and have no idea how to store the netcdf data to something I could use COPY FROM or what the best route might be.
If anyone has any other ideas on how to speed this up, please share!
Here is the ingestion script
import netCDF4, psycopg2, time
# Establish connection
db1 = psycopg2.connect("host=localhost dbname=postgis_test user=********** password=********")
cur = db1.cursor()
# (Re)create the target table in the PostGIS-enabled database.
print(str(time.ctime()) + " CREATING TABLE")
try:
    cur.execute("DROP TABLE IF EXISTS table_name;")
    db1.commit()
    cur.execute(
        "CREATE TABLE table_name (gid serial PRIMARY KEY not null, thedate DATE, thepoint geometry, lon decimal, lat decimal, thevalue decimal);")
    db1.commit()
    print("TABLE CREATED")
except:
    # NOTE(review): bare except + printing the exception *class* hides the
    # real error; catch psycopg2.DatabaseError as e and print e instead.
    print(psycopg2.DatabaseError)
    print("TABLE CREATION FAILED")
# Open the netCDF file and pull out the coordinate and time axes.
rawvalue_nc_file = 'netcdf_file.nc'
nc = netCDF4.Dataset(rawvalue_nc_file, mode='r')
nc.variables.keys()
lat = nc.variables['lat'][:]
lon = nc.variables['lon'][:]
time_var = nc.variables['time']
dtime = netCDF4.num2date(time_var[:], time_var.units)
newtime = [fdate.strftime('%Y-%m-%d') for fdate in dtime]
rawvalue = nc.variables['tx_max'][:]
# Index -> coordinate lookup tables for lat/lon.
lathash = {}
lonhash = {}
entry1 = 0
entry2 = 0
lattemp = nc.variables['lat'][:].tolist()
for entry1 in range(lat.size):
    lathash[entry1] = lattemp[entry1]
lontemp = nc.variables['lon'][:].tolist()
for entry2 in range(lon.size):
    lonhash[entry2] = lontemp[entry2]
# One INSERT *and one commit* per point: this per-row round trip is the
# bottleneck the question is about (~20-23 s per band).
for timestep in range(dtime.size):
    print(str(time.ctime()) + " " + str(timestep + 1) + "/180")
    for _lon in range(lon.size):
        for _lat in range(lat.size):
            latitude = round(lathash[_lat], 6)
            longitude = round(lonhash[_lon], 6)
            thedate = newtime[timestep]
            # Kelvin -> Celsius (the 273.15 offset); the > -100 test below
            # filters out fill/sentinel values.
            thevalue = round(float(rawvalue.data[timestep, _lat, _lon] - 273.15), 3)
            if (thevalue > -100):
                cur.execute("INSERT INTO table_name (thedate, thepoint, thevalue) VALUES (%s, ST_MakePoint(%s,%s,0), %s)",(thedate, longitude, latitude, thevalue))
                db1.commit()
cur.close()
db1.close()
print(" Done!")
If you're certain most of the time is spent in PostgreSQL, and not in any other code of your own, you may want to look at the fast execution helpers, namely cur.execute_values() in your case.
Also, you may want to make sure you're in a transaction, so the database doesn't fall back to an autocommit mode. ("If you do not issue a BEGIN command, then each individual statement has an implicit BEGIN and (if successful) COMMIT wrapped around it.")
Something like this could do the trick -- not tested though.
# BUG FIX: execute_values lives in psycopg2.extras, a submodule the original
# script never imports (it only imports psycopg2) -- without this line,
# psycopg2.extras.execute_values raises AttributeError/ModuleNotFoundError.
import psycopg2.extras

for timestep in range(dtime.size):
    print(str(time.ctime()) + " " + str(timestep + 1) + "/180")
    values = []
    # Explicit BEGIN so the batch is one transaction instead of an implicit
    # BEGIN/COMMIT around every statement in autocommit mode.
    cur.execute("BEGIN")
    for _lon in range(lon.size):
        for _lat in range(lat.size):
            latitude = round(lathash[_lat], 6)
            longitude = round(lonhash[_lon], 6)
            thedate = newtime[timestep]
            thevalue = round(
                float(rawvalue.data[timestep, _lat, _lon] - 273.15), 3
            )
            if thevalue > -100:
                values.append((thedate, longitude, latitude, thevalue))
    # One multi-row INSERT for the whole band; the template maps each tuple
    # onto the ST_MakePoint() expression.
    psycopg2.extras.execute_values(
        cur,
        "INSERT INTO table_name (thedate, thepoint, thevalue) VALUES %s",
        values,
        template="(%s, ST_MakePoint(%s,%s,0), %s)"
    )
    db1.commit()

How to use python to ETL between databases?

Using psycopg2, I'm able to select data from a table in one PostgreSQL database connection and INSERT it into a table in a second PostgreSQL database connection.
However, I'm only able to do it by setting the exact feature I want to extract, and writing out separate variables for each column I'm trying to insert.
Does anyone know of a good practice for either:
moving an entire table between databases, or
iterating through features while not having to declare variables for every column you want to move
or...?
Here's the script I'm currently using where you can see the selection of a specific feature, and the creation of variables (it works, but this is not a practical method):
import psycopg2
# Two independent connections: source (Dev) and target (QA).
connDev = psycopg2.connect("host=host1 dbname=dbname1 user=postgres password=*** ")
connQa = psycopg2.connect("host=host2 dbname=dbname2 user=postgres password=*** ")
curDev = connDev.cursor()
curQa = connQa.cursor()
# Geometry is cast to varchar on the way out so it can be inserted into a
# geometry column on the target side.
sql = ('INSERT INTO "tempHoods" (nbhd_name, geom) values (%s, %s);')
curDev.execute('select cast(geom as varchar) from "CCD_Neighborhoods" where nbhd_id = 11;')
tempGeom = curDev.fetchone()
curDev.execute('select nbhd_name from "CCD_Neighborhoods" where nbhd_id = 11;')
tempName = curDev.fetchone()
# NOTE(review): fetchone() returns a 1-tuple, so 1-tuples (not scalars) are
# passed as the bind values here -- confirm this is intended.
data = (tempName, tempGeom)
curQa.execute (sql, data)
#commit transactions
connDev.commit()
connQa.commit()
#close connections
curDev.close()
curQa.close()
connDev.close()
connQa.close()
One other note is that python allows the ability to explicitly work with SQL functions / data type casting, which for us is important as we work with the GEOMETRY data type. Above you can see I'm casting it to TEXT then dumping it into an existing geometry column in the source table - this will work with MSSQL Server, which is a huge feature in the geospatial community...
In your solution (your solution and your question have a different order of statements) change the lines which start with 'sql = ' and the loop before '#commit transactions' comment to
# Build one multi-row INSERT per batch instead of one statement per row.
sql_insert = 'INSERT INTO "tempHoods" (nbhd_id, nbhd_name, typology, notes, geom) values '
sql_values = ['(%s, %s, %s, %s, %s)']
data_values = []
# you can make this larger if you want
# ...try experimenting to see what works best
batch_size = 100
sql_stmt = sql_insert + ','.join(sql_values * batch_size) + ';'
# BUG FIX: initialise i so an empty `rows` does not raise NameError in the
# trailing partial-batch check after the loop.
i = 0
for i, row in enumerate(rows, 1):
    data_values += row[:5]
    if i % batch_size == 0:
        # Flush one full batch of batch_size rows in a single statement.
        curQa.execute(sql_stmt, data_values)
        data_values = []
if (i % batch_size != 0):
    # Flush the final partial batch with a statement sized to fit it.
    sql_stmt = sql_insert + ','.join(sql_values * (i % batch_size)) + ';'
    curQa.execute(sql_stmt, data_values)
BTW, I don't think you need to commit. You don't begin any transactions. So there should not be any need to commit them. Certainly, you don't need to commit a cursor if all you did was a bunch of selects on it.
Here's my updated code based on Dmitry's brilliant solution:
# NOTE(review): Python 2 syntax (print statements) -- this script will not
# run under Python 3 as-is.
import psycopg2
connDev = psycopg2.connect("host=host1 dbname=dpspgisdev user=postgres password=****")
connQa = psycopg2.connect("host=host2 dbname=dpspgisqa user=postgres password=****")
curDev = connDev.cursor()
curQa = connQa.cursor()
print "Truncating Source"
curQa.execute('delete from "tempHoods"')
connQa.commit()
#Get Data
curDev.execute('select nbhd_id, nbhd_name, typology, notes, cast(geom as varchar) from "CCD_Neighborhoods";') #cast geom to varchar and insert into geometry column!
rows = curDev.fetchall()
sql_insert = 'INSERT INTO "tempHoods" (nbhd_id, nbhd_name, typology, notes, geom) values '
sql_values = ['(%s, %s, %s, %s, %s)'] #number of columns selecting / inserting
data_values = []
batch_size = 1000 #customize for size of tables...
sql_stmt = sql_insert + ','.join(sql_values*batch_size) + ';'
for i, row in enumerate(rows, 1):
    data_values += row[:5] #relates to number of columns (%s)
    if i % batch_size == 0:
        # One multi-row INSERT per full batch, committed as it goes.
        curQa.execute (sql_stmt , data_values )
        connQa.commit()
        print "Inserting..."
        data_values = []
if (i % batch_size != 0):
    # Final partial batch (statement resized to the remaining row count).
    sql_stmt = sql_insert + ','.join(sql_values*(i % batch_size)) + ';'
    curQa.execute (sql_stmt, data_values)
    print "Last Values..."
    connQa.commit()
# close connections
curDev.close()
curQa.close()
connDev.close()
connQa.close()

Build a dynamic update query in psycopg2

I have to construct a dynamic update query for postgresql.
Its dynamic, because beforehand I have to determine which columns to update.
Given a sample table:
create table foo (id int, a int, b int, c int)
Then I will construct programmatically the "set" clause
# Pseudo-code from the question: build the SET clause programmatically.
# NOTE(review): NULL is not a Python name -- in real code this would be None.
_set = {}
_set['a'] = 10
_set['c'] = NULL
After that I have to build the update query. And here I'm stuck.
I have to construct this sql Update command:
update foo set a = 10, b = NULL where id = 1
How to do this with the psycopg2 parametrized command? (i.e. looping through the dict if it is not empty and build the set clause) ?
UPDATE
While I was sleeping, I found the solution myself. It is dynamic, exactly how I wanted it to be :-)
create table foo (id integer, a integer, b integer, c varchar)
updates = {}
updates['a'] = 10
updates['b'] = None
updates['c'] = 'blah blah blah'
sql = "upgrade foo set %s where id = %s" % (', '.join("%s = %%s" % u for u in updates.keys()), 10)
params = updates.values()
print cur.mogrify(sql, params)
cur.execute(sql, params)
And the result is what and how I needed (especially the nullable and quotable columns):
"upgrade foo set a = 10, c = 'blah blah blah', b = NULL where id = 10"
There is actually a slightly cleaner way to make it, using the alternative column-list syntax:
sql_template = "UPDATE foo SET ({}) = %s WHERE id = {}"
sql = sql_template.format(', '.join(updates.keys()), 10)
params = (tuple(addr_dict.values()),)
print cur.mogrify(sql, params)
cur.execute(sql, params)
Using psycopg2.sql – SQL string composition module
The module contains objects and functions useful to generate SQL dynamically, in a convenient and safe way.
from psycopg2 import connect, sql
conn = connect("dbname=test user=postgres")
# Columns/values to update and the id of the row to target.
upd = {'name': 'Peter', 'age': 35, 'city': 'London'}
ref_id = 12
# Compose the statement safely: Identifier quotes each column name and
# Placeholder emits a %(name)s-style parameter bound at execute time.
sql_query = sql.SQL("UPDATE people SET {data} WHERE id = {id}").format(
    data=sql.SQL(', ').join(
        sql.Composed([sql.Identifier(k), sql.SQL(" = "), sql.Placeholder(k)]) for k in upd.keys()
    ),
    id=sql.Placeholder('id')
)
# Add the id to the parameter dict so %(id)s can be resolved too.
upd.update(id=ref_id)
with conn:
    with conn.cursor() as cur:
        cur.execute(sql_query, upd)
conn.close()
Running print(sql_query.as_string(conn)) before closing connection will reveal this output:
UPDATE people SET "name" = %(name)s, "age" = %(age)s, "city" = %(city)s WHERE id = %(id)s
No need for dynamic SQL. Supposing a is not nullable and b is nullable.
If you want to update both a and b:
_set = dict(
    id = 1,
    a = 10,
    b = 20, b_update = 1
)
# Static SQL that covers both columns without building it dynamically:
# - a: coalesce keeps the old value when %(a)s is NULL (a is not nullable)
# - b: array trick -- index 1 keeps the old b, index 2 takes the new value,
#   selected by b_update (0 = keep, 1 = set), so b CAN be set to NULL.
update = """
update foo
set
a = coalesce(%(a)s, a), -- a is not nullable
b = (array[b, %(b)s])[%(b_update)s + 1] -- b is nullable
where id = %(id)s
"""
print cur.mogrify(update, _set)
cur.execute(update, _set)
Output:
update foo
set
a = coalesce(10, a), -- a is not nullable
b = (array[b, 20])[1 + 1] -- b is nullable
where id = 1
If you want to update none:
# Same statement when updating neither column: a = None keeps a via
# coalesce, and b_update = 0 selects the old b from the array.
_set = dict(
    id = 1,
    a = None,
    b = 20, b_update = 0
)
Output:
update foo
set
a = coalesce(NULL, a), -- a is not nullable
b = (array[b, 20])[0 + 1] -- b is nullable
where id = 1
An option without python format using psycopg2's AsIs function for column names (although that doesn't prevent you from SQL injection over column names). Dict is named data:
# AsIs splices the raw comma-joined column list into the "(cols) = (vals)"
# form.  NOTE(review): the f-prefix is inert (no {} placeholders); the %s
# markers are psycopg2 placeholders filled by mogrify below.  As the
# surrounding text warns, AsIs gives no injection protection for names.
update_statement = f'UPDATE foo SET (%s) = %s WHERE id_column=%s'
columns = data.keys()
values = [data[column] for column in columns]
query = cur.mogrify(update_statement, (AsIs(','.join(columns)), tuple(values), id_value))
Here's my solution that I have within a generic DatabaseHandler class that provides a lot of flexibility when using pd.DataFrame as your source.
def update_data(
    self,
    table: str,
    df: pd.DataFrame,
    indexes: Optional[list] = None,
    column_map: Optional[dict] = None,
    commit: Optional[bool] = False,
) -> int:
    """Update data in the media database
    Args:
        table (str): the "tablename" or "namespace.tablename"
        df (pandas.DataFrame): dataframe containing the data to update
        indexes (list): the list of columns in the table that will be in the WHERE clause of the update statement.
            If not provided, will use df indexes.
        column_map (dict): dictionary mapping the columns in df to the columns in the table
            columns in the column_map that are also in keys will not be updated
            Key = df column.
            Value = table column.
        commit (bool): if True, the transaction will be committed (default=False)
    Notes:
        If using a column_map, only the columns in the data_map will be updated or used as indexes.
        Order does not matter. If not using a column_map, all columns in df must exist in table.
    Returns:
        int : rows updated
    """
    try:
        if not indexes:
            # Use the dataframe index instead
            indexes = []
            for c in df.index.names:
                if not c:
                    raise Exception(
                        f"Dataframe contains indexes without names. Unable to determine update where clause."
                    )
                indexes.append(c)
        update_strings = []
        # Flatten the index back into columns so rows can be iterated uniformly.
        tdf = df.reset_index()
        if column_map:
            target_columns = [c for c in column_map.keys() if c not in indexes]
        else:
            # No map given: identity-map every dataframe column.
            column_map = {c: c for c in tdf.columns}
            target_columns = [c for c in df.columns if c not in indexes]
        # Render one UPDATE statement per dataframe row; mogrify fills in the
        # values so they arrive correctly quoted.
        for i, r in tdf.iterrows():
            upd_params = ", ".join(
                [f"{column_map[c]} = %s" for c in target_columns]
            )
            # NaN/NaT -> None so missing values render as SQL NULL.
            upd_list = [r[c] if pd.notna(r[c]) else None for c in target_columns]
            upd_str = self._cur.mogrify(upd_params, upd_list).decode("utf-8")
            idx_params = " AND ".join([f"{column_map[c]} = %s" for c in indexes])
            idx_list = [r[c] if pd.notna(r[c]) else None for c in indexes]
            idx_str = self._cur.mogrify(idx_params, idx_list).decode("utf-8")
            update_strings.append(f"UPDATE {table} SET {upd_str} WHERE {idx_str};")
        # All row updates are sent as one multi-statement batch.
        full_update_string = "\n".join(update_strings)
        print(full_update_string) # Debugging
        self._cur.execute(full_update_string)
        rowcount = self._cur.rowcount
        if commit:
            self.commit()
        return rowcount
    except Exception as e:
        # Roll back the whole batch on any failure, then re-raise.
        self.rollback()
        raise e
Example usages:
>>> df = pd.DataFrame([
{'a':1,'b':'asdf','c':datetime.datetime.now()},
{'a':2,'b':'jklm','c':datetime.datetime.now()}
])
>>> cls.update_data('my_table', df, indexes = ['a'])
UPDATE my_table SET b = 'asdf', c = '2023-01-17T22:13:37.095245'::timestamp WHERE a = 1;
UPDATE my_table SET b = 'jklm', c = '2023-01-17T22:13:37.095250'::timestamp WHERE a = 2;
>>> cls.update_data('my_table', df, indexes = ['a','b'])
UPDATE my_table SET c = '2023-01-17T22:13:37.095245'::timestamp WHERE a = 1 AND b = 'asdf';
UPDATE my_table SET c = '2023-01-17T22:13:37.095250'::timestamp WHERE a = 2 AND b = 'jklm';
>>> cls.update_data('my_table', df.set_index('a'), column_map={'a':'db_a','b':'db_b','c':'db_c'} )
UPDATE my_table SET db_b = 'asdf', db_c = '2023-01-17T22:13:37.095245'::timestamp WHERE db_a = 1;
UPDATE my_table SET db_b = 'jklm', db_c = '2023-01-17T22:13:37.095250'::timestamp WHERE db_a = 2;
Note however that this is not safe from SQL injection due to the way it generates the where clause.

Categories

Resources