Python variable in long SQL string

What is a safe way to replace the number in the second-to-last line of this SQL query with a variable?
Say my variable is customer_id. Can I use {} in place of 2 and put .format(customer_id) at the end of this string?
unlicensed_query = """
SELECT SUM(x.quantity), SUM(x.quantity * p.list_price)
FROM (
SELECT cu.customer_id, cu.product_id, cu.quantity
FROM csi_usage cu LEFT JOIN csi c
ON cu.customer_id = c.customer_id
AND cu.product_id = c.product_id
WHERE c.product_id IS NULL
AND cu.customer_id = 2) x, product p
WHERE x.product_id = p.id;
"""

As stated by thebjorn, the correct way to do this is to use bound parameters (see the SQLAlchemy tutorial: http://docs.sqlalchemy.org/en/latest/core/tutorial.html#specifying-bound-parameter-behaviors). An example:
from sqlalchemy.sql import text

fully_utilized_query = text("""
SELECT SUM(x.quantity)
FROM (
    SELECT cu.customer_id, cu.product_id, cu.quantity
    FROM csi_usage cu
    JOIN csi c
        ON cu.customer_id = c.customer_id
        AND cu.product_id = c.product_id
        AND cu.quantity = c.licence_qty
    WHERE cu.customer_id = :customer_id) x;
""")

fully_utilized = self.session.execute(
    fully_utilized_query,
    {'customer_id': current_user.customer_id},
).scalar()
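For contrast, a minimal sketch of the same pattern with a plain DB-API cursor (assuming a psycopg2 connection named conn; the surrounding names are illustrative). The driver sends the value separately from the SQL text, so no manual quoting is needed and a hostile customer_id cannot rewrite the query the way a .format() substitution could:
unlicensed_query = """
SELECT SUM(x.quantity), SUM(x.quantity * p.list_price)
FROM (
    SELECT cu.customer_id, cu.product_id, cu.quantity
    FROM csi_usage cu LEFT JOIN csi c
        ON cu.customer_id = c.customer_id
        AND cu.product_id = c.product_id
    WHERE c.product_id IS NULL
        AND cu.customer_id = %(customer_id)s) x, product p
WHERE x.product_id = p.id;
"""
with conn.cursor() as cur:
    # psycopg2 binds the dict value separately; the SQL string never changes
    cur.execute(unlicensed_query, {'customer_id': customer_id})
    total_quantity, total_price = cur.fetchone()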

Related

python peewee: dynamically combine OR and AND clauses

I'd like to AND together two lists of multiple OR clauses over the same table.
The problem with the following code is that the query result is empty. If I filter on just 'indices' or just 'brokers', the result is fine.
...
query = query.join(StockGroupTicker, on=(Ticker.id == StockGroupTicker.ticker))

# indices
if "indices" in filter:
    where_indices = []
    for f in filter["indices"]:
        where_indices.append(StockGroupTicker.stock_index == int(f))
    if len(where_indices):
        query = query.where(peewee.reduce(peewee.operator.or_, where_indices))

# broker
if "brokers" in filter:
    where_broker = []
    for f in filter["brokers"]:
        where_broker.append(StockGroupTicker.stock_index == int(f))
    if len(where_broker):
        query = query.where(peewee.reduce(peewee.operator.or_, where_broker))

return query.distinct()
SQL query (update)
# index and broker
SELECT
DISTINCT `t1`.`id`,
`t1`.`symbol`,
`t1`.`type`,
`t1`.`name`,
`t1`.`sector`,
`t1`.`region`,
`t1`.`primary_exchange`,
`t1`.`currency`,
`t1`.`score`,
`t1`.`last_price`,
`t1`.`last_price_date`,
`t1`.`last_price_check`,
`t1`.`last_stock_split`,
`t1`.`next_earning`,
`t1`.`last_earnings_update`,
`t1`.`disused`,
`t1`.`source`,
`t1`.`source_intraday`,
`t1`.`created`,
`t1`.`modified`,
`t2`.`invest_score` AS `invest_score`
FROM
`ticker` AS `t1`
INNER JOIN `tickerstats` AS `t2` ON
(`t1`.`id` = `t2`.`ticker_id`)
INNER JOIN `stockgroupticker` AS `t3` ON
(`t1`.`id` = `t3`.`ticker_id`)
WHERE
(((((`t1`.`disused` IS NULL)
OR (`t1`.`disused` = 0))
AND (`t2`.`volume_mean_5` > 10000.0))
AND (`t3`.`stock_index_id` = 1))
AND (`t3`.`stock_index_id` = 10)
)
Thanks to @coleifer, the peewee solution is quite simple: I had to use an alias. With a single join, both filters land on the same joined row (the generated SQL above demands t3.stock_index_id = 1 AND t3.stock_index_id = 10 at once, which can never match); aliasing the table gives each group of OR clauses its own join.
if "indices" in filter and filter["indices"]:
query = query.join(
StockGroupTicker, peewee.JOIN.INNER, on=(Ticker.id == StockGroupTicker.ticker)
)
where_indices = []
for f in filter["indices"]:
where_indices.append(StockGroupTicker.stock_index == int(f))
if len(where_indices):
query = query.where(peewee.reduce(peewee.operator.or_, where_indices))
if "brokers" in filter and filter["brokers"]:
BrokerGroupTicker = StockGroupTicker.alias()
query = query.join(
BrokerGroupTicker, peewee.JOIN.INNER, on=(Ticker.id == BrokerGroupTicker.ticker)
)
where_broker = []
for f in filter["brokers"]:
where_broker.append(BrokerGroupTicker.stock_index == int(f))
if len(where_broker):
query = query.where(peewee.reduce(peewee.operator.or_, where_broker))
return query.distinct()
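As an aside, the reduce(operator.or_, ...) call (reached above through peewee's namespace) simply folds the | operator over a list of peewee expressions; a minimal sketch with the stdlib imports spelled out:
from functools import reduce
import operator

clauses = [StockGroupTicker.stock_index == 1,
           StockGroupTicker.stock_index == 10]
combined = reduce(operator.or_, clauses)
# combined is equivalent to:
# (StockGroupTicker.stock_index == 1) | (StockGroupTicker.stock_index == 10)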

read_sql query returns an empty dataframe after I pass parameters as a dict in python pandas

I am trying to parameterize some parts of a SQL query using the dictionary below:
query_params = {
    'target': 'status',
    'date_from': '201712',
    'date_to': '201805',
    'drform_target': 'NPA'
}
sql_data_sample = str("""select *
from table_name
where dt = %(date_to)s
and %(target)s in (%(drform_target)s)
----------------------------------------------------
union all
----------------------------------------------------
(select *,
from table_name
where dt = %(date_from)s
and %(target)s in ('ACT')
order by random() limit 50000);""")
df_data_sample = pd.read_sql(sql_data_sample, con=cnxn, params=query_params)
However, this returns a dataframe with no records at all. I am not sure what the error is, since none is thrown.
df_data_sample.shape
Out[7]: (0, 1211)
The final PostgreSQL query would be:
select *
from table_name
where dt = '201805'
and status in ('NPA')
----------------------------------------------------
union all
----------------------------------------------------
(select *
from table_name
where dt = '201712'
and status in ('ACT')
order by random() limit 50000); -- the random() part is only for running locally, not on the server.
Below is a small sample of data for replication. The original data has more than a million records and 1211 columns:
service_change_3m  service_change_6m  dt      grp_m2     status
 0                 -2                 201805  $50-$75    NPA
 0                  0                 201805  < $25      NPA
 0                 -1                 201805  $175-$200  ACT
 0                  0                 201712  $150-$175  ACT
 0                  0                 201712  $125-$150  ACT
-1                  1                 201805  $50-$75    NPA
Can someone please help me with this?
UPDATE:
Based on a suggestion by @shmee, I am finally using:
target = 'status'
query_params = {
    'date_from': '201712',
    'date_to': '201805',
    'drform_target': 'NPA'
}
sql_data_sample = str("""select *
from table_name
where dt = %(date_to)s
and {0} in (%(drform_target)s)
----------------------------------------------------
union all
----------------------------------------------------
(select *,
from table_name
where dt = %(date_from)s
and {0} in ('ACT')
order by random() limit 50000);""").format(target)
df_data_sample = pd.read_sql(sql_data_sample, con=cnxn, params=query_params)
Yes, I am quite confident that your issue results from trying to set column names in your query via parameter binding (and %(target)s in ('ACT')), as mentioned in the comments.
This results in your query restricting the result set to records where 'status' in ('ACT') (i.e. Is the string 'status' an element of a list containing only the string 'ACT'?). This is, of course, false, hence no record gets selected and you get an empty result.
This should work as expected:
from psycopg2 import sql

col_name = 'status'
table_name = 'public.churn_data'
query_params = {'date_from': '201712',
                'date_to': '201805',
                'drform_target': 'NPA'}

sql_data_sample = """select *
from {0}
where dt = %(date_to)s
and {1} in (%(drform_target)s)
----------------------------------------------------
union all
----------------------------------------------------
(select *
from {0}
where dt = %(date_from)s
and {1} in ('ACT')
order by random() limit 50000);"""

sql_data_sample = sql.SQL(sql_data_sample).format(sql.Identifier(table_name),
                                                  sql.Identifier(col_name))
df_data_sample = pd.read_sql(sql_data_sample, con=cnxn, params=query_params)
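One caveat worth checking if your table name is schema-qualified: sql.Identifier('public.churn_data') quotes the whole string as a single identifier ("public.churn_data"), which PostgreSQL will not resolve as table churn_data in schema public. Since psycopg2 2.8 you can pass the parts separately:
# Renders as "public"."churn_data" instead of "public.churn_data"
sql.Identifier('public', 'churn_data')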

python traverse CTE in a double for loop?

I have two nested for loops. For each row 'A', 'B', 'C' in loop 1, I need to walk the hierarchical tree to find all the parents of a group 'X' in loop 2. This pushes me toward a recursive CTE, where I need to find the path for each row separately; running the CTE once per group id inside the loop is surely not the right approach. I referred to this link, but could not make much of it: Looping hierarchy CTE
Code snippet for the cron job using the Flask framework:
s = select([rt_issues]).\
    where(
        and_(
            rt_issues.c.status != 'Closed',
            rt_issues.c.assigned_to != None
        ))
rs = conn.execute(s)
if rs.rowcount > 0:
    s4 = text('with recursive rec_grp as(select id, parent_id, name, head, 1 as level, array[id] as path_info from groups union all select grp1.id, grp1.parent_id, grp1.name, grp1.head, rc.level + 1, rc.path_info||grp1.id from groups grp1 join rec_grp rc on grp1.id = rc.parent_id) select distinct id, parent_id, name, head, path_info from rec_grp order by id')
    rs4 = conn.execute(s4)
    for r in rs:
        head_list = []
        hierarchical_grps = []
        for rr in rs4:
            if rr['path_info'][0] == r[rt_issues.c.assignee_group]:
                for g in rr['path_info']:
                    hierarchical_grps.append(g)
        hierarchical_grps = list(set(hierarchical_grps))
        send_pending_mail(hierarchical_grps, r['id'])
        print hierarchical_grps, 'hierarchical_grps'
exit(0)
I need to send mail to all the group heads for the assignee_group in the hierarchy for each issue. How can this be achieved, and how do I use the loops correctly? I am using SQLAlchemy Core only, PostgreSQL, and Python with Flask. I need the exact code for this.
What works is the snippet below:
mgroup = None
s = select([rt_issues]).\
    where(
        and_(
            rt_issues.c.status != 'Closed',
            rt_issues.c.assigned_to != None
        ))
rs = conn.execute(s)
if rs.rowcount > 0:
    for r in rs:
        head_list = []
        hierarchical_grps = []
        mgroup = r[rt_issues.c.assignee_group]
        s4 = text('with recursive rec_grp as(select id, parent_id, name, head, 1 as level, array[id] as path_info from groups where id=' + str(mgroup) + ' union all select grp1.id, grp1.parent_id, grp1.name, grp1.head, rc.level + 1, rc.path_info||grp1.id from groupsgrp1 join rec_grp rc on grp1.id = rc.parent_id) select distinct id, parent_id, name, head, path_info from rec_grp order by id')
        rs4 = conn.execute(s4)
        for rr in rs4:
            if rr['path_info'][0] == r[rt_issues.c.assignee_group]:
                for g in rr['path_info']:
                    hierarchical_grps.append(g)
        hierarchical_grps = list(set(hierarchical_grps))
        print hierarchical_grps, 'hierarchical_grps'
        send_pending_mail(hierarchical_grps, r['id'])
exit(0)
Assuming that the head column is boolean, this will collect the groups with the head flag set:
rs4 = conn.execute(s4)
for rr in rs4:
    if rr['head']:
        head_list.append(rr['id'])
print 'group heads:', head_list
This is assuming the query from your second example is used (note the correction to the from clause, "from groupsgrp1" should be "from groups grp1"):
WITH RECURSIVE rec_grp AS (
SELECT
id,
parent_id,
name,
head,
1 AS level,
ARRAY [id] AS path_info
FROM groups
WHERE id = 4
UNION ALL
SELECT
grp1.id,
grp1.parent_id,
grp1.name,
grp1.head,
rc.level + 1,
rc.path_info || grp1.id
FROM groups grp1
JOIN rec_grp rc ON grp1.id = rc.parent_id
)
SELECT DISTINCT
id,
parent_id,
name,
head,
path_info
FROM rec_grp
ORDER BY id;
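For completeness, a sketch of the per-issue query with a bound parameter instead of string concatenation (same conn and mgroup names as in the snippets above); this sidesteps both the missing-space bug around 'union all' and SQL injection:
from sqlalchemy import text

s4 = text("""
    with recursive rec_grp as (
        select id, parent_id, name, head, 1 as level,
               array[id] as path_info
        from groups
        where id = :group_id
        union all
        select grp1.id, grp1.parent_id, grp1.name, grp1.head,
               rc.level + 1, rc.path_info || grp1.id
        from groups grp1
        join rec_grp rc on grp1.id = rc.parent_id
    )
    select distinct id, parent_id, name, head, path_info
    from rec_grp
    order by id
""")
rs4 = conn.execute(s4, {'group_id': mgroup})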

Build a dynamic update query in psycopg2

I have to construct a dynamic update query for PostgreSQL.
It's dynamic because I have to determine beforehand which columns to update.
Given a sample table:
create table foo (id int, a int, b int, c int)
Then I construct the "set" clause programmatically:
_set = {}
_set['a'] = 10
_set['c'] = None
After that I have to build the update query, and here I'm stuck.
I have to construct this SQL update command:
update foo set a = 10, c = NULL where id = 1
How can I do this with a parametrized psycopg2 command (i.e. looping through the dict if it is not empty and building the set clause)?
UPDATE
While I was sleeping, I found the solution myself. It is dynamic, exactly how I wanted it to be :-)
create table foo (id integer, a integer, b integer, c varchar)
updates = {}
updates['a'] = 10
updates['b'] = None
updates['c'] = 'blah blah blah'
sql = "update foo set %s where id = %s" % (', '.join("%s = %%s" % u for u in updates.keys()), 10)
params = updates.values()
print cur.mogrify(sql, params)
cur.execute(sql, params)
And the result is exactly what I needed (especially the NULL-able and quoted columns):
"update foo set a = 10, c = 'blah blah blah', b = NULL where id = 10"
There is actually a slightly cleaner way to do it, using the alternative column-list syntax:
sql_template = "UPDATE foo SET ({}) = %s WHERE id = {}"
sql = sql_template.format(', '.join(updates.keys()), 10)
params = (tuple(updates.values()),)
print cur.mogrify(sql, params)
cur.execute(sql, params)
Using psycopg2.sql – SQL string composition module
The module contains objects and functions useful to generate SQL dynamically, in a convenient and safe way.
from psycopg2 import connect, sql

conn = connect("dbname=test user=postgres")
upd = {'name': 'Peter', 'age': 35, 'city': 'London'}
ref_id = 12
sql_query = sql.SQL("UPDATE people SET {data} WHERE id = {id}").format(
    data=sql.SQL(', ').join(
        sql.Composed([sql.Identifier(k), sql.SQL(" = "), sql.Placeholder(k)])
        for k in upd.keys()
    ),
    id=sql.Placeholder('id')
)
upd.update(id=ref_id)
with conn:
    with conn.cursor() as cur:
        cur.execute(sql_query, upd)
conn.close()
Running print(sql_query.as_string(conn)) before closing the connection will reveal this output:
UPDATE people SET "name" = %(name)s, "age" = %(age)s, "city" = %(city)s WHERE id = %(id)s
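Note that with conn: only wraps the block in a transaction, committing on success and rolling back on an exception; it does not close the connection, which is why the explicit conn.close() is still needed.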
No need for dynamic SQL. Supposing a is not nullable and b is nullable: coalesce(%(a)s, a) keeps the stored a whenever the bound parameter is NULL, while (array[b, %(b)s])[%(b_update)s + 1] builds a two-element array of the old and new b and indexes into it, so %(b_update)s = 0 keeps the old value and 1 takes the new one.
If you want to update both a and b:
_set = dict(
    id=1,
    a=10,
    b=20, b_update=1
)
update = """
update foo
set
    a = coalesce(%(a)s, a), -- a is not nullable
    b = (array[b, %(b)s])[%(b_update)s + 1] -- b is nullable
where id = %(id)s
"""
print cur.mogrify(update, _set)
cur.execute(update, _set)
Output:
update foo
set
    a = coalesce(10, a), -- a is not nullable
    b = (array[b, 20])[1 + 1] -- b is nullable
where id = 1
If you want to update neither column:
_set = dict(
    id=1,
    a=None,
    b=20, b_update=0
)
Output:
update foo
set
    a = coalesce(NULL, a), -- a is not nullable
    b = (array[b, 20])[0 + 1] -- b is nullable
where id = 1
An option without Python string formatting, using psycopg2's AsIs adapter for the column names (although this does not protect you from SQL injection via the column names). The dict is named data:
from psycopg2.extensions import AsIs

update_statement = 'UPDATE foo SET (%s) = %s WHERE id_column = %s'
columns = data.keys()
values = [data[column] for column in columns]
query = cur.mogrify(update_statement, (AsIs(','.join(columns)), tuple(values), id_value))
Here's my solution, which lives in a generic DatabaseHandler class; it provides a lot of flexibility when using a pd.DataFrame as your source.
def update_data(
    self,
    table: str,
    df: pd.DataFrame,
    indexes: Optional[list] = None,
    column_map: Optional[dict] = None,
    commit: Optional[bool] = False,
) -> int:
    """Update data in the media database

    Args:
        table (str): the "tablename" or "namespace.tablename"
        df (pandas.DataFrame): dataframe containing the data to update
        indexes (list): the list of columns in the table that will be in the
            WHERE clause of the update statement. If not provided, the df
            indexes are used.
        column_map (dict): dictionary mapping the columns in df to the columns
            in the table. Columns in the column_map that are also in indexes
            will not be updated. Key = df column. Value = table column.
        commit (bool): if True, the transaction will be committed (default=False)

    Notes:
        If using a column_map, only the columns in the column_map will be
        updated or used as indexes. Order does not matter. If not using a
        column_map, all columns in df must exist in the table.

    Returns:
        int: rows updated
    """
    try:
        if not indexes:
            # Use the dataframe index instead
            indexes = []
            for c in df.index.names:
                if not c:
                    raise Exception(
                        "Dataframe contains indexes without names. "
                        "Unable to determine update where clause."
                    )
                indexes.append(c)
        update_strings = []
        tdf = df.reset_index()
        if column_map:
            target_columns = [c for c in column_map.keys() if c not in indexes]
        else:
            column_map = {c: c for c in tdf.columns}
            target_columns = [c for c in df.columns if c not in indexes]
        for i, r in tdf.iterrows():
            upd_params = ", ".join(
                [f"{column_map[c]} = %s" for c in target_columns]
            )
            upd_list = [r[c] if pd.notna(r[c]) else None for c in target_columns]
            upd_str = self._cur.mogrify(upd_params, upd_list).decode("utf-8")
            idx_params = " AND ".join([f"{column_map[c]} = %s" for c in indexes])
            idx_list = [r[c] if pd.notna(r[c]) else None for c in indexes]
            idx_str = self._cur.mogrify(idx_params, idx_list).decode("utf-8")
            update_strings.append(f"UPDATE {table} SET {upd_str} WHERE {idx_str};")
        full_update_string = "\n".join(update_strings)
        print(full_update_string)  # Debugging
        self._cur.execute(full_update_string)
        rowcount = self._cur.rowcount
        if commit:
            self.commit()
        return rowcount
    except Exception as e:
        self.rollback()
        raise e
Example usages:
>>> df = pd.DataFrame([
...     {'a': 1, 'b': 'asdf', 'c': datetime.datetime.now()},
...     {'a': 2, 'b': 'jklm', 'c': datetime.datetime.now()}
... ])
>>> cls.update_data('my_table', df, indexes = ['a'])
UPDATE my_table SET b = 'asdf', c = '2023-01-17T22:13:37.095245'::timestamp WHERE a = 1;
UPDATE my_table SET b = 'jklm', c = '2023-01-17T22:13:37.095250'::timestamp WHERE a = 2;
>>> cls.update_data('my_table', df, indexes = ['a','b'])
UPDATE my_table SET c = '2023-01-17T22:13:37.095245'::timestamp WHERE a = 1 AND b = 'asdf';
UPDATE my_table SET c = '2023-01-17T22:13:37.095250'::timestamp WHERE a = 2 AND b = 'jklm';
>>> cls.update_data('my_table', df.set_index('a'), column_map={'a':'db_a','b':'db_b','c':'db_c'} )
UPDATE my_table SET db_b = 'asdf', db_c = '2023-01-17T22:13:37.095245'::timestamp WHERE db_a = 1;
UPDATE my_table SET db_b = 'jklm', db_c = '2023-01-17T22:13:37.095250'::timestamp WHERE db_a = 2;
Note however that this is not safe from SQL injection due to the way it generates the where clause.

Sqlalchemy: How to perform outer join with itself?

I want to perform this SQL query using SQLAlchemy (with model Evaluation):
select e1.user, sum(e1.points) as s from
    (select e1.*
     from evaluations e1 left outer join evaluations e2
         on (e1.user = e2.user and e1.module = e2.module and e1.time < e2.time)
     where e2.user is null and e1.module in (__another subquery__)) e1
group by e1.user order by s limit 5
I don't know how to perform the left outer join (especially the renaming and referencing of renamed columns). Could you help me?
from sqlalchemy import and_, func
from sqlalchemy.orm import aliased

# sample sub-query for testing
_another_query = session.query(Evaluation.module).filter(Evaluation.module > 3)

# define aliases
E1 = aliased(Evaluation, name="e1")
E2 = aliased(Evaluation, name="e2")

# inner query
sq = (
    session
    # .query(E1)
    # select columns explicitly to control labels
    .query(E1.user.label("user"), E1.points.label("points"))
    .outerjoin(E2, and_(
        E1.user == E2.user,
        E1.module == E2.module,
        E1.time < E2.time,
    ))
    .filter(E2.user == None)
    .filter(E1.module.in_(_another_query))
)
sq = sq.subquery(name="sq")

# now let's group by
q = (
    session
    .query(sq.c.user, func.sum(sq.c.points))
    .group_by(sq.c.user)
    .order_by(func.sum(sq.c.points))
    .limit(5)
)
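A short usage sketch; each result row is a (user, points_sum) tuple. Note that .filter(E2.user == None) is the SQLAlchemy spelling of the e2.user IS NULL anti-join test (E2.user.is_(None) is an equivalent form that linters prefer):
for user, total_points in q.all():
    print(user, total_points)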
