min value of a column in dataframe - python

I have the following code:
from __future__ import division
import pyodbc
import csv
import pandas as pd
import numpy as np

count = 1
dsn = "DRIVER={SQL Server};server=XXXX;database=ABCD"
conn = pyodbc.connect(dsn)
cursor = conn.cursor()

#policy = cursor.execute("select distinct client_num, py_type, py_num, max(ex_date_n) as ex_date_n, max(ef_date_n) as ef_date_n from dbo.policy group by client_num, py_type, py_num")
policy = cursor.execute("select distinct client_num, py_type, py_num, max(ex_date_n) as ex_date_n,max(ef_date_n) as ef_date_n from dbo.policy where client_num = 62961 and py_type = 'A' and py_num = '01' group by client_num, py_type, py_num")
results1 = cursor.fetchall()
for row in results1:
    pol_client_num = row.client_num.strip()
    pol_py_type = row.py_type.strip()
    pol_py_num = row.py_num.strip()
    pol_number = pol_client_num + pol_py_type + pol_py_num
    pol_exp_date = row.ex_date_n
    pol_eff_date = row.ef_date_n
    #related = cursor.execute("select distinct a.client_num,a.py_type,a.py_num,a.rclient_num,a.py_rtype,a.rpy_num from policy_replace a where a.client_num = "+pol_client_num+" and a.py_type = '"+pol_py_type+"' and a.py_num = '"+pol_py_num+"'")
    related = cursor.execute("select distinct a.client_num,a.py_type,a.py_num,a.rclient_num,a.py_rtype,a.rpy_num from policy_replace a where a.client_num = 62961 and a.py_type = 'A' and a.py_num = 01")
    results2 = cursor.fetchall()
    for row in results2:
        rel_client_num = row.rclient_num.strip()
        rel_py_type = row.py_rtype.strip()
        rel_py_num = row.rpy_num.strip()
        rel_pol_number = rel_client_num + rel_py_type + rel_py_num
        related_dates = cursor.execute("select max(b.ex_date_n) as ex_date_n, b.ef_date_n from policy b where b.ex_date_n >= 20200225 and b.client_num = "+rel_client_num+" and b.py_type = '"+rel_py_type+"' and b.py_num = '"+rel_py_num+"' group by b.ef_date_n")
        #related_dates = cursor.execute("select max(b.ex_date_n) as ex_date_n, b.ef_date_n from policy b where b.client_num = 37916 and b.py_type = 'F' and b.py_num = 05 group by b.ef_date_n")
        results3 = cursor.fetchall()
        for row in results3:
            rel_exp_date = row.ex_date_n
            rel_eff_date = row.ef_date_n
            final_result1 = (pol_number, pol_exp_date, pol_eff_date, rel_pol_number, rel_exp_date, rel_eff_date)
            df = pd.DataFrame(final_result1)
            df = df.transpose()
            df.columns = ['pol_number','pol_exp_date','pol_eff_date','rel_pol_number','rel_exp_date','rel_eff_date']
            df_grouped = df.groupby('pol_number')['rel_exp_date'].min()
            print(df_grouped)
print('done')
On execution, the following is the data output:
For results1,
'62961 ', 'A', '01', 20210429, 20200429
For results2,
('62961 ', 'A', '01', '62961', 'A', '02'),
('62961 ', 'A', '01', '62961', 'A', '03'),
('62961 ', 'A', '01', '63832', 'A', '01')
For results3,
[(20201107, 20191107)]
[(20210407, 20200407)]
[(20200719, 20190719)]
The expected output is as follows:
'69621','A','01',20210429,20200429,'69621','A','02',20201107,20191107,'62961','A','03',20210407,20200407,'63832','A','01',20200719,20190719,'63832','A','01',20200719,20190719
The format of the required output is:
for every row in results1 --- every related row in results2 --- every related date in results3 --- minimum of the exp_date across all rows and the corresponding pol_num/rel_pol_num
This is the reason I am trying to use the df.min() function to get the min across all the exp_dates. But it doesn't seem to do the job, so I am possibly missing something. I also tried axis=0 as mentioned in the comments, but it didn't work. Any direction is appreciated.
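A minimal sketch of one way to get a single minimum per pol_number (not the original code; it assumes every final_result1 tuple is first collected into a list inside the innermost loop rather than rebuilding the DataFrame on every iteration):

all_rows = []
# ... inside the innermost "for row in results3" loop, instead of building a DataFrame each time:
#     all_rows.append(final_result1)

# After all three loops have finished, build one DataFrame from every accumulated row
df = pd.DataFrame(all_rows, columns=['pol_number', 'pol_exp_date', 'pol_eff_date',
                                     'rel_pol_number', 'rel_exp_date', 'rel_eff_date'])
df_grouped = df.groupby('pol_number')['rel_exp_date'].min()   # one min exp_date per policy
print(df_grouped)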

Related

Get all data API when inputs are empty

I created my first API where I can get data from my tables in BigQuery.
I can get all the data I need based on the two inputs below, but I am also trying to get the whole table when the inputs are empty, which I cannot do.
Thanks for your help
@app.route("/tracking", methods=['GET'])
def tracking_data():
    haulier_id_tracking = request.args.get('haulier_id_tracking')
    month_tracking = request.args.get('month_tracking')
    query_job = bq_client.query("""
        WITH t AS (
            SELECT *
            FROM mart.monthly_vehicle_stats
            WHERE dt_fr_month = '{month_tracking}-01' AND (haulier_id_tracking = '{haulier_id_tracking}'))
        SELECT TO_JSON_STRING(STRUCT(ARRAY_AGG(STRUCT(dt_fr_month, haulier_id_tracking, vehicle_id , nb_days_tracked,
               data_access, date_first_camp, invoiced)) AS data)) json
        FROM t
        """.format(month_tracking = month_tracking, haulier_id_tracking = haulier_id_tracking))
    for row in query_job:
        return json.loads(row["json"])
You can remove the where clause when the input is empty like this:
@app.route("/tracking", methods=['GET'])
def tracking_data():
    haulier_id_tracking = request.args.get('haulier_id_tracking')
    month_tracking = request.args.get('month_tracking')
    where_clause = ''
    if haulier_id_tracking != '' and month_tracking != '':
        where_clause = f"WHERE dt_fr_month = '{month_tracking}-01' AND (haulier_id_tracking = '{haulier_id_tracking}')"
    query_job = bq_client.query(f"""
        WITH t AS (
            SELECT * FROM mart.monthly_vehicle_stats {where_clause})
        SELECT TO_JSON_STRING(STRUCT(ARRAY_AGG(STRUCT(dt_fr_month, haulier_id_tracking, vehicle_id , nb_days_tracked,
               data_access, date_first_camp, invoiced)) AS data)) json
        FROM t
        """)
    for row in query_job:
        return json.loads(row["json"])
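A variation of the same idea that avoids interpolating the request values into the SQL string, using BigQuery query parameters (a sketch, not the original answer; the @month/@haulier parameter names are made up and the parameter types must match the actual column types):

from google.cloud import bigquery

@app.route("/tracking", methods=['GET'])
def tracking_data():
    # request.args.get returns None when the argument is absent, so a truthiness
    # check covers both missing and empty inputs
    haulier_id_tracking = request.args.get('haulier_id_tracking')
    month_tracking = request.args.get('month_tracking')

    where_clause = ''
    params = []
    if haulier_id_tracking and month_tracking:
        where_clause = "WHERE dt_fr_month = @month AND haulier_id_tracking = @haulier"
        params = [
            bigquery.ScalarQueryParameter("month", "DATE", f"{month_tracking}-01"),
            bigquery.ScalarQueryParameter("haulier", "STRING", haulier_id_tracking),
        ]

    query_job = bq_client.query(f"""
        WITH t AS (
            SELECT * FROM mart.monthly_vehicle_stats {where_clause})
        SELECT TO_JSON_STRING(STRUCT(ARRAY_AGG(STRUCT(dt_fr_month, haulier_id_tracking, vehicle_id,
               nb_days_tracked, data_access, date_first_camp, invoiced)) AS data)) json
        FROM t
        """, job_config=bigquery.QueryJobConfig(query_parameters=params))
    for row in query_job:
        return json.loads(row["json"])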

Return Missing Rows from Python SQL Query

Is there any way I can compare two different databases (PostgreSQL, SQL Server) and return the missing rows? I am missing one row in the PostgreSQL table that is present in the SQL Server one, and I have no clue how to return that answer.
I have two connections open, one for PostgreSQL (bpo_table_results) and one for SQL Server (rps_table_results).
postgresql table:
date count amount
1/1/21 500 1,234,654.12
sql server table:
date count amount
1/1/21 500 1,234,654.12
1/2/21 4541 3,457,787.24
expected results:
The row in the amount of 3,457,787.24 is missing from your postgresql table.
code:
def queryRPS(sql_server_conn, sql_server_cursor):
    rps_item_count_l = []
    rps_icl_amt_l = []
    rps_table_q_2 = f"""select * from rps..sendfile where processingdate = '{cd}' and datasetname like '%ICL%' """
    rps_table_results = sql_server_cursor.execute(rps_table_q_2).fetchall()
    for row in rps_table_results:
        rps_item_count = row[16]
        rps_item_count_l.append(rps_item_count)
        rps_icl_amt = row[18]
        rps_icl_amt_l.append(rps_icl_amt)

def queryBPO(postgres_conn, postgres_cursor, rps_item_count_l, rps_icl_amt_l):
    bpo_results_l = []
    rps_results_l = []
    for rps_count, rps_amount in zip(rps_item_count_l, rps_icl_amt_l):
        rps_amount_f = str(rps_amount).rstrip('0')
        rps_amount_f = ("{:,}".format(float(rps_amount_f)))
        bpo_icl_awk_q_2 = """select * from ppc_data.icl_awk where num_items = '%s' and
            file_total = '%s' """ % (str(rps_count), str(rps_amount_f))
        postgres_cursor.execute(bpo_icl_awk_q_2)
        bpo_table_results = postgres_cursor.fetchall()

rps_table_q_2 = f"""select * from rps..sendfile where processingdate = '{cd}' and datasetname like '%ICL%' """
rps_table_results = sql_server_cursor.execute(rps_table_q_2).fetchall()

rps_item_count_l, rps_icl_amt_l = queryRPS(sql_server_conn, sql_server_cursor)
queryBPO(postgres_conn, postgres_cursor, rps_item_count_l, rps_icl_amt_l)
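One common way to report rows that exist in one result set but not the other is an anti-join, for example with pandas (a sketch, not part of the original post; the column names and the assumption that both queries return comparable date/count/amount columns are placeholders):

import pandas as pd

# Hypothetical frames built from the two cursors' fetchall() results
sql_server_df = pd.DataFrame(rps_table_results, columns=['date', 'count', 'amount'])
postgres_df = pd.DataFrame(bpo_table_results, columns=['date', 'count', 'amount'])

# Left anti-join: keep SQL Server rows that have no match in PostgreSQL
merged = sql_server_df.merge(postgres_df, how='left', on=['date', 'count', 'amount'], indicator=True)
missing = merged[merged['_merge'] == 'left_only'].drop(columns='_merge')

for _, row in missing.iterrows():
    print(f"The row in the amount of {row['amount']} is missing from your postgresql table.")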

How to pass value of variable into another text file?

I have text files which contain SQL queries. After running one file, "tb_exec_ns_call_pln.txt", I get two dates, e.g. 2018-12-29 and 2019-03-29.
I want to pass these dates into another text file (tb_exec_ns_call_actvty.txt) using Python. That text file contains the query below:
SELECT a.nm as cycle_nm,
a.start_dt as cycle_start_dt,
a.end_dt as cycle_end_dt,
a.terr as territory,sales_drctn,
x_rating1,
c.jnj_id as jnj_id,
c.prsn_first_nm,
c.prsn_last_nm,
plnnd_calls as rep_goal
FROM eureka.cycle_plan a, eureka.cycle_plan_trgt b, eureka.acct c
WHERE
a.id = b.cycle_plan
and b.acct = c.id
and b.del_flg = 'N'
***and start_dt >= '2018-12-29'***
***and end_dt <= '2019-03-29'***
and substring(a.terr,1,6) in ('106-KS','106-PI','106-VO')
and a.status = 'In_Progress_vod'
and a.del_flg = 'N'
and c.del_flg = 'N' and plnnd_calls > 0
I have also written a Python script. Please guide me on how to pass the values.
path = "D:/Users/SPate233/Downloads/NS dashboard/tb_exec_ns_call_pln.txt"
sql_query_file = open(path, 'r')
sql_query1 = sql_query_file.read()
cur.execute(sql_query1)
res = cur.fetchall()
print(res)
print(type(res))
for val in res:
    print(val[1])
    print(val[2])
One approach is to have a placeholder string hardcoded into tb_exec_ns_call_actvty.txt and then use str.replace to fill in the required info.
Ex:
SELECT a.nm as cycle_nm,
a.start_dt as cycle_start_dt,
a.end_dt as cycle_end_dt,
a.terr as territory,sales_drctn,
x_rating1,
c.jnj_id as jnj_id,
c.prsn_first_nm,
c.prsn_last_nm,
plnnd_calls as rep_goal
FROM eureka.cycle_plan a, eureka.cycle_plan_trgt b, eureka.acct c
WHERE
a.id = b.cycle_plan
and b.acct = c.id
and b.del_flg = 'N'
and start_dt >= 'START_DT'
and end_dt <= 'END_DT'
and substring(a.terr,1,6) in ('106-KS','106-PI','106-VO')
and a.status = 'In_Progress_vod'
and a.del_flg = 'N'
and c.del_flg = 'N' and plnnd_calls > 0
In code:
path = "D:/Users/SPate233/Downloads/NS dashboard/tb_exec_ns_call_pln.txt"
with open(path) as sql_query_file:
    sql_query1 = sql_query_file.read()

sql_query1 = sql_query1.replace("START_DT", '2018-12-29').replace("END_DT", '2019-03-29')
cur.execute(sql_query1)
res = cur.fetchall()
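If the database driver behind cur supports parameter binding, a further variation (not from the original answer; the placeholder syntax depends on the driver's paramstyle, %(name)s is shown here, and the template path is assumed) is to keep named placeholders in tb_exec_ns_call_actvty.txt and let execute substitute the dates pulled from the first query:

# Template would contain:  and start_dt >= %(start_dt)s  /  and end_dt <= %(end_dt)s
with open("D:/Users/SPate233/Downloads/NS dashboard/tb_exec_ns_call_actvty.txt") as f:
    sql_query2 = f.read()

# res comes from running tb_exec_ns_call_pln.txt; val[1] and val[2] held the two dates
start_dt, end_dt = res[0][1], res[0][2]
cur.execute(sql_query2, {"start_dt": start_dt, "end_dt": end_dt})
rows = cur.fetchall()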

Pandas DataFrame row items IF comparison doesn't return correct result

I retrieve data from Quandl and load it into a pandas DataFrame object.
Afterwards I calculate SMA values (SMA21, SMA55) based on "Last Price" and add those SMA values as columns to my DataFrame.
I iterate through the DataFrame to catch a buy signal.
I know the buy condition holds true for some dates, but my code does not print anything out. I am expecting it to print the buy condition at the very least.
Below you can see the condition in question:
kitem['SMA21'] >= kitem['Last']
My code:
import requests
import pandas as pd
import json

class URL_Params:
    def __init__(self, endPoint, symboll, startDate, endDate, apiKey):
        self.endPoint = endPoint
        self.symboll = symboll
        self.startDate = startDate
        self.endDate = endDate
        self.apiKey = apiKey

    def createURL(self):
        return self.endPoint + self.symboll + '?start_date=' + self.startDate + '&end_date=' + self.endDate + '&api_key=' + self.apiKey

    def add_url(self, _url):
        self.url_list

my_portfolio = {'BTC':1.0, 'XRP':0, 'DSH':0, 'XMR':0, 'TotalBTCValue':1.0}
_endPoint = 'https://www.quandl.com/api/v3/datasets/BITFINEX/'
_symbolls = ['BTCEUR','XRPBTC','DSHBTC','IOTBTC','XMRBTC']
_startDate = '2017-01-01'
_endDate = '2019-03-01'
_apiKey = '' #needs to be set for quandl

my_data = {}
my_conns = {}
my_col_names = ['Date', 'High', 'Low', 'Mid', 'Last', 'Bid', 'Ask', 'Volume']
orderbook = []

#create connection and load data for each pair/market.
#load them in a dict for later use
for idx_symbol in _symbolls:
    my_url_params = URL_Params(_endPoint,idx_symbol,_startDate,_endDate,_apiKey)
    response = requests.get(my_url_params.createURL())
    my_data[idx_symbol] = json.loads(response.text)

#Prepare Data
my_raw_data_df_xrpbtc = pd.DataFrame(my_data['XRPBTC']['dataset']['data'], columns= my_data['XRPBTC']['dataset']['column_names'])

#Set Index to Date Column and Sort
my_raw_data_df_xrpbtc['Date'] = pd.to_datetime(my_raw_data_df_xrpbtc['Date'])
my_raw_data_df_xrpbtc.index = my_raw_data_df_xrpbtc['Date']
my_raw_data_df_xrpbtc = my_raw_data_df_xrpbtc.sort_index()

#Drop unrelated columns
my_raw_data_df_xrpbtc.drop(['Date'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Ask'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Bid'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Low'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['High'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Mid'], axis=1, inplace=True)

#Calculate SMA values to create buy-sell signal
my_raw_data_df_xrpbtc['SMA21'] = my_raw_data_df_xrpbtc['Last'].rolling(21).mean()
my_raw_data_df_xrpbtc['SMA55'] = my_raw_data_df_xrpbtc['Last'].rolling(55).mean()
my_raw_data_df_xrpbtc['SMA200'] = my_raw_data_df_xrpbtc['Last'].rolling(200).mean()

#Check for each day if buy signal holds BUY if sell signal holds SELL
for idx,kitem in my_raw_data_df_xrpbtc.iterrows():
    if (kitem['SMA21'] >= kitem['Last']) is True: #buy signal
        print("buy0")
        if my_portfolio['BTC'] > 0 is True:
            print("buy1")
    if (kitem['Last'] * my_portfolio['XRP']) >= (my_portfolio['BTC'] * 1.05) is True: #sell signal
        print("sell0")
        if my_portfolio['XRP'] > 0 is True:
            print("sell1")
I know that there are lots of rows where the condition holds true, but my code never enters this path, so it does not print out what I expect.
Could anyone please comment on what might be wrong?
The reason is that your comparison is wrong. The result of kitem['SMA21'] >= kitem['Last'] will be a numpy.bool_. When you use is to compare it to True this will fail as it is not the same object.
If you change the comparison to == it will work as expected:
if (kitem['SMA21'] >= kitem['Last']) == True:
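A follow-up note on the same point (not part of the original answer): the comparison against True can also be dropped entirely, since the boolean result is already usable in an if statement. The nested checks like my_portfolio['BTC'] > 0 is True have the same problem in a different form: Python parses them as chained comparisons, (my_portfolio['BTC'] > 0) and (0 is True), which is always False.

# Equivalent conditions without the is/== comparison
if kitem['SMA21'] >= kitem['Last']:   # buy signal
    print("buy0")
    if my_portfolio['BTC'] > 0:
        print("buy1")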

Build a dynamic update query in psycopg2

I have to construct a dynamic update query for postgresql.
It's dynamic, because beforehand I have to determine which columns to update.
Given a sample table:
create table foo (id int, a int, b int, c int)
Then I will programmatically construct the "set" clause:
_set = {}
_set['a'] = 10
_set['c'] = NULL
After that I have to build the update query. And here I'm stuck.
I have to construct this SQL UPDATE command:
update foo set a = 10, b = NULL where id = 1
How can I do this with a psycopg2 parametrized command (i.e. looping through the dict if it is not empty and building the set clause)?
UPDATE
While I was sleeping I found the solution myself. It is dynamic, exactly how I wanted it to be :-)
create table foo (id integer, a integer, b integer, c varchar)
updates = {}
updates['a'] = 10
updates['b'] = None
updates['c'] = 'blah blah blah'
sql = "upgrade foo set %s where id = %s" % (', '.join("%s = %%s" % u for u in updates.keys()), 10)
params = updates.values()
print cur.mogrify(sql, params)
cur.execute(sql, params)
And the result is exactly what I needed (especially the handling of nullable and quoted columns):
"update foo set a = 10, c = 'blah blah blah', b = NULL where id = 10"
There is actually a slightly cleaner way to do it, using the alternative column-list syntax:
sql_template = "UPDATE foo SET ({}) = %s WHERE id = {}"
sql = sql_template.format(', '.join(updates.keys()), 10)
params = (tuple(updates.values()),)
print cur.mogrify(sql, params)
cur.execute(sql, params)
Using psycopg2.sql – SQL string composition module
The module contains objects and functions useful to generate SQL dynamically, in a convenient and safe way.
from psycopg2 import connect, sql
conn = connect("dbname=test user=postgres")
upd = {'name': 'Peter', 'age': 35, 'city': 'London'}
ref_id = 12
sql_query = sql.SQL("UPDATE people SET {data} WHERE id = {id}").format(
    data=sql.SQL(', ').join(
        sql.Composed([sql.Identifier(k), sql.SQL(" = "), sql.Placeholder(k)]) for k in upd.keys()
    ),
    id=sql.Placeholder('id')
)
upd.update(id=ref_id)
with conn:
    with conn.cursor() as cur:
        cur.execute(sql_query, upd)
conn.close()
Running print(sql_query.as_string(conn)) before closing connection will reveal this output:
UPDATE people SET "name" = %(name)s, "age" = %(age)s, "city" = %(city)s WHERE id = %(id)s
No need for dynamic SQL. Supposing a is not nullable and b is nullable.
If you want to update both a and b:
_set = dict(
    id = 1,
    a = 10,
    b = 20, b_update = 1
)
update = """
update foo
set
a = coalesce(%(a)s, a), -- a is not nullable
b = (array[b, %(b)s])[%(b_update)s + 1] -- b is nullable
where id = %(id)s
"""
print cur.mogrify(update, _set)
cur.execute(update, _set)
Output:
update foo
set
a = coalesce(10, a), -- a is not nullable
b = (array[b, 20])[1 + 1] -- b is nullable
where id = 1
If you want to update none:
_set = dict(
    id = 1,
    a = None,
    b = 20, b_update = 0
)
Output:
update foo
set
a = coalesce(NULL, a), -- a is not nullable
b = (array[b, 20])[0 + 1] -- b is nullable
where id = 1
An option without Python string formatting, using psycopg2's AsIs function for the column names (although that does not protect you from SQL injection via the column names). The dict is named data:
update_statement = f'UPDATE foo SET (%s) = %s WHERE id_column=%s'
columns = data.keys()
values = [data[column] for column in columns]
query = cur.mogrify(update_statement, (AsIs(','.join(columns)), tuple(values), id_value))
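For completeness, a hypothetical call site for the snippet above (the data dict, id_value, and the open connection are assumptions, not part of the original answer):

from psycopg2.extensions import AsIs

data = {'a': 10, 'b': None}
id_value = 1
update_statement = 'UPDATE foo SET (%s) = %s WHERE id_column = %s'
columns = data.keys()
values = [data[column] for column in columns]
# mogrify renders the final statement; the AsIs column list is NOT escaped
query = cur.mogrify(update_statement, (AsIs(','.join(columns)), tuple(values), id_value))
cur.execute(query)
conn.commit()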
Here's my solution that I have within a generic DatabaseHandler class that provides a lot of flexibility when using pd.DataFrame as your source.
def update_data(
    self,
    table: str,
    df: pd.DataFrame,
    indexes: Optional[list] = None,
    column_map: Optional[dict] = None,
    commit: Optional[bool] = False,
) -> int:
    """Update data in the media database

    Args:
        table (str): the "tablename" or "namespace.tablename"
        df (pandas.DataFrame): dataframe containing the data to update
        indexes (list): the list of columns in the table that will be in the WHERE clause of the update statement.
            If not provided, will use df indexes.
        column_map (dict): dictionary mapping the columns in df to the columns in the table
            columns in the column_map that are also in keys will not be updated
            Key = df column.
            Value = table column.
        commit (bool): if True, the transaction will be committed (default=False)

    Notes:
        If using a column_map, only the columns in the data_map will be updated or used as indexes.
        Order does not matter. If not using a column_map, all columns in df must exist in table.

    Returns:
        int : rows updated
    """
    try:
        if not indexes:
            # Use the dataframe index instead
            indexes = []
            for c in df.index.names:
                if not c:
                    raise Exception(
                        f"Dataframe contains indexes without names. Unable to determine update where clause."
                    )
                indexes.append(c)
        update_strings = []
        tdf = df.reset_index()
        if column_map:
            target_columns = [c for c in column_map.keys() if c not in indexes]
        else:
            column_map = {c: c for c in tdf.columns}
            target_columns = [c for c in df.columns if c not in indexes]
        for i, r in tdf.iterrows():
            upd_params = ", ".join(
                [f"{column_map[c]} = %s" for c in target_columns]
            )
            upd_list = [r[c] if pd.notna(r[c]) else None for c in target_columns]
            upd_str = self._cur.mogrify(upd_params, upd_list).decode("utf-8")
            idx_params = " AND ".join([f"{column_map[c]} = %s" for c in indexes])
            idx_list = [r[c] if pd.notna(r[c]) else None for c in indexes]
            idx_str = self._cur.mogrify(idx_params, idx_list).decode("utf-8")
            update_strings.append(f"UPDATE {table} SET {upd_str} WHERE {idx_str};")
        full_update_string = "\n".join(update_strings)
        print(full_update_string)  # Debugging
        self._cur.execute(full_update_string)
        rowcount = self._cur.rowcount
        if commit:
            self.commit()
        return rowcount
    except Exception as e:
        self.rollback()
        raise e
Example usages:
>>> df = pd.DataFrame([
{'a':1,'b':'asdf','c':datetime.datetime.now()},
{'a':2,'b':'jklm','c':datetime.datetime.now()}
])
>>> cls.update_data('my_table', df, indexes = ['a'])
UPDATE my_table SET b = 'asdf', c = '2023-01-17T22:13:37.095245'::timestamp WHERE a = 1;
UPDATE my_table SET b = 'jklm', c = '2023-01-17T22:13:37.095250'::timestamp WHERE a = 2;
>>> cls.update_data('my_table', df, indexes = ['a','b'])
UPDATE my_table SET c = '2023-01-17T22:13:37.095245'::timestamp WHERE a = 1 AND b = 'asdf';
UPDATE my_table SET c = '2023-01-17T22:13:37.095250'::timestamp WHERE a = 2 AND b = 'jklm';
>>> cls.update_data('my_table', df.set_index('a'), column_map={'a':'db_a','b':'db_b','c':'db_c'} )
UPDATE my_table SET db_b = 'asdf', db_c = '2023-01-17T22:13:37.095245'::timestamp WHERE db_a = 1;
UPDATE my_table SET db_b = 'jklm', db_c = '2023-01-17T22:13:37.095250'::timestamp WHERE db_a = 2;
Note however that this is not safe from SQL injection due to the way it generates the where clause.
