Storing dataframe in a sqlite database using Flask API - python

So I am fairly new to flask and I am currently trying to create a flask api for a project I am working on. However, there are a couple of issues I am facing.
So for my 1st issue, I can't get my dataframe from the 1st function to work in my second function. I am just wondering how I can get the data_1 to work in the second function.
Code:
from flask import Flask
from sqlalchemy import create_engine
import sqlite3 as sql
import pandas as pd
import datetime
import os
app = Flask(__name__)
#app.route('/', methods=['GET'])
def get_data():
...
data_1 = ...
#print(data_1.head(n=10))
return "hello"
#app.route('/table1', methods=['GET'])
def store_table1_data_df():
db_path = os.path.join(os.path.dirname(__file__),'table1.db')
engine = create_engine('sqlite:///{}'.format(db_path), echo=True)
sqlite_connection = engine.connect()
sqlite_table = 'table1'
data_1.to_sql(sqlite_table,sqlite_connection, if_exists='append')
sqlite_connection.close()
return "table1"
For my second issue, is there a better way of storing a dataframe within flask api using sqlalchemy or sqlite3?
More context as to what kind of data_1 is: data_1 can only hold the past 15 days/records like from 6/15/2021-6/30/2021. However, tomorrow, if I fetch the newest data_1 it will contain 6/16/2021-7/01/2021. How can I just append 07/01/2021 to the old data_1 without creating duplicate records from 06/16/2021, creating two more functions, and an extra db file?
#app.route('/table1', methods=['GET'])
def store_table1_data_df():
db_path = os.path.join(os.path.dirname(__file__),'table1.db')
engine = create_engine('sqlite:///{}'.format(db_path), echo=True)
sqlite_connection = engine.connect()
sqlite_table = 'table1'
data_1.to_sql(sqlite_table,sqlite_connection, if_exists='append')
sqlite_connection.close()
return "table1"
#app.route('/table2', methods=['GET'])
def store_table2_data_df():
db_path2 = os.path.join(os.path.dirname(__file__),'table2.db')
engine2 = create_engine('sqlite:///{}'.format(db_path2), echo=True)
sqlite_connection2 = engine2.connect()
sqlite_table2 = 'table2'
data_1.to_sql(sqlite_table2,sqlite_connection2, if_exists='append')
sqlite_connection2.close()
return "table2"
# What I probably have down below is not the correct way to solve this problem
#app.route('/table1', methods=['GET'])
conn = sql.connect("table1.db")
cur = conn.cursor()
#cur.execute
cur.execute("ATTACH 'table2.db' as 'table2' ")
conn.commit()
table_3 = pd.read_sql_query("SELECT DISTINCT date, value FROM table1 UNION SELECT DISTINCT date, value from table2 ORDER BY date", conn)
cur.execcute("SELECT DISTINCT date, value FROM table1 UNION SELECT DISTINCT date, value from table2 ORDER BY date")
conn.commit()
results3 = cur.fetchall()
sqlite_table='table1'
table_3.to_sql(sqlite_table, conn, if_exists='replace')
cur.close()
conn.close()
return "work"
Any help is greatly appreciated.

For your 1st problem. You may do either of these:
If the size of data-1 is small(than 200kb) you may use flask-session to store the data and access it across routes.
You create a function that returns data_1. Call that function in any route you want. Hint:
def getdata1(val1, val2):
#calculation here
return data_1
Just call this wherever you need data_1.
Store the data frame in a DB and fetch it.
For the second part, a simple for loop will work. Hint on that:
sql_table = ["Fetch your sql table here with the dataframe. Considering dates in one column"]
data_1 = ["Your dataframe"]
for i in data_1['Dates']:
if i != sql_table['dates']:
#insert this key:value pair in sql table
If your data frame and sql is getting loaded in order by date, even better. You just need to check the last elements of each.

Related

Increase speed of SQLAlchemy Insert.execute()

Consider following working code of copy a souce sqlite database to target sqlite database:
# Create two database.
import sqlite3
import pandas as pd
import time
cn_src = sqlite3.connect('source.db')
df=pd.DataFrame({"x":[1,2],"y":[2.0,3.0]})
df.to_sql("A", cn_src, if_exists="replace", index=False)
cn_tgt = sqlite3.connect('target.db')
cn_src.close()
cn_tgt.close()
from sqlalchemy import create_engine, MetaData, event
from sqlalchemy.sql import sqltypes
# create sqlalchemy conneciton
src_engine = create_engine("sqlite:///source.db")
src_metadata = MetaData(bind=src_engine)
exclude_tables = ('sqlite_master', 'sqlite_sequence', 'sqlite_temp_master')
tgt_engine = create_engine("sqlite:///target.db")
tgt_metadata = MetaData(bind=tgt_engine)
#event.listens_for(src_metadata, "column_reflect")
def genericize_datatypes(inspector, tablename, column_dict):
column_dict["type"] = column_dict["type"].as_generic(allow_nulltype=True)
tgt_conn = tgt_engine.connect()
tgt_metadata.reflect()
# delete tables in target database.
for table in reversed(tgt_metadata.sorted_tables):
if table.name not in exclude_tables:
print('dropping table =', table.name)
table.drop()
tgt_metadata.clear()
tgt_metadata.reflect()
src_metadata.reflect()
# copy table
for table in src_metadata.sorted_tables:
if table.name not in exclude_tables:
table.create(bind=tgt_engine)
# Update meta information
tgt_metadata.clear()
tgt_metadata.reflect()
# Copy data
for table in tgt_metadata.sorted_tables:
src_table = src_metadata.tables[table.name]
stmt = table.insert()
for index, row in enumerate(src_table.select().execute()):
print("table =", table.name, "Inserting row", index)
start=time.time()
stmt.execute(row._asdict())
end=time.time()
print(end-start)
The code was mainly borrowed from other source. The problem is the time end-start is about 0.017 in my computer which is too large. Is there any way to speed up? I have tried set isolation_level=None in create_engine but no luck.
It seems like that Insert object has no executemany method so we can't use bulk inserting.
It seems like that Insert object has no executemany method so we can't use bulk inserting.
SQLAlchemy does not implement separate execute() and executemany() methods. Its execute() method looks at the parameters it receives and
if they consist of a single dict object (i.e., a single row) then it calls execute() at the driver level, or
if they consist of a list of dict objects (i.e., multiple rows) then it calls executemany() at the driver level.
Note also that you are using deprecated usage patterns, specifically MetaData(bind=…). You should be doing something more like this:
import sqlalchemy as sa
engine = sa.create_engine("sqlite://")
tbl = sa.Table(
"tbl",
sa.MetaData(),
sa.Column("id", sa.Integer, primary_key=True, autoincrement=False),
sa.Column("txt", sa.String),
)
tbl.create(engine)
with engine.begin() as conn:
stmt = sa.insert(tbl)
params = [
dict(id=1, txt="foo"),
dict(id=2, txt="bar"),
]
conn.execute(stmt, params)
# check results
with engine.begin() as conn:
print(conn.exec_driver_sql("SELECT * FROM tbl").all())
# [(1, 'foo'), (2, 'bar')]
I come up with a solution using transaction:
# Copy data
trans=tgt_conn.begin()
for table in tgt_metadata.sorted_tables:
src_table = src_metadata.tables[table.name]
stmt = table.insert().execution_options(autocommit=False)
for index, row in enumerate(src_table.select().execute()):
tgt_conn.execute(stmt, row._asdict()) # must use tgt_conn.execute(), not stmt.execute()
trans.commit()
tgt_conn.close()

How to fix data binding issue with SqlAlchemy

I am using Python 3.6 with Flask. I am trying to connect to Amazon Redshift database using SqlAlchemy. The query has IN operation in the Where clause and the values for it are sent by binding it to the query. For some reason it does not work? It does not error out, but does not fetch any results. If I hard code the values in the query, it works fine.
I have tried a few suggested options but no luck -
1. binding the values as a list or just as comma separated string
2. removing the brackets in the query
3. changing the code to
stmt = text(file.read())
stmt = stmt.bindparams(search = ids)
df = pd.read_sql_query(stmt, connection)
dtv_script.txt
Select * from tbl1 where id IN (:search)
def get_dt(id_list):
engine = create_engine('postgresql://xxxxxxxxxx')
connection = engine.connect()
ids = list(id_list.split(","))
#dtv_script.txt has the sql
file = open('dtv_script.txt')
sql = text(file.read())
df = pd.read_sql_query(sql, connection, params={'search' : ids})
connection.close()
return df
The ids are posted from a form on the index.html.
Sample ids = 2011592,2021593,2033591.
The flask route page captures it in the get_dt() function and returns the dataframe back to the results.html page for display
#app.route('/result', methods=['POST'])
def result():
if request.method == 'POST':
id_list = request.form.get('ids')
df_dt = dofri.get_dt(id_list)
return render_template('result.html', **locals())
else:
flash('There was some error. Check the logs')
return index()
Below is the solution. Make sure to
add - from sqlalchemy import bindparam
remove - brackets from the query
add - expanding=True
dtv_script.txt
Select * from tbl1 where id IN :search
def get_dt(id_list):
engine = create_engine('postgresql://xxxxxxxxxx')
connection = engine.connect()
ids = list(id_list.split(","))
#dtv_script.txt has the sql
file = open('dtv_script.txt')
sql = file.read()
t = text(sql)
t = t.bindparams(bindparam('search', expanding=True))
df = pd.read_sql_query(t, connection, params={'search' : ids })
connection.close()
return df

Sqlalchemy: add into mysql table new rows from pandas dataframe, if they don't exist already in the table

I created a table inserting data fetched from an api and store in to a pandas dataframe using sqlalchemy.
I am gonna need to query the api, every 4 hours, to get new data.
Problem being that the api, will give me back not only the new data but as well the old ones, already imported in mysql
how can i import just the new data into the mysql table
i retrieved the data from the api, stored the data in to a pandas object, created the connection to the mysql db and created a fresh new table.
import requests
import json
from pandas.io.json import json_normalize
myToken = 'xxx'
myUrl = 'somewebsite'
head = {'Authorization': 'token {}'.format(myToken)}
response = requests.get(myUrl, headers=head)
data=response.json()
#print(data.dumps(data, indent=4, sort_keys=True))
results=json_normalize(data['results'])
results.rename(columns={'datastream.name': 'datastream_name',
'datastream.url':'datastream_url',
'datastream.datastream_type_id':'datastream_id',
'start':'error_date'}, inplace=True)
results_final=pd.DataFrame([results.datastream_name,
results.datastream_url,
results.error_date,
results.datastream_id,
results.message,
results.type_label]).transpose()
from sqlalchemy import create_engine
from sqlalchemy import exc
engine = create_engine('mysql://usr:psw#ip/schema')
con = engine.connect()
results_final.to_sql(name='error',con=con,if_exists='replace')
con.close()
End goal is to insert into the table, just the not existing data coming from the api
You could pull the results already in the database into a new dataframe and then compare the two dataframes. After that you would only insert the rows not in the table. Not knowing the format of your table or data I'm just using a generic SELECT statement here.
from sqlalchemy import create_engine
from sqlalchemy import exc
engine = create_engine('mysql://usr:psw#ip/schema')
con = engine.connect()
sql = "SELECT * FROM table_name"
old_results = pd.read_sql(sql, con)
df = pd.merge(old_results, results_final, how='outer', indicator=True)
new_results = df[df['_merge']=='right_only'][results_final.columns]
new_results.to_sql(name='error',con=con,if_exists='append')
con.close()
You also need to change if_exists to append because set to replace it drops all values in the table and replaces them with the values in the pandas dataframe.
I developed this function to handle both: news values and when columns from the source table and target table are not equal.
def load_data(df):
engine = create_engine('mysql+pymysql://root:pass#localhost/dw', echo_pool=True, pool_size=10, max_overflow=20)
with engine.connect() as conn, conn.begin():
try:
df_old = pd.read_sql('SELECT * FROM table', conn)
# Check if exists new rows to be inserted
if len(df) > len(df_saved) or df.disconnected_time.max() > df_saved.disconnected_time.max():
print("There are new rows to be inserted. ")
df_merged = pd.merge(df_old, df, how='outer', indicator=True)
df_final = df_merged[df_merged['_merge']=='right_only'][df.columns]
df_final.to_sql(name='table',con=conn,index=False, if_exists='append')
except Exception as err:
print (str(err))
else:
# This handling errors when the lengths of the columns are not equal to the target
if df_bulbr.shape[1] > df_old.shape[1]:
data = pd.read_sql('SELECT * FROM table', conn)
df2 = pd.concat([df,data])
df2.to_sql('table', conn, index=False, if_exists='replace')
outcome = conn.execute("select count(1) from table")
countRow = outcome.first()[0]
return print(f" Total of {countRow} rows load." )

Writing dataframe to postgres database

I want to write a pandas dataframe to a postgres table. I make a connection to db as follows:
import psycopg2
import pandas as pd
import sqlalchemy
def connect(user, password, db, host='localhost', port=5432):
'''Returns a connection and a metadata object'''
url = 'postgresql://{}:{}#{}:{}/{}'
url = url.format(user, password, host, port, db)
# The return value of create_engine() is our connection object
con = sqlalchemy.create_engine(url, client_encoding='utf8')
# We then bind the connection to MetaData()
meta = sqlalchemy.MetaData(bind=con, reflect=True)
return con, meta
con, meta = connect('user_name', 'password', 'db_name', host='host_name')
When I read from a table that is already populated, it works fine:
df = pd.read_sql("SELECT * FROM db.table_name limit 10",con=con)
print df
I would like to be able to write df to a table. To test this, I have a temporary table called 'test' with two fields name and age.
# create a temp df
table = [['name', 'age'], ['nameA' , 20], ['nameB', 30]]
headers = table.pop(0)
df = pd.DataFrame(table, columns=headers)
# write to db
df.to_sql('db.test', con, if_exists = 'replace', index=False)
I then check if the temp table is populated:
df = pd.read_sql("SELECT * FROM db.test limit 10",con=con)
print df
I get an empty dataframe! I got no errors when I use df.to_sql but nothing is getting written to the database (?). What am I missing and how do I go about fixing this?
Versions:
Pandas: 0.19.2
Sqlachemy: 1.1.10
Postgres: 9.4.9
I have not figured out why df.to_sql did not write to the table. Writing to table using pd.io.sql.SQLDatabase worked for my test case:
meta = sqlalchemy.MetaData(con, schema='db_name')
meta.reflect()
pdsql = pd.io.sql.SQLDatabase(con, meta=meta)
pdsql.to_sql(df, 'test', if_exists='replace')
I would not consider this THE solution -- I'd be happy to accept better solution or an answer that brings a closure to why df.to_sql() does not behave as expected.

Why the database is not refreshing after delete?

in the DB I have two events:
{1: {'title': 'APPLE'}, 2: {'title': 'BANANA'}}
as you can see bellow, I am calling the deletefromDB function on refresh with the parameter of the second event and I am printing out the data (the json above). My issue is, that the data will change, only after I restart my flask server. Till that point it's the same json output. Can somebody explain what I am doing wrong?
this is my code:
import sqlite3
from flask import Flask, render_template, request
app = Flask(__name__)
data = {}
conn = sqlite3.connect('events.db')
cursor = conn.cursor()
# INIT
def initDB():
with conn:
cursor.execute("SELECT * FROM Events")
rows = cursor.fetchall()
for row in rows:
temp = {}
temp["title"] = row[1]
data[row[0]] = temp
print(data)
initDB()
def deletefromDB(eventID):
query = 'DELETE FROM Events WHERE EventId = {}'.format(eventID)
with conn:
cursor.execute(query)
print(query)
initDB()
#app.route('/')
def index():
deletefromDB(2)
return 'Index Page'
if __name__ == "__main__":
app.run()
You are caching the tables result into the data dict on database initialization. Then you delete a row from the table without committing the changes to the data dict. Either you reload the entire data dict after deletion or you remove the entry directly from it:
def deletefromDB(eventID):
query = 'DELETE FROM Events WHERE EventId = {}'.format(eventID)
with conn:
cursor.execute(query)
del data[eventID]
print(query)
initDB()
Be careful, you create a formatted SQL string without escaping the parameter (SQL Injection alert). Use the cursor execute to substitute the variables.
query = 'DELETE FROM Events WHERE EventId = %d'
# ...
cursor.execute(query, (eventID,))
Do it whenever you substitute something into a SQL query!

Categories

Resources