I want to write a pandas dataframe to a postgres table. I make a connection to db as follows:
import psycopg2
import pandas as pd
import sqlalchemy
def connect(user, password, db, host='localhost', port=5432):
'''Returns a connection and a metadata object'''
url = 'postgresql://{}:{}#{}:{}/{}'
url = url.format(user, password, host, port, db)
# The return value of create_engine() is our connection object
con = sqlalchemy.create_engine(url, client_encoding='utf8')
# We then bind the connection to MetaData()
meta = sqlalchemy.MetaData(bind=con, reflect=True)
return con, meta
con, meta = connect('user_name', 'password', 'db_name', host='host_name')
When I read from a table that is already populated, it works fine:
df = pd.read_sql("SELECT * FROM db.table_name limit 10",con=con)
print df
I would like to be able to write df to a table. To test this, I have a temporary table called 'test' with two fields name and age.
# create a temp df
table = [['name', 'age'], ['nameA' , 20], ['nameB', 30]]
headers = table.pop(0)
df = pd.DataFrame(table, columns=headers)
# write to db
df.to_sql('db.test', con, if_exists = 'replace', index=False)
I then check if the temp table is populated:
df = pd.read_sql("SELECT * FROM db.test limit 10",con=con)
print df
I get an empty dataframe! I got no errors when I use df.to_sql but nothing is getting written to the database (?). What am I missing and how do I go about fixing this?
Versions:
Pandas: 0.19.2
Sqlachemy: 1.1.10
Postgres: 9.4.9
I have not figured out why df.to_sql did not write to the table. Writing to table using pd.io.sql.SQLDatabase worked for my test case:
meta = sqlalchemy.MetaData(con, schema='db_name')
meta.reflect()
pdsql = pd.io.sql.SQLDatabase(con, meta=meta)
pdsql.to_sql(df, 'test', if_exists='replace')
I would not consider this THE solution -- I'd be happy to accept better solution or an answer that brings a closure to why df.to_sql() does not behave as expected.
Related
I am working with python trying to connect with postgres, I created a table into my postgres database in the staging schema.
create table staging.data( Name varchar, Age bigint);
then I try to connect and insert my dataframe data into this table:
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
conn_string = 'postgresql://myuser:password#host/database_name'
db = create_engine(conn_string)
conn = db.connect()
# our dataframe
data = {'Name': ['Tom', 'dick', 'harry'],
'Age': [22, 21, 24]}
# Create DataFrame
df = pd.DataFrame(data)
df.to_sql('staging.data', con=conn, if_exists='replace',
index=False)
conn = psycopg2.connect(conn_string
)
conn.autocommit = True
cursor = conn.cursor()
sql1 = '''select * from staging.data;'''
cursor.execute(sql1)
for i in cursor.fetchall():
print(i)
conn.commit()
conn.close()
But the Python ends with no error message, and there is no data into my table from postgres.
Any idea about this?
Regards
I think the issue is that you are trying to use a schema other than public. Try passing in the schema name via the schema argument of to_sql() like this:
df.to_sql('data', con=conn, if_exists='replace', schema='staging', index=False)
So I am fairly new to flask and I am currently trying to create a flask api for a project I am working on. However, there are a couple of issues I am facing.
So for my 1st issue, I can't get my dataframe from the 1st function to work in my second function. I am just wondering how I can get the data_1 to work in the second function.
Code:
from flask import Flask
from sqlalchemy import create_engine
import sqlite3 as sql
import pandas as pd
import datetime
import os
app = Flask(__name__)
#app.route('/', methods=['GET'])
def get_data():
...
data_1 = ...
#print(data_1.head(n=10))
return "hello"
#app.route('/table1', methods=['GET'])
def store_table1_data_df():
db_path = os.path.join(os.path.dirname(__file__),'table1.db')
engine = create_engine('sqlite:///{}'.format(db_path), echo=True)
sqlite_connection = engine.connect()
sqlite_table = 'table1'
data_1.to_sql(sqlite_table,sqlite_connection, if_exists='append')
sqlite_connection.close()
return "table1"
For my second issue, is there a better way of storing a dataframe within flask api using sqlalchemy or sqlite3?
More context as to what kind of data_1 is: data_1 can only hold the past 15 days/records like from 6/15/2021-6/30/2021. However, tomorrow, if I fetch the newest data_1 it will contain 6/16/2021-7/01/2021. How can I just append 07/01/2021 to the old data_1 without creating duplicate records from 06/16/2021, creating two more functions, and an extra db file?
#app.route('/table1', methods=['GET'])
def store_table1_data_df():
db_path = os.path.join(os.path.dirname(__file__),'table1.db')
engine = create_engine('sqlite:///{}'.format(db_path), echo=True)
sqlite_connection = engine.connect()
sqlite_table = 'table1'
data_1.to_sql(sqlite_table,sqlite_connection, if_exists='append')
sqlite_connection.close()
return "table1"
#app.route('/table2', methods=['GET'])
def store_table2_data_df():
db_path2 = os.path.join(os.path.dirname(__file__),'table2.db')
engine2 = create_engine('sqlite:///{}'.format(db_path2), echo=True)
sqlite_connection2 = engine2.connect()
sqlite_table2 = 'table2'
data_1.to_sql(sqlite_table2,sqlite_connection2, if_exists='append')
sqlite_connection2.close()
return "table2"
# What I probably have down below is not the correct way to solve this problem
#app.route('/table1', methods=['GET'])
conn = sql.connect("table1.db")
cur = conn.cursor()
#cur.execute
cur.execute("ATTACH 'table2.db' as 'table2' ")
conn.commit()
table_3 = pd.read_sql_query("SELECT DISTINCT date, value FROM table1 UNION SELECT DISTINCT date, value from table2 ORDER BY date", conn)
cur.execcute("SELECT DISTINCT date, value FROM table1 UNION SELECT DISTINCT date, value from table2 ORDER BY date")
conn.commit()
results3 = cur.fetchall()
sqlite_table='table1'
table_3.to_sql(sqlite_table, conn, if_exists='replace')
cur.close()
conn.close()
return "work"
Any help is greatly appreciated.
For your 1st problem. You may do either of these:
If the size of data-1 is small(than 200kb) you may use flask-session to store the data and access it across routes.
You create a function that returns data_1. Call that function in any route you want. Hint:
def getdata1(val1, val2):
#calculation here
return data_1
Just call this wherever you need data_1.
Store the data frame in a DB and fetch it.
For the second part, a simple for loop will work. Hint on that:
sql_table = ["Fetch your sql table here with the dataframe. Considering dates in one column"]
data_1 = ["Your dataframe"]
for i in data_1['Dates']:
if i != sql_table['dates']:
#insert this key:value pair in sql table
If your data frame and sql is getting loaded in order by date, even better. You just need to check the last elements of each.
Ok, I have tried several kinds of solutions recommended by others on this site and other sited. However, I can't get it work as I would like it to do.
I get a XML-response which I normalize and then save to a CSV. This first part works fine.
Instead of saving it to CSV I would like to save it into an existing table in an access database. The second part below:
Would like to use an existing table instead of creating a new one
The result is not separated with ";" into different columns. Everything ends up in the same column not separated, see image below
response = requests.get(u,headers=h).json()
dp = pd.json_normalize(response,'Units')
response_list.append(dp)
export = pd.concat(response_list)
export.to_csv(r'C:\Users\username\Documents\Python Scripts\Test\Test2_'+str(now)+'.csv', index=False, sep=';',encoding='utf-8')
access_path = r"C:\Users\username\Documents\Python Scripts\Test\Test_db.accdb"
conn = pyodbc.connect("DRIVER={{Microsoft Access Driver (*.mdb, *.accdb)}};DBQ={};" \
.format(access_path))
strSQL = "SELECT * INTO projects2 FROM [text;HDR=Yes;FMT=sep(;);" + \
"Database=C:\\Users\\username\\Documents\\Python Scripts\\Test].Testdata.csv;"
cur = conn.cursor()
cur.execute(strSQL)
conn.commit()
conn.close()
If you already have the data in a well-formed pandas DataFrame then you don't really need to dump it to a CSV file; you can use the sqlalchemy-access dialect to push the data directly into an Access table using pandas' to_sql() method:
from pprint import pprint
import urllib
import pandas as pd
import sqlalchemy as sa
connection_string = (
r"DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};"
r"DBQ=C:\Users\Public\Database1.accdb;"
r"ExtendedAnsiSQL=1;"
)
connection_uri = f"access+pyodbc:///?odbc_connect={urllib.parse.quote_plus(connection_string)}"
engine = sa.create_engine(connection_uri)
with engine.begin() as conn:
# existing data in table
pprint(
conn.execute(sa.text("SELECT * FROM user_table")).fetchall(), width=30
)
"""
[('gord', 'gord#example.com'),
('jennifer', 'jennifer#example.com')]
"""
# DataFrame to insert
df = pd.DataFrame(
[
("newdev", "newdev#example.com"),
("newerdev", "newerdev#example.com"),
],
columns=["username", "email"],
)
df.to_sql("user_table", engine, index=False, if_exists="append")
with engine.begin() as conn:
# updated table
pprint(
conn.execute(sa.text("SELECT * FROM user_table")).fetchall(), width=30
)
"""
[('gord', 'gord#example.com'),
('jennifer', 'jennifer#example.com'),
('newdev', 'newdev#example.com'),
('newerdev', 'newerdev#example.com')]
"""
(Disclosure: I am currently the maintainer of the sqlalchemy-access dialect.)
Solved with the following code
SE_export_Tuple = list(zip(SE_export.Name,SE_export.URL,SE_export.ImageUrl,......,SE_export.ID))
print(SE_export_Tuple)
access_path = r"C:\Users\username\Documents\Python Scripts\Test\Test_db.accdb"
conn = pyodbc.connect("DRIVER={{Microsoft Access Driver (*.mdb, *.accdb)}};DBQ={};" \
.format(access_path))
cursor = conn.cursor()
mySql_insert_query="INSERT INTO Temp_table (UnitName,URL,ImageUrl,.......,ID) VALUES (?,?,?,......,?)"
cursor.executemany(mySql_insert_query,SE_export_Tuple)
conn.commit()
conn.close()
However, when I add many fields I get an error at "executemany", saying:
cursor.executemany(mySql_insert_query,SE_export_Tuple)
Error: ('HY004', '[HY004] [Microsoft][ODBC Microsoft Access Driver]Invalid SQL data type (67) (SQLBindParameter)')
I created a table inserting data fetched from an api and store in to a pandas dataframe using sqlalchemy.
I am gonna need to query the api, every 4 hours, to get new data.
Problem being that the api, will give me back not only the new data but as well the old ones, already imported in mysql
how can i import just the new data into the mysql table
i retrieved the data from the api, stored the data in to a pandas object, created the connection to the mysql db and created a fresh new table.
import requests
import json
from pandas.io.json import json_normalize
myToken = 'xxx'
myUrl = 'somewebsite'
head = {'Authorization': 'token {}'.format(myToken)}
response = requests.get(myUrl, headers=head)
data=response.json()
#print(data.dumps(data, indent=4, sort_keys=True))
results=json_normalize(data['results'])
results.rename(columns={'datastream.name': 'datastream_name',
'datastream.url':'datastream_url',
'datastream.datastream_type_id':'datastream_id',
'start':'error_date'}, inplace=True)
results_final=pd.DataFrame([results.datastream_name,
results.datastream_url,
results.error_date,
results.datastream_id,
results.message,
results.type_label]).transpose()
from sqlalchemy import create_engine
from sqlalchemy import exc
engine = create_engine('mysql://usr:psw#ip/schema')
con = engine.connect()
results_final.to_sql(name='error',con=con,if_exists='replace')
con.close()
End goal is to insert into the table, just the not existing data coming from the api
You could pull the results already in the database into a new dataframe and then compare the two dataframes. After that you would only insert the rows not in the table. Not knowing the format of your table or data I'm just using a generic SELECT statement here.
from sqlalchemy import create_engine
from sqlalchemy import exc
engine = create_engine('mysql://usr:psw#ip/schema')
con = engine.connect()
sql = "SELECT * FROM table_name"
old_results = pd.read_sql(sql, con)
df = pd.merge(old_results, results_final, how='outer', indicator=True)
new_results = df[df['_merge']=='right_only'][results_final.columns]
new_results.to_sql(name='error',con=con,if_exists='append')
con.close()
You also need to change if_exists to append because set to replace it drops all values in the table and replaces them with the values in the pandas dataframe.
I developed this function to handle both: news values and when columns from the source table and target table are not equal.
def load_data(df):
engine = create_engine('mysql+pymysql://root:pass#localhost/dw', echo_pool=True, pool_size=10, max_overflow=20)
with engine.connect() as conn, conn.begin():
try:
df_old = pd.read_sql('SELECT * FROM table', conn)
# Check if exists new rows to be inserted
if len(df) > len(df_saved) or df.disconnected_time.max() > df_saved.disconnected_time.max():
print("There are new rows to be inserted. ")
df_merged = pd.merge(df_old, df, how='outer', indicator=True)
df_final = df_merged[df_merged['_merge']=='right_only'][df.columns]
df_final.to_sql(name='table',con=conn,index=False, if_exists='append')
except Exception as err:
print (str(err))
else:
# This handling errors when the lengths of the columns are not equal to the target
if df_bulbr.shape[1] > df_old.shape[1]:
data = pd.read_sql('SELECT * FROM table', conn)
df2 = pd.concat([df,data])
df2.to_sql('table', conn, index=False, if_exists='replace')
outcome = conn.execute("select count(1) from table")
countRow = outcome.first()[0]
return print(f" Total of {countRow} rows load." )
I'd like to write a Pandas dataframe to PostgreSQL table without using SQLAlchemy.
The table name should correspond to the pandas variable name, or replace the table if already exists. Data types need to match as well.
I'd like to avoid SQLAlchemy's to_sql function for several reasons.
import pandas as pd
from getpass import getpass
import psycopg2
your_pass = getpass(prompt='Password: ', stream=None)
conn_cred = {
'host': your_host,
'port': your_port,
'dbname': your_dbname,
'user': your_user,
'password': your_pass
}
conn = psycopg2.connect(**conn_cred)
conn.autocommit = True
my_data = {'col1': [1, 2], 'col2': [3, 4]}
def store_dataframe_to_postgre(df, schema, active_conn):
# df = pandas dataframe to store as a table
# schema = schema for the table
# active_conn = open connection to a PostgreSQL db
# ...
# Bonus: require explicit commit here, even though conn.autocommit = True
store_dataframe_to_postgre(my_data, 'my_schema', conn)
This should be the result in the Postgre db:
SELECT * FROM my_schema.my_data;
col1 col2
1 3
2 4
you can try but this code in your:
cursor = conn.cursor()
cur.copy_from(df, schema , null='', sep=',', columns=(my_data))
reference code:
copy dataframe to postgres table with column that has defalut value