Inserting selective columns into Postgres using pandas (Python)

The objective is to write Excel column data into a Postgres table, but not all of the column names in the Excel sheet match the table's columns.
So what I am doing is trying to insert only the common columns.
I am able to get the common column names into a set, but I am stuck on how to insert the data in a single query.
I am using a pandas DataFrame.
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# Getting the table columns in a list
conn = psycopg2.connect(dbname=dbname, host=host, port=port, user=user, password=pwd)
print("Connecting to Database")
cur = conn.cursor()
cur.execute("SELECT * FROM " + table_name + " LIMIT 0")
table_columns = [desc[0] for desc in cur.description]
#print(table_columns)

# Getting the Excel sheet columns in a list
df = pd.read_excel('/Users/.../plans.xlsx', sheet_name='plans')
engine = create_engine('postgresql://postgres:postgres@localhost:5432/test_db')
column_list = df.columns.values.tolist()
#print(column_list)

s = set(column_list).intersection(set(table_columns))
for x in df['column_1']:
    cur.execute("insert into test_table(column_1) values (%s)", (x,))
conn.commit()
conn.close()
Update: I changed the code based on the answers, but with this, every time I run the program it inserts new records. Is there an option to just do an update (upsert) instead?
s = set(column_list).intersection(set(table_columns))
df1 = df[df.columns.intersection(table_columns)]
#print df1
df1.to_sql('medical_plans', con=engine, if_exists='append', index=False, index_label=None)
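(Note: to_sql with if_exists='append' can only add rows; it never updates existing ones. Below is a minimal upsert sketch, assuming the target table has a UNIQUE or PRIMARY KEY constraint on column_1 and that column_2 stands in for the remaining common columns; both names are placeholders to adapt to your schema.)

from sqlalchemy import create_engine, text

engine = create_engine('postgresql://postgres:postgres@localhost:5432/test_db')

# 1) load only the common columns into a staging table
df1.to_sql('medical_plans_staging', con=engine, if_exists='replace', index=False)

# 2) upsert from the staging table into the real table
#    (requires a UNIQUE/PRIMARY KEY on column_1; column_1/column_2 are placeholder names)
upsert_sql = text("""
    INSERT INTO medical_plans (column_1, column_2)
    SELECT column_1, column_2 FROM medical_plans_staging
    ON CONFLICT (column_1) DO UPDATE
        SET column_2 = EXCLUDED.column_2
""")
with engine.begin() as conn:
    conn.execute(upsert_sql)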

Related

More efficient way to query this SQL table from python?

I need to query rows where a column matches my list of ~60K IDs out of a table that contains millions of IDs. I think normally you would upload a temporary table of IDs to the database and join on it, but I can't edit this database. I am doing it with a loop in a Python wrapper, but is there a better way? I mean, it works, but still:
import pyodbc
import pandas as pd

# connect to the database using Windows authentication
conn = pyodbc.connect('DRIVER={SQL Server Native Client 11.0};SERVER=my_fav_server;DATABASE=my_fav_db;Trusted_Connection=yes;')
cursor = conn.cursor()

# read in all the ids
ids_list = [...60K ids in here..]

# query in 10K chunks to prevent memory error
def chunks(l, n):
    # split list into sublists of (at most) n items each
    n = max(1, n)
    return [l[i:i+n] for i in range(0, len(l), n)]

chunked_ids_lists = chunks(ids_list, 10000)

# looping through to retrieve all cols, dumping each chunk to a temp file
for chunk_num, chunked_ids_list in enumerate(chunked_ids_lists):
    temp_ids_string = "('" + "','".join(chunked_ids_list) + "')"
    temp_sql = f"SELECT * FROM dbo.my_fav_table WHERE ID IN {temp_ids_string};"
    temp_data = pd.read_sql_query(temp_sql, conn)
    temp_path = f"temp_chunk_{chunk_num}.txt"
    temp_data.to_csv(temp_path, sep='\t', index=None)

# read the query chunks back and combine them
all_data_list = []
for chunk_num in range(len(chunked_ids_lists)):
    temp_path = f"temp_chunk_{chunk_num}.txt"
    temp_data = pd.read_csv(temp_path, sep='\t')
    all_data_list.append(temp_data)
all_data = pd.concat(all_data_list)
Another way is to use Psycopg's cursor.
import psycopg2

# Connect to an existing database
conn = psycopg2.connect("dbname=test user=postgres")

# Open a cursor to perform database operations
cur = conn.cursor()

# get data from the query
# no need to construct an 'SQL-correct syntax' filter string yourself:
# psycopg2 adapts a Python tuple to the SQL list syntax needed by IN (...)
cur.execute("SELECT * FROM dbo.my_fav_table WHERE ID IN %(filter)s;", {"filter": tuple(ids_list)})

# loop over the returned rows
for record in cur:
    # we got one record
    print(record)  # or do other processing
Use parameters rather than concatenating strings.
I don't see the need for the CSV files if you're just going to read them all back into Python in the next loop. Just put everything into all_data_list during the query loop.
all_data_list = []
for chunk in chunked_ids_lists:
    params = ','.join(['?'] * len(chunk))
    sql = f"SELECT * FROM dbo.my_fav_table WHERE ID IN ({params});"
    cursor.execute(sql, chunk)
    all_data_list.extend(cursor.fetchall())

# build the DataFrame, taking the column names from the cursor metadata
all_data = pd.DataFrame.from_records(all_data_list, columns=[col[0] for col in cursor.description])

Dropping some data while saving dataframe into csv file

I am running a Redshift query which returns about 40 million records, but when I save it into a csv file only about 7 thousand records show up. Could you please help me figure out how to solve this?
Example:
Code:
conn = gcso_conn1()
with conn.cursor() as cur:
    query = "select * from (select a.src_nm Source_System ,b.day_id Date,b.qty Market_Volume,b.cntng_unt Volume_Units,b.sls_in_lcl_crncy Market_Value,b.crncy_cd Value_Currency,a.panel Sales_Channel,a.cmpny Competitor_Name,a.lcl_mnfcr Local_Manufacturer ,a.src_systm_id SKU_PackID_ProductNumber,upper(a.mol_list) Molecule_Name,a.brnd_nm BrandName_Intl,a.lcl_prod_nm BrandName_Local,d.atc3_desc Brand_Indication,a.prsd_strngth_1_nbr Strength,a.prsd_strngth_1_unt Strength_Units,a.pck_desc Pack_Size_Number,a.prod_nm Product_Description,c.iso3_cntry_cd Country_ISO_Code,c.cntry_nm Country_Name from gcso_prd_cpy.dim_prod a join gcso_prd_cpy.fct_sales b on (a.SRC_NM='IMS' and b.SRC_NM='IMS' and a.prod_id = b.prod_id) join gcso_prd_cpy.dim_cntry c on (a.cntry_id = c.cntry_id) left outer join gcso_prd_cpy.dim_thrc_area d on (a.prod_id = d.prod_id) WHERE a.SRC_NM='IMS' and c.iso3_cntry_cd in ('JPN','IND','CAN','USA') and upper(a.mol_list) in ('AMBRISENTAN', 'BERAPROST','BOSENTAN') ORDER BY b.day_id ) a"
    #print(query)
    cur.execute(query)
    result = cur.fetchall()
    conn.commit()
    column = [i[0] for i in cur.description]
    sqldf = pd.DataFrame(result, columns=column)
    print(sqldf.count())
    #print(df3)
    sqldf.to_csv(Output_Path, index=False, sep='\001', encoding='utf-8')
Everything should work correctly. I think the main problem is that you are debugging with count(). You expect it to give the number of records, but the docs say:
Count non-NA cells for each column or row.
When debugging a DataFrame it is better to use:
print(len(df))
print(df.shape)
print(df.info())
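For example (a small illustrative snippet), count() reports non-NA cells per column, so it can be much smaller than the actual number of rows:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1, 2, np.nan], 'b': [np.nan, np.nan, 3]})
print(df.count())   # a: 2, b: 1 -- non-NA cells per column
print(len(df))      # 3          -- actual number of rows
print(df.shape)     # (3, 2)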
You can also do it more simply with read_sql, reading and writing in chunks:
import pandas as pd
from sqlalchemy import create_engine

header = True
for chunk in pd.read_sql(
    'your query here - SELECT * FROM... ',
    con=create_engine('creds', echo=True),  # set creds - postgres+psycopg2://user:password@host:5432/db_name
    chunksize=1000,  # read in chunks
):
    file_path = '/tmp/path_to_your.csv'
    chunk.to_csv(
        file_path,
        header=header,
        mode='a',
        index=False,
    )
    header = False  # write the header only for the first chunk

How to conduct SQL queries on multiple .db files and store the results in a .csv?

I have about 100 .db files stored on my Google Drive which I want to run the same SQL query on. I'd like to store these query results in a single .csv file.
I've managed to use the following code to write the results of a single SQL query into a .csv file, but I am unable to make it work for multiple files.
conn = sqlite3.connect('/content/drive/My Drive/Data/month_2014_01.db')
df = pd.read_sql_query("SELECT * FROM messages INNER JOIN users ON messages.id = users.id WHERE text LIKE '%house%'", conn)
df.to_csv('/content/drive/My Drive/Data/Query_Results.csv')
This is the code that I have used so far to try and make it work for all files, based on this post.
databases = []
directory = '/content/drive/My Drive/Data/'
for filename in os.listdir(directory):
    flname = os.path.join(directory, filename)
    databases.append(flname)

for database in databases:
    try:
        with sqlite3.connect(database) as conn:
            conn.text_factory = str
            cur = conn.cursor()
            cur.execute(row["SELECT * FROM messages INNER JOIN users ON messages.id = users.id WHERE text LIKE '%house%'"])
            df.loc[index, 'Results'] = cur.fetchall()
    except sqlite3.Error as err:
        print("[INFO] %s" % err)
But this throws me an error: TypeError: tuple indices must be integers or slices, not str.
I'm obviously doing something wrong and I would much appreciate any tips that would point towards an answer.
Consider building a list of data frames, then concatenate them together in a single data frame with pandas.concat:
import os
import sqlite3
import pandas as pd

gdrive = "/content/drive/My Drive/Data/"
sql = """SELECT * FROM messages
         INNER JOIN users ON messages.id = users.id
         WHERE text LIKE '%house%'
      """

def build_df(db):
    with sqlite3.connect(os.path.join(gdrive, db)) as conn:
        df = pd.read_sql_query(sql, conn)
    return df

# BUILD LIST OF DFs WITH LIST COMPREHENSION
df_list = [build_df(db) for db in os.listdir(gdrive) if db.endswith('.db')]

# CONCATENATE ALL DFs INTO SINGLE DF FOR EXPORT
final_df = pd.concat(df_list, ignore_index=True)
final_df.to_csv(os.path.join(gdrive, 'Query_Results.csv'), index=False)
Better yet, consider SQLite's ATTACH DATABASE and append each query's results into a master table. This also avoids pulling in pandas, a heavy third-party data science library, for a simple data migration need. Plus, you keep all the data inside SQLite without worrying about data type conversion and I/O transfer issues.
import csv
import os
import sqlite3

gdrive = "/content/drive/My Drive/Data/"

with sqlite3.connect(os.path.join(gdrive, 'month_2014_01.db')) as conn:
    cur = conn.cursor()

    # CREATE MASTER TABLE, SEEDED WITH THE RESULTS FROM THIS (MAIN) DATABASE
    cur.execute("DROP TABLE IF EXISTS master_query")
    cur.execute("""CREATE TABLE master_query AS
                   SELECT * FROM messages
                   INNER JOIN users
                       ON messages.id = users.id
                   WHERE text LIKE '%house%'
                """)
    conn.commit()

    # ITERATIVELY ATTACH THE OTHER DATABASES AND APPEND THEIR RESULTS
    for db in os.listdir(gdrive):
        if db.endswith('.db') and db != 'month_2014_01.db':
            cur.execute("ATTACH DATABASE ? AS tmp", [os.path.join(gdrive, db)])
            cur.execute("""INSERT INTO master_query
                           SELECT * FROM tmp.messages
                           INNER JOIN tmp.users
                               ON tmp.messages.id = tmp.users.id
                           WHERE text LIKE '%house%'
                        """)
            cur.execute("DETACH DATABASE tmp")
            conn.commit()

    # WRITE ROWS TO CSV
    data = cur.execute("SELECT * FROM master_query")
    with open(os.path.join(gdrive, 'Query_Results.csv'), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([i[0] for i in cur.description])  # HEADERS
        writer.writerows(data)                            # DATA

    cur.close()

Does pandas support reading data from multiple tables into a dataframe?

I'm using pandas to read SQL output into a dataframe. I'm calling a stored procedure which returns a table as output, and the following code works fine. If my stored procedure returns more than one table output [1], how can I read those into dataframes? I want to write the different table outputs into different Excel sheets.
query = 'exec [aa].[dbo].[sp_cc]?,?'
df = pd.read_sql(query, cnxn, params=[start, end])
writer = pd.ExcelWriter('output.xlsx')
df.to_excel(writer, index=False, sheet_name='customers')
writer.save()
[1]
CREATE procedure [dbo].[usp_vvvv] (....)
BEGIN
SET NOCOUNT ON
.....
select *
FROM #_temp_client_details
select *
FROM #_temp_address_details
select *
FROM #_temp_invoice_details
drop table #_temp_client_details
drop table #_temp_address_details
drop table #_temp_invoice_details
....
END TRY
BEGIN CATCH
..
END CATCH
END
I hope this can help you:
import pandas as pd
import pyodbc

conn = pyodbc.connect('driver={SQL Server};server=xxx.xxx.x.xxx;uid=myuser;pwd=mypass;database=mybd;autocommit=True')
cursor = conn.cursor()
cursor.execute('exec usp_with_2_select')

writer = pd.ExcelWriter('pandas_simple.xlsx', engine='xlsxwriter')

# first result set
column_names = [col[0] for col in cursor.description]
df1_data = []
for row in cursor.fetchall():
    df1_data.append({name: row[i] for i, name in enumerate(column_names)})
df1 = pd.DataFrame(df1_data)
print(df1)
df1.to_excel(writer, 'sheet1')

# advance to the next result set and re-read its column names
cursor.nextset()
column_names = [col[0] for col in cursor.description]
df2_data = []
for row in cursor.fetchall():
    df2_data.append({name: row[i] for i, name in enumerate(column_names)})
df2 = pd.DataFrame(df2_data)
print(df2)
df2.to_excel(writer, 'sheet2')

writer.save()
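If the procedure returns an arbitrary number of result sets, a sketch of a more general loop (assuming the same pyodbc cursor and ExcelWriter set up as above, run in place of the two hard-coded blocks, right after cursor.execute) could look like this:

sheet_num = 1
while True:
    # read the current result set's column names and rows
    columns = [col[0] for col in cursor.description]
    rows = [tuple(r) for r in cursor.fetchall()]
    df = pd.DataFrame.from_records(rows, columns=columns)
    df.to_excel(writer, sheet_name=f'sheet{sheet_num}')
    sheet_num += 1
    if not cursor.nextset():  # False when there are no more result sets
        break
writer.save()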
Why do you need pandas for this? You can go from SQL Server directly to Excel in many different ways. Here is one concept that will work for you; there are many ways to skin this cat...
Sub ADOExcelSQLServer()
    ' Carl SQL Server Connection
    '
    ' FOR THIS CODE TO WORK
    ' In the VBE you need to go Tools > References and check Microsoft ActiveX Data Objects 2.x Library
    '
    Dim Cn As ADODB.Connection
    Dim Server_Name As String
    Dim Database_Name As String
    Dim User_ID As String
    Dim Password As String
    Dim SQLStr As String
    Dim rs As ADODB.Recordset
    Set rs = New ADODB.Recordset

    Server_Name = "your_server_name"      ' Enter your server name here
    Database_Name = "NORTHWND"            ' Enter your database name here
    User_ID = ""                          ' Enter your user ID here
    Password = ""                         ' Enter your password here
    SQLStr = "SELECT * FROM [Customers]"  ' Enter your SQL here

    Set Cn = New ADODB.Connection
    Cn.Open "Driver={SQL Server};Server=" & Server_Name & ";Database=" & Database_Name & _
            ";Uid=" & User_ID & ";Pwd=" & Password & ";"
    rs.Open SQLStr, Cn, adOpenStatic

    ' Dump to spreadsheet
    For iCols = 0 To rs.Fields.Count - 1
        Worksheets("Sheet1").Cells(1, iCols + 1).Value = rs.Fields(iCols).Name
    Next

    With Worksheets("Sheet1").Range("A2:Z500")  ' Enter your sheet name and range here
        '.ClearContents
        .CopyFromRecordset rs
    End With

    ' Tidy up
    rs.Close
    Set rs = Nothing
    Cn.Close
    Set Cn = Nothing
End Sub

INSERT IGNORE doesn't work in my script

I have this Python script that inserts Excel data into a MySQL db, but I need to insert only rows that aren't duplicates, so I used the INSERT IGNORE SQL method and also tried ON DUPLICATE KEY UPDATE, but it doesn't work; it just inserts all the data into the table. This is the script:
import xlrd
import MySQLdb

# Setting up the database connection
database = MySQLdb.connect(host="localhost", user="root", passwd="****", db="python_insert")
cursor = database.cursor()
query = """INSERT IGNORE INTO test (masina, data_ora, conbustibil) VALUES (%s, %s, %s)"""

# Open and parse the Excel file
book = xlrd.open_workbook("asset/testing.xlsx")
sheet = book.sheet_by_name("Report")
for r in range(1, sheet.nrows):
    masina = sheet.cell(r, 1).value
    data_ora = sheet.cell(r, 0).value
    conbustibil = sheet.cell(r, 8).value
    values = (masina, data_ora, conbustibil)
    cursor.execute(query, values)

# Closing cursor + database
cursor.close()
database.commit()
database.close()
print("Success")

columns = str(sheet.ncols)
rows = str(sheet.nrows)
print("Imported " + columns + " columns and " + rows + " rows into MySQL")
