I need to query rows where a column matches my list of ~60K IDs out of a table that contains millions of IDs. I think normally you would insert a temporary table into the database and merge on that but I can't edit this database. I am doing it like this using a loop w/ a python wrapper, but is there a better way? I mean it works, but still:
import pyodbc
import pandas as pd
# connect to the database using windows authentication
conn = pyodbc.connect('DRIVER={SQL Server Native Client 11.0};SERVER=my_fav_server;DATABASE=my_fav_db;Trusted_Connection=yes;')
cursor = conn.cursor()
# read in all the ids
ids_list = [...60K ids in here..]
# query in 10K chunks to prevent memory error
def chunks(l,n):
# split list into n lists of evenish size
n = max(1,n)
return [l[i:i+n] for i in range(0,len(l), n)]
chunked_ids_lists = chunks(ids_list, 10000)
# looping through to retrieve all cols
for chunk_num, chunked_ids_list in enumerate(chunked_ids_lists):
temp_ids_string = "('" + "','".join(chunked_ids_list) + "')"
temp_sql = f"SELECT * FROM dbo.my_fav_table WHERE ID IN {temp_ids_string};"
temp_data = pd.read_sql_query(temp_sql, conn)
temp_path = f"temp_chunk_{chunk_num}.txt"
temp_data.to_csv(temp_path, sep='\t', index=None)
# read the query chunks
all_data_list = []
for chunk_num in range(len(chunked_ids_lists)):
temp_path = f"temp_chunk_{chunk_num}.txt"
temp_data = pd.read_csv(temp_path, sep='\t')
all_data_list.append(temp_data)
all_data = pd.concat(all_data_list)
Another way use Psycopg's cursor.
import psycopg2
# Connect to an existing database
conn = psycopg2.connect("dbname=test user=postgres")
# Open a cursor to perform database operations
cur = conn.cursor()
# get data from query
# no need construct 'SQL-correct syntax' filter
cur.execute("SELECT * FROM dbo.my_fav_table WHERE ID IN %(filter)s;", {"filter": chunked_ids_lists})
# loop over getted rows
for record in cur:
# we got one record
print(record) # or make other data treatment
Use parameters rather than concatenating strings.
I don't see the need for the CSV files, if you're just going to read them all into Python in the next loop. Just put everything into all_data_list during the query loop.
all_data_list = []
for chunk in chunked_ids_lists:
params = ','.join(['?'] * len(chunk))
sql = f"SELECT * FROM dbo.my_fav_table WHERE ID IN ({params});"
cursor.execute(sql, chunk)
rows = cursor.fetchall()
all_data_list.extend(rows)
all_data = pd.dataFrame(all_data_list)
Related
I am trying to execute a select query using python , which has over 200,000 results/rows. The same query which outputs 200,000 rows takes around 10-15 secs on the DB_ENGINE(Dbeaver) but while trying to execute it using python, it takes around 6 minutes. I am using psycopg2, SQLALCHEMY engine.
engine = create_engine(SQLALCHEMY_DATABASE_URI, poolclass=NullPool)
conn = engine.raw_connection()
cursor = conn.cursor()
cursor.execute(query)
Can anyone suggest ways to optimise it to the level of Dbeaver output's time.
Other ways tried:
Part 1:
df = pd.DataFrame()
dfl = []
for chunk in pd.read_sql_query( query, con=engine ,chunksize=5000):
print('chunk from PSQL', chunk)
# Start Appending Data Chunks from PSQL Result set into List
dfl.append(chunk)
# Start appending data from list to dataframe
df = pd.concat(dfl, ignore_index=True)
Part 2:
with tempfile.TemporaryFile() as tmpfile:
copy_sql = "COPY ({query}) TO STDOUT WITH CSV {head}".format(
query=query, head="HEADER"
)
conn = engine.raw_connection()
cur = conn.cursor()
cur.copy_expert(copy_sql, tmpfile)
tmpfile.seek(0)
df = pd.read_csv(tmpfile)
tmpfile.close()
I have about 100 .db files stored on my Google Drive which I want to run the same SQL query on. I'd like to store these query results in a single .csv file.
I've managed to use the following code to write the results of a single SQL query into a .csv file, but I am unable to make it work for multiple files.
conn = sqlite3.connect('/content/drive/My Drive/Data/month_2014_01.db')
df = pd.read_sql_query("SELECT * FROM messages INNER JOIN users ON messages.id = users.id WHERE text LIKE '%house%'", conn)
df.to_csv('/content/drive/My Drive/Data/Query_Results.csv')
This is the code that I have used so far to try and make it work for all files, based on this post.
databases = []
directory = '/content/drive/My Drive/Data/'
for filename in os.listdir(directory):
flname = os.path.join(directory, filename)
databases.append(flname)
for database in databases:
try:
with sqlite3.connect(database) as conn:
conn.text_factory = str
cur = conn.cursor()
cur.execute(row["SELECT * FROM messages INNER JOIN users ON messages.id = users.id WHERE text LIKE '%house%'"])
df.loc[index,'Results'] = cur.fetchall()
except sqlite3.Error as err:
print ("[INFO] %s" % err)
But this throws me an error: TypeError: tuple indices must be integers or slices, not str.
I'm obviously doing something wrong and I would much appreciate any tips that would point towards an answer.
Consider building a list of data frames, then concatenate them together in a single data frame with pandas.concat:
gdrive = "/content/drive/My Drive/Data/"
sql = """SELECT * FROM messages
INNER JOIN users ON messages.id = users.id
WHERE text LIKE '%house%'
"""
def build_df(db)
with sqlite3.connect(os.path.join(gdrive, db)) as conn:
df = pd.read_sql_query(sql, conn)
return df
# BUILD LIST OF DFs WITH LIST COMPREHENSION
df_list = [build_df(db) for db in os.listdir(gdrive) if db.endswith('.db')]
# CONCATENATE ALL DFs INTO SINGLE DF FOR EXPORT
final_df = pd.concat(df_list, ignore_index = True)
final_df.to_csv(os.path.join(gdrive, 'Query_Results.csv'), index = False)
Better yet, consider SQLite's ATTACH DATABASE and append query results into a master table. This also avoids using the heavy data science, third-party library, pandas, for simple data migration needs. Plus, you keep all database data inside SQLite without worrying about data type conversion and i/o transfer issues.
import csv
import sqlite3
with sqlite3.connect(os.path.join(gdrive, 'month_2014_01')) as conn:
# CREATE MASTER TABLE
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS master_query")
cur.execute("""CREATE TABLE master_query AS
SELECT * FROM tmp.messages
INNER JOIN tmp.users
ON tmp.messages.id = tmp.users.id
WHERE text LIKE '%house%'
""")
conn.commit()
# ITERATIVELY ATTACH AND APPEND RESULTS
for db in os.listdir(gdrive):
if db.endswith('.db'):
cur.execute("ATTACH DATABASE ? AS tmp", [db])
cur.execute("""INSERT INTO master_query
SELECT * FROM tmp.messages
INNER JOIN tmp.users
ON tmp.messages.id = tmp.users.id
WHERE text LIKE '%house%'
""")
cur.execute("DETACH DATABASE tmp")
conn.commit()
# WRITE TUPLE OF ROWS TO CSV
data = cur.execute("SELECT * FROM master_query")
with open(os.path.join(gdrive, 'Query_Results.csv'), 'wb') as f:
writer = csv.writer(f)
writer.writerow([i[0] for i in cur.description]) # HEADERS
writer.writerows(data) # DATA
cur.close()
I am trying to execute the following script. but I don't get neither the desired results nor a error message ,and I can't figure out where I'm doing wrong.
import pyodbc
cnxn = pyodbc.connect("Driver={SQL Server Native Client 11.0};"
"Server=mySRVERNAME;"
"Database=MYDB;"
"uid=sa;pwd=MYPWD;"
"Trusted_Connection=yes;")
cursor = cnxn.cursor()
cursor.execute('select DISTINCT firstname,lastname,coalesce(middlename,\' \') as middlename from Person.Person')
for row in cursor:
print('row = %r' % (row,))
any ideas ? any help is appreciated :)
You have to use a fetch method along with cursor. For Example
for row in cursor.fetchall():
print('row = %r' % (row,))
EDIT :
The fetchall function returns all remaining rows in a list.
If there are no rows, an empty list is returned.
If there are a lot of rows, *this will use a lot of memory.*
Unread rows are stored by the database driver in a compact format and are often sent in batches from the database server.
Reading in only the rows you need at one time will save a lot of memory.
If we are going to process the rows one at a time, we can use the cursor itself as an interator
Moreover we can simplify it since cursor.execute() always returns a cursor :
for row in cursor.execute("select bla, anotherbla from blabla"):
print row.bla, row.anotherbla
Documentation
I found this information useful to retrieve data from SQL database to python as a data frame.
import pandas as pd
import pymssql
con = pymssql.connect(server='use-et-aiml-cloudforte-aiops- db.database.windows.net',user='login_username',password='login_password',database='database_name')
cursor = con.cursor()
query = "SELECT * FROM <TABLE_NAME>"
cursor.execute(query)
df = pd.read_sql(query, con)
con.close()
df
import mysql.connector as mc
connection creation
conn = mc.connect(host='localhost', user='root', passwd='password')
print(conn)
#create cursor object
cur = conn.cursor()
print(cur)
cur.execute('show databases')
for i in cur:
print(i)
query = "Select * from employee_performance.employ_mod_recent"
emp_data = pd.read_sql(query, conn)
emp_data
I have a folder called 'testfolder' that includes two files -- 'Sigurdlogfile' and '2004ADlogfile'. Each file has a list of strings called entries. I need to run my code on both of them and am using glob to do this. My code creates a dictionary for each file and stores data extracted using regex where the dictionary keys are stored in commonterms below. Then it inserts each dictionary into a mysql table. It does all of this successfully, but my second sql statement is not inserting how it should (per file).
import glob
import re
files = glob.glob('/home/user/testfolder/*logfile*')
commonterms = (["freq", "\s?(\d+e?\d*)\s?"],
["tx", "#txpattern"],
["rx", "#rxpattern"], ...)
terms = [commonterms[i][0] for i in range(len(commonterms))]
patterns = [commonterms[i][1] for i in range(len(commonterms))]
def getTerms(entry):
for i in range(len(terms)):
term = re.search(patterns[i], entry)
if term:
term = term.groups()[0] if term.groups()[0] is not None else term.groups()[1]
else:
term = 'NULL'
d[terms[i]] += [term]
return d
for filename in files:
#code to create 'entries'
objkey = re.match(r'/home/user/testfolder/(.+?)logfile', filename).group(1)
d = {t: [] for t in terms}
for entry in entries:
d = getTerms(entry)
import MySQLdb
db = MySQLdb.connect(host='', user='', passwd='', db='')
cursor = db.cursor()
cols = d.keys()
vals = d.values()
for i in range(len(entries)):
lst = [item[i] for item in vals]
csv = "'{}'".format("','".join(lst))
sql1 = "INSERT INTO table (%s) VALUES (%s);" % (','.join(cols), csv.replace("'NULL'", "NULL"))
cursor.execute(sql1)
#now in my 2nd sql statement I need to update the table with data from an old table, which is where I have the problem...
sql2 = "UPDATE table, oldtable SET table.key1 = oldtable.key1,
table.key2 = oldtable.key2 WHERE oldtable.obj = %s;" % repr(objkey)
cursor.execute(sql2)
db.commit()
db.close()
The problem is that in the second sql statement, it ends up inserting that data into all columns of the table from only one of the objkeys, but I need it to insert different data depending on which file the code is currently running on. I can't figure out why this is, since I've defined objkey inside my for filename in files loop. How can I fix this?
Instead of doing separate INSERT and UPDATE, do them together to incorporate the fields from the old table.
for i in range(len(entries)):
lst = [item[i] for item in vals]
csv = "'{}'".format("','".join(lst))
sql1 = """INSERT INTO table (key1, key2, %s)
SELECT o.key1, o.key2, a.*
FROM (SELECT %s) AS a
LEFT JOIN oldtable AS o ON o.obj = %s""" % (','.join(cols), csv.replace("'NULL'", "NULL"), repr(objkey))
cursor.execute(sql1)
Writing a script to clean up some data. Super unoptimized but this cursor is
returning the number of results in the like query rather than the rows what am I doing wrong.
#!/usr/bin/python
import re
import MySQLdb
import collections
db = MySQLdb.connect(host="localhost", # your host, usually localhost
user="admin", # your username
passwd="", # your password
db="test") # name of the data base
# you must create a Cursor object. It will let
# you execute all the query you need
cur = db.cursor()
# Use all the SQL you like
cur.execute("SELECT * FROM vendor")
seen = []
# print all the first cell of all the rows
for row in cur.fetchall() :
for word in row[1].split(' '):
seen.append(word)
_digits = re.compile('\d')
def contains_digits(d):
return bool(_digits.search(d))
count_word = collections.Counter(seen)
found_multi = [i for i in count_word if count_word[i] > 1 and not contains_digits(i) and len(i) > 1]
unique_multiples = list(found_multi)
groups = dict()
for word in unique_multiples:
like_str = '%' + word + '%'
res = cur.execute("""SELECT * FROM vendor where name like %s""", like_str)
You are storing the result of cur.execute(), which is the number of rows. You are never actually fetching any of the results.
Use .fetchall() to get all result rows or iterate over the cursor after executing:
for word in unique_multiples:
like_str = '%' + word + '%'
cur.execute("""SELECT * FROM vendor where name like %s""", like_str)
for row in cur:
print row