Unable to create a second dataframe python pandas - python

My second data frame is not loading values when i create it. Any help with why it is not working? When i make my cursor a list, it has a bunch of values in it, but for whatever reason when i try to do a normal data frame load with pandas a second time, it does not work.
My code:
conn = pyodbc.connect(constr, autocommit=True)
cursor = conn.cursor()
secondCheckList = []
checkCount = 0
maxValue = 0
strsql = "SELECT * FROM CRMCSVFILE"
cursor = cursor.execute(strsql)
cols = []
SQLupdateNewIdField = "UPDATE CRMCSVFILE SET NEW_ID = ? WHERE Email_Address_Txt = ? OR TELEPHONE_NUM = ? OR DRIVER_LICENSE_NUM = ?"
for row in cursor.description:
cols.append(row[0])
df = pd.DataFrame.from_records(cursor)
df.columns = cols
newIdInt = 1
for row in range(len(df['Email_Address_Txt'])):
#run initial search to figure out the max number of records. Look for email, phone, and drivers license, names have a chance not to be unique
SQLrecordCheck = "SELECT * FROM CRMCSVFILE WHERE Email_Address_Txt = '" + str(df['Email_Address_Txt'][row]) + "' OR TELEPHONE_NUM = '" + str(df['Telephone_Num'][row]) + "' OR DRIVER_LICENSE_NUM = '" + str(df['Driver_License_Num'][row]) + "'"
## print(SQLrecordCheck)
cursor = cursor.execute(SQLrecordCheck)
## maxValue is indeed a list filled with records
maxValue =(list(cursor))
## THIS IS WHERE PROBLEM OCCURS
tempdf = pd.DataFrame.from_records(cursor)

Why not just use pd.read_sql_query("your_query", conn) this will return the result of the query as a dataframe and requires less code. Also you set cursor to cursor.execute(strsql) at the top and then you are trying to call execute on cursor again in your for loop but you can no longer call execute on cursor you will have to set cursor = conn.cursor() again.

Related

SQLAlchemy Retrieving Different Columns

Here is my code:
import pandas as pd
from sqlalchemy import create_engine
db_username = "my_username"
db_pw = "my_password"
db_to_use = "my_database"
#####
engine = create_engine(
"postgresql://" +
db_username + ":" +
db_pw +
"#localhost:5432/" +
db_to_use
)
#####
connection = engine.connect()
fac_id_list = connection.execute (
"""
select distinct
a.name,
replace(regexp_replace(regexp_replace(a.logo_url,'.*/logo','','i'),'production.*','','i'),'/','') as new_logo
from
sync_locations as a
inner join
public.vw_locations as b
on
a.name = b.location_name
order by
new_logo
"""
)
I want to put the results of fac_id_list into two separate lists. One list will contain all of the values from a.name and the other new_logo.
How can I do this?
sql_results = []
for row in fac_id_list:
sql_results.append(row)
This puts every column in my SQL query into a list, but I want them separated.
When you loop over the results, you can spread them into separate variables and append them to the corresponding lists
names = []
logos = []
for name, logo in fac_id_list:
names.append(name)
logos.append(log)

More efficient way to query this SQL table from python?

I need to query rows where a column matches my list of ~60K IDs out of a table that contains millions of IDs. I think normally you would insert a temporary table into the database and merge on that but I can't edit this database. I am doing it like this using a loop w/ a python wrapper, but is there a better way? I mean it works, but still:
import pyodbc
import pandas as pd
# connect to the database using windows authentication
conn = pyodbc.connect('DRIVER={SQL Server Native Client 11.0};SERVER=my_fav_server;DATABASE=my_fav_db;Trusted_Connection=yes;')
cursor = conn.cursor()
# read in all the ids
ids_list = [...60K ids in here..]
# query in 10K chunks to prevent memory error
def chunks(l,n):
# split list into n lists of evenish size
n = max(1,n)
return [l[i:i+n] for i in range(0,len(l), n)]
chunked_ids_lists = chunks(ids_list, 10000)
# looping through to retrieve all cols
for chunk_num, chunked_ids_list in enumerate(chunked_ids_lists):
temp_ids_string = "('" + "','".join(chunked_ids_list) + "')"
temp_sql = f"SELECT * FROM dbo.my_fav_table WHERE ID IN {temp_ids_string};"
temp_data = pd.read_sql_query(temp_sql, conn)
temp_path = f"temp_chunk_{chunk_num}.txt"
temp_data.to_csv(temp_path, sep='\t', index=None)
# read the query chunks
all_data_list = []
for chunk_num in range(len(chunked_ids_lists)):
temp_path = f"temp_chunk_{chunk_num}.txt"
temp_data = pd.read_csv(temp_path, sep='\t')
all_data_list.append(temp_data)
all_data = pd.concat(all_data_list)
Another way use Psycopg's cursor.
import psycopg2
# Connect to an existing database
conn = psycopg2.connect("dbname=test user=postgres")
# Open a cursor to perform database operations
cur = conn.cursor()
# get data from query
# no need construct 'SQL-correct syntax' filter
cur.execute("SELECT * FROM dbo.my_fav_table WHERE ID IN %(filter)s;", {"filter": chunked_ids_lists})
# loop over getted rows
for record in cur:
# we got one record
print(record) # or make other data treatment
Use parameters rather than concatenating strings.
I don't see the need for the CSV files, if you're just going to read them all into Python in the next loop. Just put everything into all_data_list during the query loop.
all_data_list = []
for chunk in chunked_ids_lists:
params = ','.join(['?'] * len(chunk))
sql = f"SELECT * FROM dbo.my_fav_table WHERE ID IN ({params});"
cursor.execute(sql, chunk)
rows = cursor.fetchall()
all_data_list.extend(rows)
all_data = pd.dataFrame(all_data_list)

Data extraction and transform efficiency

I've got a Python script that connects to a MySQL database and executes a number of nested SELECT queries. It's basically a giant for loop. The database is structured such that Businesses have Menus, Menus have Sections, and Sections have Items. The script queries all the Businesses, and for each Business, it queries all of its Menus, and so on. It builds a big dictionary along the way that it then spits out as JSON.
It looks something like this:
#!/usr/bin/env python
from bottle import route, run
import mysql.connector
import json
import collections
import datetime
def getBusinesses():
conn = mysql.connector.connect(user="APIUser", password="abc123", host="12.34.56.78", port="54321", database="businesses")
cursor = conn.cursor()
objects = {}
businessesQuery = ("SELECT * FROM business")
cursor.execute(businessesQuery)
businessRows = cursor.fetchall()
businessObjects = []
for businessRow in businessRows:
print businessRow[0]
businessDict = collections.OrderedDict()
businessDict['id'] = businessRow[0]
businessDict['business_name'] = businessRow[1]
businessDict['business_address1'] = businessRow[2]
businessDict['business_address2'] = businessRow[3]
businessDict['business_city'] = businessRow[4]
businessDict['business_state'] = businessRow[5]
businessDict['business_zip'] = businessRow[6]
businessObjects.append(businessDict)
menuQuery = ("SELECT * FROM menu WHERE business_id = %s" % businessRow[0])
cursor.execute(menuQuery)
menuRows = cursor.fetchall()
menuObjects = []
for menuRow in menuRows:
menuDict = collections.OrderedDict()
menuDict['id'] = menuRow[0]
menuDict['menu_name'] = menuRow[1]
menuDict['menu_description'] = menuRow[2]
menuDict['menu_note'] = menuRow[3]
menuDict['business_id'] = menuRow[4]
menuObjects.append(menuDict)
businessDict['menus'] = menuObjects
for menuIdx, menuRow in enumerate(menuRows):
sectionQuery = ("SELECT * FROM menu_section WHERE menu_id = %s" % menuRow[0])
cursor.execute(sectionQuery)
sectionRows = cursor.fetchall()
sectionObjects = []
for sectionIdx, sectionRow in enumerate(sectionRows):
sectionDict = collections.OrderedDict()
sectionDict['id'] = sectionRow[0]
sectionDict['section_name'] = sectionRow[1]
sectionDict['section_note'] = sectionRow[2]
sectionDict['section_description'] = sectionRow[3]
sectionDict['menu_id'] = sectionRow[4]
sectionObjects.append(sectionDict)
businessDict['menus'][menuIdx]['sections'] = sectionObjects
itemQuery = ("SELECT * FROM menu_item WHERE section_id = %s" % sectionRow[0])
cursor.execute(itemQuery)
itemRows = cursor.fetchall()
itemObjects = []
for itemIdx, itemRow in enumerate(itemRows):
itemDict = collections.OrderedDict()
itemDict['id'] = itemRow[0]
itemDict['item_name'] = itemRow[1]
itemDict['item_description'] = itemRow[2]
itemDict['item_note'] = itemRow[3]
itemDict['item_price'] = itemRow[4]
itemDict['section_id'] = itemRow[5]
itemObjects.append(itemDict)
businessDict['menus'][menuIdx]['sections'][sectionIdx]['items'] = itemObjects
objects['businesses'] = businessObjects
return objects
#route('/test')
def index():
return json.dumps(getBusinesses())
run(host='192.168.1.70', port=7070)
I want to know if this is an efficient way of doing things. When I deployed my database remotely (WebFaction) and ran the Bottle server locally, it took almost 40 seconds to return a few hundred rows. So it seems like something is amiss. I have a gut feeling that there could be a better way of doing this. Just not sure what that way is!
if I had to venture a guess: notice the rough structure of your code is:
def getBusinesses():
businessesQuery = ("SELECT * FROM business")
businessRows = cursor.fetchall()
businessObjects = []
for businessRow in businessRows:
menuQuery = ("SELECT * FROM menu WHERE business_id = %s" % businessRow[0])
menuRows = cursor.fetchall()
for menuIdx, menuRow in enumerate(menuRows):
sectionQuery = ("SELECT * FROM menu_section WHERE menu_id = %s" % menuRow[0])
cursor.execute(sectionQuery)
sectionRows = cursor.fetchall()
sectionObjects = []
for sectionIdx, sectionRow in enumerate(sectionRows):
itemQuery = ("SELECT * FROM menu_item WHERE section_id = %s" % sectionRow[0])
itemRows = cursor.fetchall()
That is, you execute nearly identical queries in a loop for menu, menu_section and especially menu_item. Also, you're using fetchall() to return the full contents of the result set but examine each element only once, in a loop, where you create another list of objects.
what you might want instead is something more like:
businesses = []
cursor.execute("select * from business")
row = cursor.fetchone()
while row is not None:
business.append(...(row))
row = cursor.fetchone()
cursor.execute("select * from menu")
row = cursor.fetchone()
while row is not None:
business[row['business_id']].menus.append(...(row))
row = cursor.fetchone()
cursor.execute("select menu.business_id, menu_section.*"
" from menu_section"
" join menu on menu.id = menu_section.menu_id")
row = cursor.fetchone()
while row is not None:
business[row['business_id']][row['menu_id']].sections.append(...(row))
row = cursor.fetchone()
cursor.execute("select menu.business_id, menu_section.menu_id, menu_item.*"
" from menu_item"
" join menu_section on menu_section.id = menu_item.section_id"
" join menu on menu.id = menu_section.menu_id")
row = cursor.fetchone()
while row is not None:
business[row['business_id']][row['menu_id']][row['section_id'].items.append(...(row))
row = cursor.fetchone()
so that you're issuing a much smaller number of queries, and only loading the amount of data you can process in one go.

Python cursor is returning number of rows instead of rows

Writing a script to clean up some data. Super unoptimized but this cursor is
returning the number of results in the like query rather than the rows what am I doing wrong.
#!/usr/bin/python
import re
import MySQLdb
import collections
db = MySQLdb.connect(host="localhost", # your host, usually localhost
user="admin", # your username
passwd="", # your password
db="test") # name of the data base
# you must create a Cursor object. It will let
# you execute all the query you need
cur = db.cursor()
# Use all the SQL you like
cur.execute("SELECT * FROM vendor")
seen = []
# print all the first cell of all the rows
for row in cur.fetchall() :
for word in row[1].split(' '):
seen.append(word)
_digits = re.compile('\d')
def contains_digits(d):
return bool(_digits.search(d))
count_word = collections.Counter(seen)
found_multi = [i for i in count_word if count_word[i] > 1 and not contains_digits(i) and len(i) > 1]
unique_multiples = list(found_multi)
groups = dict()
for word in unique_multiples:
like_str = '%' + word + '%'
res = cur.execute("""SELECT * FROM vendor where name like %s""", like_str)
You are storing the result of cur.execute(), which is the number of rows. You are never actually fetching any of the results.
Use .fetchall() to get all result rows or iterate over the cursor after executing:
for word in unique_multiples:
like_str = '%' + word + '%'
cur.execute("""SELECT * FROM vendor where name like %s""", like_str)
for row in cur:
print row

is it possible to change ms access table name with python

I have several ms access databases that each have a table named PlotStatus-name-3/13/12.
I need to import each of these tables into a .csv table. If I manually change the name of the tables to PlotStatus_name_3_13_12, this code works. Does anyone know how to change the table namees using python?
#connect to access database
for filename in os.listdir(prog_rep_local):
if filename[-6:] == ".accdb":
DBtable = os.path.join(prog_rep_local, filename)
conn = pyodbc.connect(r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=' + DBtable)
cursor = conn.cursor()
ct = cursor.tables
for row in ct():
rtn = row.table_name
if rtn[:10] == "PlotStatus":
#this does not work:
#Oldpath = os.path.join(prog_rep_local, filename, rtn)
#print Oldpath
#fpr = Oldpath.replace('-', '_')#.replace("/","_")
#print fpr
#newname = os.rename(Oldpath, fpr) this does not work
#print newname
#spqaccdb = "SELECT * FROM " + newname
#this workds if I manually change the table names in advance
sqlaccdb = "SELECT * FROM " + rtn
print sqlaccdb
cursor.execute(sqlaccdb)
rows = cursor.fetchall()
An easier solution would be to just add brackets around the table name so that the /s don't throw off the SQL command interpreter.
sqlaccdb = "SELECT * FROM [" + rtn + "]"

Categories

Resources