Data extraction and transform efficiency - Python

I've got a Python script that connects to a MySQL database and executes a number of nested SELECT queries. It's basically a giant for loop. The database is structured such that Businesses have Menus, Menus have Sections, and Sections have Items. The script queries all the Businesses, and for each Business, it queries all of its Menus, and so on. It builds a big dictionary along the way that it then spits out as JSON.
It looks something like this:
#!/usr/bin/env python
from bottle import route, run
import mysql.connector
import json
import collections
import datetime


def getBusinesses():
    conn = mysql.connector.connect(user="APIUser", password="abc123", host="12.34.56.78", port="54321", database="businesses")
    cursor = conn.cursor()
    objects = {}
    businessesQuery = ("SELECT * FROM business")
    cursor.execute(businessesQuery)
    businessRows = cursor.fetchall()
    businessObjects = []
    for businessRow in businessRows:
        print(businessRow[0])
        businessDict = collections.OrderedDict()
        businessDict['id'] = businessRow[0]
        businessDict['business_name'] = businessRow[1]
        businessDict['business_address1'] = businessRow[2]
        businessDict['business_address2'] = businessRow[3]
        businessDict['business_city'] = businessRow[4]
        businessDict['business_state'] = businessRow[5]
        businessDict['business_zip'] = businessRow[6]
        businessObjects.append(businessDict)
        menuQuery = ("SELECT * FROM menu WHERE business_id = %s" % businessRow[0])
        cursor.execute(menuQuery)
        menuRows = cursor.fetchall()
        menuObjects = []
        for menuRow in menuRows:
            menuDict = collections.OrderedDict()
            menuDict['id'] = menuRow[0]
            menuDict['menu_name'] = menuRow[1]
            menuDict['menu_description'] = menuRow[2]
            menuDict['menu_note'] = menuRow[3]
            menuDict['business_id'] = menuRow[4]
            menuObjects.append(menuDict)
        businessDict['menus'] = menuObjects
        for menuIdx, menuRow in enumerate(menuRows):
            sectionQuery = ("SELECT * FROM menu_section WHERE menu_id = %s" % menuRow[0])
            cursor.execute(sectionQuery)
            sectionRows = cursor.fetchall()
            sectionObjects = []
            for sectionIdx, sectionRow in enumerate(sectionRows):
                sectionDict = collections.OrderedDict()
                sectionDict['id'] = sectionRow[0]
                sectionDict['section_name'] = sectionRow[1]
                sectionDict['section_note'] = sectionRow[2]
                sectionDict['section_description'] = sectionRow[3]
                sectionDict['menu_id'] = sectionRow[4]
                sectionObjects.append(sectionDict)
                businessDict['menus'][menuIdx]['sections'] = sectionObjects
                itemQuery = ("SELECT * FROM menu_item WHERE section_id = %s" % sectionRow[0])
                cursor.execute(itemQuery)
                itemRows = cursor.fetchall()
                itemObjects = []
                for itemIdx, itemRow in enumerate(itemRows):
                    itemDict = collections.OrderedDict()
                    itemDict['id'] = itemRow[0]
                    itemDict['item_name'] = itemRow[1]
                    itemDict['item_description'] = itemRow[2]
                    itemDict['item_note'] = itemRow[3]
                    itemDict['item_price'] = itemRow[4]
                    itemDict['section_id'] = itemRow[5]
                    itemObjects.append(itemDict)
                businessDict['menus'][menuIdx]['sections'][sectionIdx]['items'] = itemObjects
    objects['businesses'] = businessObjects
    return objects


@route('/test')
def index():
    return json.dumps(getBusinesses())

run(host='192.168.1.70', port=7070)
I want to know if this is an efficient way of doing things. When I deployed my database remotely (WebFaction) and ran the Bottle server locally, it took almost 40 seconds to return a few hundred rows. So it seems like something is amiss. I have a gut feeling that there could be a better way of doing this. Just not sure what that way is!

If I had to venture a guess: notice that the rough structure of your code is:
def getBusinesses():
    businessesQuery = ("SELECT * FROM business")
    businessRows = cursor.fetchall()
    businessObjects = []
    for businessRow in businessRows:
        menuQuery = ("SELECT * FROM menu WHERE business_id = %s" % businessRow[0])
        menuRows = cursor.fetchall()
        for menuIdx, menuRow in enumerate(menuRows):
            sectionQuery = ("SELECT * FROM menu_section WHERE menu_id = %s" % menuRow[0])
            cursor.execute(sectionQuery)
            sectionRows = cursor.fetchall()
            sectionObjects = []
            for sectionIdx, sectionRow in enumerate(sectionRows):
                itemQuery = ("SELECT * FROM menu_item WHERE section_id = %s" % sectionRow[0])
                itemRows = cursor.fetchall()
That is, you execute nearly identical queries in a loop for menu, menu_section and especially menu_item. Also, you're using fetchall() to return the full contents of the result set but examine each element only once, in a loop, where you create another list of objects.
What you might want instead is something more like:
businesses = []
cursor.execute("select * from business")
row = cursor.fetchone()
while row is not None:
    businesses.append(...(row))
    row = cursor.fetchone()

cursor.execute("select * from menu")
row = cursor.fetchone()
while row is not None:
    businesses[row['business_id']].menus.append(...(row))
    row = cursor.fetchone()

cursor.execute("select menu.business_id, menu_section.*"
               " from menu_section"
               " join menu on menu.id = menu_section.menu_id")
row = cursor.fetchone()
while row is not None:
    businesses[row['business_id']][row['menu_id']].sections.append(...(row))
    row = cursor.fetchone()

cursor.execute("select menu.business_id, menu_section.menu_id, menu_item.*"
               " from menu_item"
               " join menu_section on menu_section.id = menu_item.section_id"
               " join menu on menu.id = menu_section.menu_id")
row = cursor.fetchone()
while row is not None:
    businesses[row['business_id']][row['menu_id']][row['section_id']].items.append(...(row))
    row = cursor.fetchone()
That way you're issuing a much smaller number of queries, and only loading the amount of data you can process in one go.
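If it helps to make the assembly step concrete, here's a minimal sketch of the same idea using mysql.connector's dictionary cursor. It keeps lookup dicts keyed by primary id, so the extra join columns above aren't strictly needed; the table and column names follow the question's schema, and get_businesses is a hypothetical helper:

# Sketch: four queries total, with lookup dicts to wire children to parents.
def get_businesses(conn):
    cursor = conn.cursor(dictionary=True)  # rows come back as dicts

    businesses = {}  # business id -> business dict
    cursor.execute("SELECT * FROM business")
    for row in cursor.fetchall():
        row['menus'] = []
        businesses[row['id']] = row

    menus = {}  # menu id -> menu dict
    cursor.execute("SELECT * FROM menu")
    for row in cursor.fetchall():
        row['sections'] = []
        menus[row['id']] = row
        businesses[row['business_id']]['menus'].append(row)

    sections = {}  # section id -> section dict
    cursor.execute("SELECT * FROM menu_section")
    for row in cursor.fetchall():
        row['items'] = []
        sections[row['id']] = row
        menus[row['menu_id']]['sections'].append(row)

    cursor.execute("SELECT * FROM menu_item")
    for row in cursor.fetchall():
        sections[row['section_id']]['items'].append(row)

    return {'businesses': list(businesses.values())}

That's four round-trips regardless of how many businesses there are, and each row is touched exactly once.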

Related

Why won't this loop?

Newbie working my way through Python and SQL with MariaDB.
Why won't this loop? It updates the first record only. I'm pretty much a hack at this...
cursor1.execute("select recordid, mvavgvol, quote_usd_volume_change_24h from pumped")
records = cursor1.fetchall()
for x in records:
rid = (x[0])
m = (x[1])
o = (x[2])
if (m >= o):
result = 0
else:
result = 1
upd_data=(result,rid)
sql1 = ("UPDATE pumped SET mvavgvolcheck = %s WHERE recordid = %s")
cursor2.execute(sql1,upd_data)
conn.commit()
Since you are fetching multiple rows, you should store the computed values in a list and use the cursor's executemany() method instead:
data = []
for x in records:
    rid = x[0]
    result = int(x[1] < x[2])
    data += [(result, rid)]

cursor.executemany("UPDATE pumped SET mvavgvolcheck = %s WHERE recordid = %s", data)
When using the mariadb Python module (MariaDB Connector/Python) this is much more effective, since it reduces network traffic: instead of sending n update commands in a loop (where n is the number of rows in table pumped), only one command will be sent to the server.
conn = mariadb.connect(host=Host, port=Port, user=User, password=Password, database=database)
cursor1 = conn.cursor()
cursor2 = conn.cursor()
cursor1.execute("select recordid, mvavgvol, quote_usd_volume_change_24h from pumped")
records = cursor1.fetchall()
for x in records:
    rid = x[0]
    m = x[1]
    o = x[2]
    if m >= o:
        result = 0
        cursor2.execute("UPDATE pumped SET mvavgvolcheck = %s WHERE recordid = %s", (result, rid))
        conn.commit()
    else:
        result = 1
        cursor2.execute("UPDATE pumped SET mvavgvolcheck = %s WHERE recordid = %s", (result, rid))
        conn.commit()
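For reference, a minimal end-to-end sketch of the batched version; the host and credentials are placeholders, and the flag computation mirrors the if/else above:

import mariadb  # MariaDB Connector/Python

conn = mariadb.connect(host="localhost", port=3306, user="user",
                       password="secret", database="mydb")  # placeholder credentials
cursor = conn.cursor()

cursor.execute("SELECT recordid, mvavgvol, quote_usd_volume_change_24h FROM pumped")
# result is 1 when mvavgvol < quote_usd_volume_change_24h, else 0,
# matching the original if m >= o / else logic
data = [(int(m < o), rid) for rid, m, o in cursor.fetchall()]

cursor.executemany("UPDATE pumped SET mvavgvolcheck = %s WHERE recordid = %s", data)
conn.commit()  # one commit for the whole batch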

How to execute the same query more than once with different data?

I'm trying to execute the same query with different data, but I only get data back the first time. The other times, despite there being data for the queries in the database, MySQL returns empty results.
This is the code:
def get_team_colour_map(self, players, id_competition):
    tcm = FIBAColourMap()
    for p in players:
        args = [p["id"], id_competition]
        conn = pymysql.Connect(host=DDBB.DDBB_FIBA_HOST,
                               user=DDBB.DDBB_FIBA_USER,
                               password=DDBB.DDBB_FIBA_PSWD,
                               db=DDBB.DDBB_FIBA_NAME,
                               charset=DDBB.DDBB_FIBA_CHARSET,
                               cursorclass=pymysql.cursors.DictCursor)
        with conn.cursor() as cursor:
            print("id player: {}".format(p["id"]))
            print("args: {}".format(args))
            cursor.execute("select sc.* from tbl030_shots_chart sc, tbl006_player_team pt, tbl007_game g, tbl004_jornada j, tbl012_competition c where pt.id = %s and pt.id_player_feb = sc.id_fiba and sc.id_game = g.id and g.id_jornada = j.id and j.id_competition = c.id and c.id = %s", args)
            data = cursor.fetchall()
            print("data: {}".format(data))
            print("Total rows: {}".format(cursor.rowcount))
            if cursor.rowcount > 0:
                for s in data:
                    x = float(FIBASCReport.adjust_x(s["x"]))
                    y = float(FIBASCReport.adjust_y(s["y"]))
                    color = tcm.image.getpixel((x, y))
                    color = ("#%02x%02x%02x" % color).upper()
                    if tcm.exists_color(color):
                        if int(s["m"]) == 0:
                            tcm.set_scored_shots(color, 1)
                        else:
                            tcm.set_failed_shots(color, 1)
                    else:
                        if int(s["m"]) == 0:
                            tcm.set_scored_shots("OTROS", 1)
                        else:
                            tcm.set_failed_shots("OTROS", 1)
            else:
                # tcm = None
                print("Player with id: {} has NOT taken any shot in competition: {}".format(p["id"], id_competition))
    return tcm
In this code, cursor.fetchall() returns data for the first query, but the following queries return empty results.
How can I run several queries? I'm using MySQL 8.0 and Python 3.6.
It's because you are using the same cursor each time. Create a new instance of the cursor on each pass through the loop to execute the query. After the first query runs, the cursor is already positioned after all the data, hence no rows are returned after that.
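For what it's worth, a minimal sketch of that suggestion, with query standing in for the question's SELECT statement:

for p in players:
    with conn.cursor() as cursor:  # fresh cursor instance on every iteration
        cursor.execute(query, (p["id"], id_competition))
        data = cursor.fetchall()   # consume the whole result set before the next query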
You can also try this:
Look at the documentation for MySQLCursor.execute().
It claims that you can pass in a multi parameter that allows you to run multiple queries in one string.
If multi is set to True, execute() is able to execute multiple statements specified in the operation string.
multi is an optional second parameter to the execute() call:
operation = 'SELECT 1; INSERT INTO t1 VALUES (); SELECT 2'
for result in cursor.execute(operation, multi=True):
    if result.with_rows:           # the SELECT statements produce result sets
        print(result.fetchall())

Python code not creating tables on the database but able to query the results postgres

My use case is to create a temp table in the Postgres database, fetch records from it, and insert them into a different table.
The code I used is:
from __future__ import print_function
import psycopg2
import sys
import pprint
from os.path import join, dirname, abspath
import xlrd
import os.path

newlist = []
itemidlist = []

def main():
    conn_string = "host='prod-dump.cvv9i14mrv4k.us-east-1.rds.amazonaws.com' dbname='ebdb' user='ebroot' password='*********'"
    # print the connection string we will use to connect
    # print "Connecting to database" % (conn_string)
    # get a connection; if a connection cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)
    # conn.cursor will return a cursor object; you can use this cursor to perform queries
    cursor = conn.cursor()
    dealer_id = input("Please enter dealer_id: ")
    group_id = input("Please enter group_id: ")
    scriptpath = os.path.dirname('__file__')
    filename = os.path.join(scriptpath, 'Winco - Gusti.xlsx')
    xl_workbook = xlrd.open_workbook(filename, "rb")
    xl_sheet = xl_workbook.sheet_by_index(0)
    print('Sheet Name: %s' % xl_sheet.name)
    row = xl_sheet.row(0)
    from xlrd.sheet import ctype_text
    print('(Column #) type:value')
    for idx, cell_obj in enumerate(row):
        cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
        # print('(%s) %s %s' % (idx, cell_type_str, cell_obj.value))
    num_cols = xl_sheet.ncols
    for row_idx in range(0, xl_sheet.nrows):  # Iterate through rows
        num_cols = xl_sheet.ncols
        id_obj = xl_sheet.cell(row_idx, 1)  # Get cell object by row, col
        itemid = id_obj.value
        # if itemid not in itemidlist:
        itemidlist.append(itemid)
    # execute our Query
    '''
    cursor.execute("""
    if not exists(SELECT 1 FROM model_enable AS c WHERE c.name = %s);
    BEGIN;
    INSERT INTO model_enable (name) VALUES (%s)
    END;
    """ % (itemid, itemid))
    '''
    cursor.execute("drop table temp_mbp1")
    try:
        cursor.execute("SELECT p.model_no, pc.id as PCid, g.id AS GROUPid into public.temp_mbp1 FROM products p, \
            model_enable me, products_clients pc, groups g WHERE p.model_no = me.name \
            and p.id = pc.product_id and pc.client_id = %s and pc.client_id = g.client_id and g.id = %s" \
            % (dealer_id, group_id))
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    cursor.execute("select count(*) from public.temp_mbp1")
    # retrieve the records from the database
    records = cursor.fetchall()
    # print out the records using pretty print
    # note that the NAMES of the columns are not shown, instead just indexes.
    # for most people this isn't very useful so we'll show you how to return
    # columns as a dictionary (hash) in the next example.
    pprint.pprint(records)

if __name__ == "__main__":
    main()
The try/except block in the middle of the program is not throwing any error, but the table is not getting created in the Postgres database, as I can see in the data admin.
The output shown is:
Please enter dealer_id: 90
Please enter group_id: 13
Sheet Name: Winco Full 8_15_17
(Column #) type:value
[(3263,)]
Thanks,
Santosh
You didn't commit the changes, so they aren't saved in the database. Add to the bottom, just below the pprint statement:
conn.commit()
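As a side note, psycopg2 connections can also be used as context managers: a with conn: block commits the transaction on a clean exit and rolls back on an exception. A minimal sketch, reusing the script's conn_string:

import pprint
import psycopg2

# The transaction is committed when the `with conn:` block exits without an
# exception; the connection itself stays open until you close it explicitly.
with psycopg2.connect(conn_string) as conn:  # conn_string as in the question
    with conn.cursor() as cursor:
        cursor.execute("select count(*) from public.temp_mbp1")
        pprint.pprint(cursor.fetchall())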

Unable to create a second dataframe python pandas

My second data frame is not loading values when I create it. Any idea why it is not working? When I make my cursor a list, it has a bunch of values in it, but for whatever reason, when I try to do a normal data frame load with pandas a second time, it does not work.
My code:
import pandas as pd
import pyodbc

conn = pyodbc.connect(constr, autocommit=True)
cursor = conn.cursor()
secondCheckList = []
checkCount = 0
maxValue = 0
strsql = "SELECT * FROM CRMCSVFILE"
cursor = cursor.execute(strsql)
cols = []
SQLupdateNewIdField = "UPDATE CRMCSVFILE SET NEW_ID = ? WHERE Email_Address_Txt = ? OR TELEPHONE_NUM = ? OR DRIVER_LICENSE_NUM = ?"
for row in cursor.description:
    cols.append(row[0])
df = pd.DataFrame.from_records(cursor)
df.columns = cols
newIdInt = 1
for row in range(len(df['Email_Address_Txt'])):
    # run initial search to figure out the max number of records. Look for email,
    # phone, and drivers license; names have a chance not to be unique
    SQLrecordCheck = "SELECT * FROM CRMCSVFILE WHERE Email_Address_Txt = '" + str(df['Email_Address_Txt'][row]) + "' OR TELEPHONE_NUM = '" + str(df['Telephone_Num'][row]) + "' OR DRIVER_LICENSE_NUM = '" + str(df['Driver_License_Num'][row]) + "'"
    # print(SQLrecordCheck)
    cursor = cursor.execute(SQLrecordCheck)
    # maxValue is indeed a list filled with records
    maxValue = list(cursor)
    # THIS IS WHERE THE PROBLEM OCCURS
    tempdf = pd.DataFrame.from_records(cursor)
Why not just use pd.read_sql_query("your_query", conn)? It will return the result of the query as a DataFrame and requires less code. Also, you set cursor to cursor.execute(strsql) at the top, and then you try to call execute on cursor again in your for loop; but you can no longer call execute on it, so you will have to set cursor = conn.cursor() again.
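For instance, a minimal sketch of the read_sql_query route, reusing the question's constr connection string:

import pandas as pd
import pyodbc

conn = pyodbc.connect(constr, autocommit=True)  # constr as defined in the question
df = pd.read_sql_query("SELECT * FROM CRMCSVFILE", conn)  # column names come for free

# later look-ups can be parameterized the same way:
tempdf = pd.read_sql_query(
    "SELECT * FROM CRMCSVFILE WHERE Email_Address_Txt = ?", conn,
    params=[df['Email_Address_Txt'][0]])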

Python cursor is returning number of rows instead of rows

Writing a script to clean up some data. It's super unoptimized, but this cursor is returning the number of results of the LIKE query rather than the rows. What am I doing wrong?
#!/usr/bin/python
import re
import MySQLdb
import collections

db = MySQLdb.connect(host="localhost",  # your host, usually localhost
                     user="admin",      # your username
                     passwd="",         # your password
                     db="test")         # name of the data base

# you must create a Cursor object. It will let
# you execute all the queries you need
cur = db.cursor()

# Use all the SQL you like
cur.execute("SELECT * FROM vendor")

seen = []
# collect every word from the second column of every row
for row in cur.fetchall():
    for word in row[1].split(' '):
        seen.append(word)

_digits = re.compile(r'\d')
def contains_digits(d):
    return bool(_digits.search(d))

count_word = collections.Counter(seen)
found_multi = [i for i in count_word if count_word[i] > 1 and not contains_digits(i) and len(i) > 1]
unique_multiples = list(found_multi)

groups = dict()
for word in unique_multiples:
    like_str = '%' + word + '%'
    res = cur.execute("""SELECT * FROM vendor where name like %s""", like_str)
You are storing the result of cur.execute(), which is the number of rows. You are never actually fetching any of the results.
Use .fetchall() to get all result rows or iterate over the cursor after executing:
for word in unique_multiples:
    like_str = '%' + word + '%'
    cur.execute("""SELECT * FROM vendor where name like %s""", like_str)
    for row in cur:
        print(row)
