I am using Python3 and connecting to Impala DB using impala package as below:
#!/usr/bin/python3
import pandas as pd
from impala.dbapi import connect
from impala.util import as_pandas
import sys
def pull_from_dw(dw_conn, qry,qryparams):
cur = dw_conn.cursor()
if (qryparams is None):
cur.execute(qry)
else:
cur.execute(qry,qryparams)
custdata=as_pandas(cur)
return custdata
x = sys.argv[1]
query_str="select * from <table_name> where <column_name> = '{}';"
print(query_str)
dw_conn = connect(host='10.xxx.xx.xx', port=21050, use_ssl=True,
user='<username>',
password='<password>',
auth_mechanism='LDAP')
df = pull_from_dw(dw_conn,query_str,x)
print(df)
I can substitute directly by specifying .format(x) in the sql query. However, I need to variable substitution in the calling function df = pull_from_dw(dw_conn,query_str,x) and getting error as below. Please assist:
$ /usr/bin/python3 script1.py 'abc'
impala.error.ProgrammingError: Query parameters argument should be a
list, tuple, or dict object
You are passing a string to qryparams while it takes a list tuple or dictionary. pass it the following and It should fix your issue:
pull_from_dw(dw_conn, qry,[x])
Related
My query_distinct_data() function executes successfully when run.
But when I try to import the query_distinct_data() using Jupyter notebook from my function page map_distinct_data on to my main page I get the following error.
NameError: name 'athena' is not defined
Below is my main page below
import pandas as pd
import requests
import xml.etree.ElementTree as ET
from datetime import date
import boto3
import time
import geopandas
import folium
from ipynb.fs.full.qld_2 import qld_data
from ipynb.fs.full.vic_2 import vic_data
from ipynb.fs.full.put_to_s3_bucket import put_to_s3_bucket
from ipynb.fs.full.map_distinct_data import query_distinct_data
from ipynb.fs.full.map_distinct_data import distinct_data_df
from ipynb.fs.full.map_distinct_data import create_distinct_data_map
aws_region = "ap-southeast-2"
schema_name = "fire_data"
table_name ='rfs_fire_data'
result_output_location = "s3://camgoo2-rfs-visualisation/query_results/"
bucket='camgoo2-rfs-visualisation'
athena = boto3.client("athena",region_name=aws_region)
qld_data()
vic_data()
put_to_s3_bucket()
execution_id = query_distinct_data()
df = distinct_data_df()
create_distinct_data_map()
Below is my function that I am wanting to import from map_distinct_data notebook. This successfully executes but am getting the error when trying to import to my main page.
def query_distinct_data():
query = "SELECT DISTINCT * from fire_data.rfs_fire_data where state in ('NSW','VIC','QLD')"
response = athena.start_query_execution(
QueryString=query,
ResultConfiguration={"OutputLocation": result_output_location})
return response["QueryExecutionId"]
I am able to run function query_distinct_data() and it executes when run seperately.
But it fails when I try to import the function.
The other functions that I import using ipynb.fs.full that do involve athena are executing okay when imported.
It is all about variables visibility scope (1, 2)
In short: map_distinct_data module knows nothing about main page's athena variable.
The good and correct way is to pass athena variable inside function as parameter:
from ipynb.fs.full.map_distinct_data import create_distinct_data_map
...
athena = boto3.client("athena",region_name=aws_region)
execution_id = create_distinct_data_map(athena)
where create_distinct_data_map should be defined as
def create_distinct_data_map(athena):
...
The second way is to set variable inside imported module:
from ipynb.fs.full.map_distinct_data import create_distinct_data_map
from ipynb.fs.full import map_distinct_data
athena = boto3.client("athena",region_name=aws_region)
map_distinct_data.athena = athena
execution_id = create_distinct_data_map()
Even if second way is working it is a really bad style.
Here is some must to know information about encapsulation in Python.
my workflow is
Extract a CSV file from MySQL database--> open the CSV on Python---> filter the necessary information based on a Python function.
However, I am started to deal with datasets that don't fit on memory. It is also inconvenient to have to import and filter over and over again.
My question is: Is there a way to apply a Python function in a MySQL database? I mean in a way that I only download from MySQL the values that attend my filter based on a Python function.
Note: I use Datagrip.
which python function to apply in MySQL database? Try to apply tests of some libraries! Which one do you like? Tests with a query like "SELECT 1"
import time
def query_1k(cur):
t = time.time()
for _ in range(1000):
cur.execute("SELECT 1,2,3,4,5")
try:
res = cur.fetchall()
assert len(res) == 1
assert res[0] == (1,2,3,4,5)
except:
pass
return time.time() - t
# pip search mysql-connector | grep --color mysql-connector-python
# pip install mysql-connector-python-rf
def mysql_connector_python():
import mysql.connector
conn = mysql.connector.connect(user='root', host='localhost')
print("MySQL Connector/Python:", query_1k(conn.cursor()), "[sec]")
# pip install mysqlclient
# pip3 install mysqlclient
# sudo yum install gcc
def mysqlclient():
pass
try:
import pymysql
pymysql.install_as_MySQLdb()
except ImportError:
pass
import MySQLdb
conn = MySQLdb.connect(user='root', host='localhost')
print("MySQLdb mysqlclient:", query_1k(conn.cursor()), "[sec]")
def pymysql():
import pymysql
conn = pymysql.connect(user='root', host='localhost')
print("PyMySQL:", query_1k(conn.cursor()), "[sec]")
def msqlchemy():
from sqlalchemy import create_engine
conn = create_engine('sqlite:///:memory:')
print("sqlalchemy mysqlclient:", query_1k(conn), "[sec]")
def pewee():
# from pewee import *
user = 'root'
password = '1234'
db_name = 'information_schema'
conn = MySQLDatabase(
db_name, user=user,
password=password,
host='localhost'
)
print("pewee:", query_1k(conn.cursor()), "[sec]")
'''
Warning: (3090, u"Changing sql mode 'NO_AUTO_CREATE_USER' is deprecated.
SET SESSION sql_mode="NO_ENGINE_SUBSTITUTION,NO_AUTO_CREATE_USER";
'''
for _ in range(10): # for PyPy warmup
print('-------------')
mysql_connector_python()
mysqlclient()
msqlchemy()
pymysql()
# pewee()
I have a python code that uses subprocess to open another python file that has been converted to an executable file. I want to be able to pass a string in the form of a SQL query like SELECT * FROM TABLE.
So I have created a variable string called PARAM that has the value "SELECT * FROM TABLE".
This is the .py making running the subprocess:
import pandas as pd
import subprocess
import re
param = "Select * from Table"
send = subprocess.Popen(["C:\\Users\\example\\Desktop\\testing.exe",param],stdout=subprocess.PIPE).communicate()[0].decode()
print(send)
However for some reason I do not get back the output from the .py file that has been made in to an executable.
Here is the code for the executable:
from sys import argv
import requests as rq
from requests_kerberos import HTTPKerberosAuth, OPTIONAL
def get_hypercube(query):
#param = "select 1 as Id, 'Test' as Name" #argv
url = 'https://example.com/csv/query'
kerberos_auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL, delegate=True)
payload = "q=" + str(query)
r = rq.get(url, auth=kerberos_auth, params=payload, verify=False)
output = r.text
return output
result = get_hypercube(argv)
print(result)
But the send does not bring me the result of the exe file (RESULT). I must be missing something, or not fully understanding how subprocess works.
Any help much appreciated.
So let's say that I have two files (test_file1.py, test_file2.py) for integration testing using py.test.
The test_file1.py is something like this:
import datetime
import pytest
Datetime = datetime.datetime.now()
def test_connect():
#1st Query to a mysql database
#2nd Query to a mysql database
..
#N Query to a mysql database
Now I'm writing the test_file2.py which is an extention of test_file1.py but I don't want to write the same mysql queries that I wrote in the above test.
How can I make py.test to inherit the above test and run both after executing py.test test_file2.py?
Something like this (test_file2.py Contents):
import datetime
import pytest
from testDirectory import test_file1
Datetime = datetime.datetime.now()
def test_connect():
#Here should run all the tests from 'test_file1' somehow...
#1st new additional Query to a mysql database
#2nd new additional Query to a mysql database
..
#N new additional Query to a mysql database
Thanks!!
When you import a module, it will execute all of the code inside it. So just write the code you want executed in your original file. For example add the call to the function in your file like this:
test_file1.py:
import datetime
import pytest
Datetime = datetime.datetime.now()
def test_connect():
#1st Query to a mysql database
#2nd Query to a mysql database
..
#N Query to a mysql database
test_connect() # This will run your function when you import
So then in your py.test when you call import test_file1, it will execute the test_connect() and any other code you would like without doing anything else.
In other words, here is a really simple example with 3 files:
File 1: hello_world.py:
def hello_world():
print('hello world!')
hello_world()
File 2: print_text.py:
def print_text():
print('foo bar baz')
print_text()
File 3: run_everything.py:
import hello_world
import print_text
Result when you run run_everything.py:
>>>hello world!
>>>foo bar baz
If you want the function to be executed when the file is executed directly, but not imported as a module, you can do this:
test_file1.py:
import datetime
import pytest
Datetime = datetime.datetime.now()
def test_connect():
#1st Query to a mysql database
#2nd Query to a mysql database
..
#N Query to a mysql database
def main():
# This will _not_ run your function when you import. You would
# have to use test_file1.test_connect() in your py.test.
test_connect()
if __name__ == '__main__':
main()
So in this example, your py.test would be:
import test_file1
test_file1.test_connect()
First one create a fixture in conftest.py:
import pytest
import MySQLdb
def db_cursor(request):
db = MySQLdb.connect(host="localhost", user="root")
cursor = db.cursor()
cursor.execute("SELECT USER()")
data = cursor.fetchone()
assert 'root#localhost' in data
yield cursor
db.close()
Then use it in your test modules:
# test_file1.py
def test_a(db_cursor)
pass
# test_file2.py
def test_b(db_cursor)
res = db_cursor.execute("SELECT VERSION()")
assert '5.5' in res.fetchone()
P.S.
It possible to use any other modules, just inject they are into your tests with pytest_plugins directive:
# conftest.py
pytest_plugins = '_mysql.cursor'
# _mysql/__init__.py
# _mysql/cursor.py
import pytest
import MySQLdb
def db_cursor(request):
db = MySQLdb.connect(host="localhost", user="root")
cursor = db.cursor()
cursor.execute("SELECT USER()")
data = cursor.fetchone()
assert 'root#localhost' in data
yield cursor
db.close()
i'm trying to make a group of defs in one file so then i just can import them whenever i want to make a script in python
i have tried this:
def get_dblink( dbstring):
"""
Return a database cnx.
"""
global psycopg2
try
cnx = psycopg2.connect( dbstring)
except Exception, e:
print "Unable to connect to DB. Error [%s]" % ( e,)
exit( )
but i get this error: global name 'psycopg2' is not defined
in my main file script.py
i have:
import psycopg2, psycopg2.extras
from misc_defs import *
hostname = '192.168.10.36'
database = 'test'
username = 'test'
password = 'test'
dbstring = "host='%s' dbname='%s' user='%s' password='%s'" % ( hostname, database, username, password)
cnx = get_dblink( dbstring)
can anyone give me a hand?
You just need to import psycopg2 in your first snippet.
If you need to there's no problem to 'also' import it in the second snippet (Python makes sure the modules are only imported once). Trying to use globals for this is bad practice.
So: at the top of every module, import every module which is used within that particular module.
Also: note that from x import * (with wildcards) is generally frowned upon: it clutters your namespace and makes your code less explicit.