How to pass multiple arguments/parameters while executing a pandas script - Python

I have to pass multiple arguments as conditions while executing my Python script.
Below is my code, but I have to perform the same steps for several conditions.
There are 4 different files for client 1 and client 2, covering data and metadata errors.
So if I run python.py client1,data,date,
my function should pick the first file name, client1_data_error_file_1, create a dataframe, and insert it into the database.
import pandas as pd
from operator import itemgetter
import glob
import bz2
import csv
import argparse
import pyodbc  # used below for the SQL Server insert
client1_data_error_file_1 = "10_client1_AAAAAA_data_error_date.bz2"
client1_metadata_error_file_1 = "10_client1_AAAAAA_metadata_error_date.bz2"
client2_data_error_file_1 = "20_client2_AAAAAA_data_error_date.bz2"
client2_metadata_error_file_1 = "20_client2_AAAAAA_metadata_error_date.bz2"
def load_errors_database(argument, client, error):
    header = ["filedate", "errorcode", "errorROEID", "ROEID", "type", "rawrecord", "filename"]
    data = []
    req_cols = itemgetter(0, 1, 2, 3, 4, 9, 10)
    for error_filename in glob.glob("*.bz2"):
        with bz2.open(error_filename, "rt", encoding="utf-8") as f_error_file:
            csv_input = csv.reader(f_error_file, skipinitialspace=True)
            for orig_row in csv_input:
                row = req_cols(orig_row)
                data.append([row[0], row[1], row[2], row[3], row[4], ",".join(orig_row), error_filename])
    df = pd.DataFrame(data, columns=header)
    cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER=' + server + ';DATABASE=' + database)
    cursor = cnxn.cursor()
    # Insert dataframe into SQL Server:
    for index, row in df.iterrows():
        cursor.execute("INSERT INTO dbo.error_table (filedate, errorcode, errorROEID, ROEID, type, rawrecord, filename) values(?,?,?,?,?,?,?)", row.filedate, row.errorcode, row.errorROEID, row.ROEID, row.type, row.rawrecord, row.filename)
    cnxn.commit()
    cursor.close()
How do I pass these arguments as a condition? It does not necessarily have to be a function.
When I execute my python code from terminal, I would like to pass
python_error_file.py client1, data,date
Now it should pick the first file and do the necessary steps. If I pass
python_error_file.py client2, metadata,date
It should pick the 4th file and do the required steps.
The steps are the same for all four files. I just have to pass these as parameters while executing the code.
Can anyone please help me with this?

argparse is your friend for creating handy command-line tools with simple syntax. Here is a snippet to help you:
import argparse

parser = argparse.ArgumentParser(description='client file parser')
parser.add_argument(
    '-c', '--client',
    help='client name',
    type=str
)
parser.add_argument(
    '-m', '--metadata',
    help='meta data',
    type=str,
    default=''
)
parser.add_argument(
    '-d', '--data',
    help='data',
    type=str,
    default=''
)
args = parser.parse_args()

load_errors_database(args.client, args.metadata, args.data)
Usage:
python file.py -c client1 -m metadata -d data
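If you prefer the positional style from your question (python_error_file.py client1 data date, with spaces instead of commas), a minimal sketch could look like the following; the ERROR_FILES lookup table is an assumption built from the file-name variables at the top of your script, and the date argument is left unused here:

import argparse

# Hypothetical lookup table mapping (client, error type) to the file-name
# variables defined at the top of the question's script.
ERROR_FILES = {
    ("client1", "data"): client1_data_error_file_1,
    ("client1", "metadata"): client1_metadata_error_file_1,
    ("client2", "data"): client2_data_error_file_1,
    ("client2", "metadata"): client2_metadata_error_file_1,
}

parser = argparse.ArgumentParser(description='client error file loader')
parser.add_argument('client', choices=['client1', 'client2'], help='client name')
parser.add_argument('error', choices=['data', 'metadata'], help='error type')
parser.add_argument('date', help='file date, e.g. 20200908')
args = parser.parse_args()

# Pick the matching file and pass it to the existing loader
# (which would then use it instead of globbing every .bz2 file).
error_file = ERROR_FILES[(args.client, args.error)]
load_errors_database(error_file, args.client, args.error)

Usage:
python python_error_file.py client1 data 20200908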

Related

ArgParse: ValueError: Invalid file path or buffer object type: <class 'list'>

I would like to run my code with an argument from the command line, but I'm getting an error. This is my first time using argparse and I don't know where the mistake is.
command:
python3 malytest1111.py --csv "/home/gis/0418_GML_OT_BUBD_A.csv"
code:
import argparse
import pandas as pd
parser = argparse.ArgumentParser()
parser.add_argument('--csv', nargs='+')
args = parser.parse_args()
df = pd.read_csv(args.csv)
df = df[['lokalnyId', 'wersjaId', 'x_katIstnienia', 'nazwa', 'x_kod',
'funOgolnaBudynku', 'funSzczegolowaBudynku',
'liczbaKondygnacji', 'x_aktualnoscG', 'x_aktualnoscA'
, 'kodKst', 'zabytek', 'x_skrKarto',
'koniecWersjiObiektu', 'geometry']]
df.to_excel('output.xlsx')
Based on the documentation, nargs='+' gathers the command-line values into a list, so args.csv is a list rather than a single path.
Try this:
import argparse
import pandas as pd
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--csv', nargs='+')
    args = parser.parse_args()
    #print(args.csv)
    df = pd.read_csv(args.csv[0])
    print(df)
Run as:
python3 malytest1111.py --csv "/home/gis/0418_GML_OT_BUBD_A.csv"
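Since nargs='+' accepts one or more paths, you can also loop over the whole list; a minimal sketch (stacking the files with pd.concat is an assumption about what you want to do with several inputs):

import argparse
import pandas as pd

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--csv', nargs='+')
    args = parser.parse_args()
    # Read every path passed on the command line and stack them into one frame.
    frames = [pd.read_csv(path) for path in args.csv]
    df = pd.concat(frames, ignore_index=True)
    print(df)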

How to add current date in filename in python script

I'm trying to unload data from Snowflake to GCS; for that I'm using the Snowflake Python connector and a Python script. In the Python script below, the file name is 'LH_TBL_FIRST20200908'. If the script runs today the name stays the same, if it runs tomorrow the file name should be 'LH_TBL_FIRST20200909', and the day after that 'LH_TBL_FIRST20200910'.
Also please tell me if the code has any mistakes in it. The code is below:
import snowflake.connector
# Gets the version
ctx = snowflake.connector.connect(
user='*****',
password='*******',
account='********',
warehouse='*******',
database='********',
schema='********'
)
cs = ctx.cursor()
sql = "copy into #unload_gcs/LH_TBL_FIRST20200908.csv.gz
from ( select * from TEST_BASE.LH_TBL_FIRST )
file_format =
( type=csv compression='gzip'
FIELD_DELIMITER = ','
field_optionally_enclosed_by='"'
NULL_IF=()
EMPTY_FIELD_AS_NULL = FALSE
)
single = false
max_file_size = 5300000000
header = false;"
cur.execute(sql)
cur.close()
conn.close()
You can use f-strings to fill in (part of) your filename. Python has the datetime module to handle dates and times.
from datetime import datetime
date = datetime.now().strftime('%Y%m%d')
myFileName = f'LH_TBL_FIRST{date}.csv.gz'
print(myFileName)
>>> LH_TBL_FIRST20200908.csv.gz
As for errors in your code:
you assign your cursor as cs = ctx.cursor(), but further along you use cur.execute(...) and cur.close(...); those names don't exist. The same goes for conn.close() when the connection is named ctx. Run your code to find the errors and fix them.
Edit suggested by #Lysergic:
If your python version is too old, you could use str.format().
myFileName = 'LH_TBL_FIRST{0}.csv.gz'.format(date)
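Putting the two together, the generated file name can be dropped straight into the COPY INTO statement; a minimal sketch, assuming the stage and table names from the question (the stage is written here as @unload_gcs):

from datetime import datetime
import snowflake.connector

date = datetime.now().strftime('%Y%m%d')
file_name = f'LH_TBL_FIRST{date}.csv.gz'

ctx = snowflake.connector.connect(user='*****', password='*******', account='********',
                                  warehouse='*******', database='********', schema='********')
cs = ctx.cursor()

# Build the unload statement with the date-stamped file name.
sql = f"""
copy into @unload_gcs/{file_name}
from (select * from TEST_BASE.LH_TBL_FIRST)
file_format = (type=csv compression='gzip' FIELD_DELIMITER=',' field_optionally_enclosed_by='"')
single = false
max_file_size = 5300000000
header = false;
"""
cs.execute(sql)
cs.close()
ctx.close()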
from datetime import datetime
class FileNameWithDateTime(object):
    def __init__(self, fileNameAppender, fileExtension="txt"):
        self.fileNameAppender = fileNameAppender
        self.fileExtension = fileExtension

    def appendCurrentDateTimeInFileName(self, filePath):
        currentTime = self.fileNameAppender
        print(currentTime.strftime("%Y%m%d"))
        filePath += currentTime.strftime("%Y%m%d")
        filePath += "." + self.fileExtension
        try:
            with open(filePath, "a") as fwrite1:
                fwrite1.write(filePath)
        except OSError as oserr:
            print("Error while writing ", oserr)
I take the following approach:
from datetime import datetime

# define what time/date-related value your variable will contain
date_id = datetime.today().strftime('%Y%m%d')
Write the output file:
# create the filename
with open(date_id + "_" + "LH_TBL.csv.gz", 'w') as gzip:
Output: YYYYMMDD_filename, e.g. 20200908_LH_TBL.csv.gz

Passing command line argument to Presto Query

I'm a newbie to Python. I want to pass a command-line argument to my Presto query, which is inside a function, and then write the result to a CSV file. But when I try to run it in the terminal it says:
Traceback (most recent call last): File "function2.py", line 3, in <module> from pyhive import presto ModuleNotFoundError: No module named 'pyhive'
The pyhive requirement is already satisfied. Please find my code attached:
from sys import argv
import argparse
from pyhive import presto
import prestodb
import csv
import sys
import pandas as pd
connection = presto.connect(host='xyz',port=8889,username='test')
cur = connection.cursor()
print('Connection Established')
def func1(object, start, end):
    object = argv[1]
    start = argv[2]
    end = argv[3]
    result = cur.execute("""
        with map_date as
        (
            SELECT
                object,
                epoch,
                timestamp,
                date,
                map_agg(name, value) as map_values
            from hive.schema.test1
            where object = '${object}'
            and (epoch >= '${start}' and epoch <= '${end}')
            and name in ('x','y')
            GROUP BY object,epoch,timestamp,date
            order by timestamp asc
        )
        SELECT
            epoch
            , timestamp
            , CASE WHEN element_at(map_values, 'x') IS NOT NULL THEN map_values['x'] ELSE NULL END AS x
            , CASE WHEN element_at(map_values, 'y') IS NOT NULL THEN map_values['y'] ELSE NULL END AS y
            , object
            , date AS date
        from map_date
    """)
    rows = cur.fetchall()
    print('Query Finished')  # Returns the list with one entry for each record
    fp = open('/Users/xyz/Desktop/Python/function.csv', 'w')
    print('File Created')
    myFile = csv.writer(fp)
    colnames = [desc[0] for desc in cur.description]  # store the headers in a variable called 'colnames'
    myFile.writerow(colnames)  # write the header to the file
    myFile.writerows(rows)
    fp.close()

func1(object, start, end)
cur.close()
connection.close()
How can I pass the command line argument to my Presto query which is written inside a function?
Any help is much appreciated. Thank you in advance!
I will only describe how to pass command-line arguments to the function and the query.
If you define the function
def func1(object, start, end):
    # code
then you have to send the values as variables, and you have to use sys.argv outside the function
connection = presto.connect(host='xyz', port=8889, username='test') # PEP8: spaces after commas
cur = connection.cursor()
print('Connection Established')
object_ = sys.argv[1] # PEP8: there is class `object` so I add `_` to create different name
start = sys.argv[2]
end = sys.argv[3]
func1(object_, start, end)
cur.close()
connection.close()
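Run it as (the object and epoch values here are placeholders):
python function2.py OBJECT123 1600000000 1700000000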
You don't have to use the same names outside the function
args1 = sys.argv[1]
args2 = sys.argv[2]
args3 = sys.argv[3]
func1(args1, args2, args3)
and you can even do
func1(sys.argv[1], sys.argv[2], sys.argv[3])
because when you run this line, Python takes the definition def func1(object, start, end): and creates local variables named object, start, end inside func1, assigning the external values to these local variables
object=object_, start=start, end=end
or
object=args1, start=args2, end=args3
or
object=sys.argv[1], start=sys.argv[2], end=sys.argv[3]
It would also be good to pass cur to the function explicitly
def func1(cur, object_, start, end):
    # code
and
func1(cur, sys.argv[1], sys.argv[2], sys.argv[3])
I don't know what you are trying to do in the SQL query, but Python uses {start} (without $) to put a value into a string (Bash uses ${start}), and the string needs the prefix f to make it an f-string - f"""... {start} ...""". Without f you have to use normal string formatting - """... {start} ...""".format(start=start)
import sys
import csv
from pyhive import presto

# --- functions ----

def func1(cur, object_, start, end):  # PEP8: spaces after commas
    # Python uses `{start} {end}`, Bash uses `${start} ${end}`
    # A string needs the prefix `f` to use `{start} {end}` as an f-string,
    # or you have to use `"{start} {end}".format(start=value1, end=value2)`
    result = cur.execute(f"""
        WITH map_date AS
        (
            SELECT
                object,
                epoch,
                timestamp,
                date,
                map_agg(name, value) AS map_values
            FROM hive.schema.test1
            WHERE object = '{object_}'
              AND (epoch >= '{start}' AND epoch <= '{end}')
              AND name IN ('x', 'y')
            GROUP BY object, epoch, timestamp, date
            ORDER BY timestamp asc
        )
        SELECT
            epoch,
            timestamp,
            CASE WHEN element_at(map_values, 'x') IS NOT NULL THEN map_values['x'] ELSE NULL END AS x,
            CASE WHEN element_at(map_values, 'y') IS NOT NULL THEN map_values['y'] ELSE NULL END AS y,
            object,
            date AS date
        FROM map_date
    """)

    rows = cur.fetchall()
    colnames = [desc[0] for desc in cur.description]  # store the headers in a variable called 'colnames'
    print('Query Finished')  # returns the list with one entry for each record

    fp = open('/Users/xyz/Desktop/Python/function.csv', 'w')
    my_file = csv.writer(fp)  # PEP8: lower_case_names for variables
    my_file.writerow(colnames)  # write the header to the file
    my_file.writerows(rows)
    fp.close()
    print('File Created')

# --- main ---

connection = presto.connect(host='xyz', port=8889, username='test')  # PEP8: spaces after commas
cur = connection.cursor()
print('Connection Established')

#object_ = sys.argv[1]  # PEP8: there is class `object` so I add `_` to create a different name
#start = sys.argv[2]
#end = sys.argv[3]
#func1(cur, object_, start, end)

func1(cur, sys.argv[1], sys.argv[2], sys.argv[3])

cur.close()
connection.close()
If you plan to use argparse
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--object', help='object to search')
parser.add_argument('-s', '--start', help='epoch start')
parser.add_argument('-e', '--end', help='epoch end')
args = parser.parse_args()
and then
func1(cur, args.object, args.start, args.end)
import argparse
# ... imports and functions ...
# --- main ---
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--object', help='object to search')
parser.add_argument('-s', '--start', help='epoch start')
parser.add_argument('-e', '--end', help='epoch end')
#parser.add_argument('-D', '--debug', action='store_true', help='debug (display extra info)')
args = parser.parse_args()
#if args.debug:
# print(args)
connection = presto.connect(host='xyz', port=8889, username='test') # PEP8: spaces after commas
cur = connection.cursor()
print('Connection Established')
func1(cur, args.object, args.start, args.end)
cur.close()
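Usage (the values are placeholders):
python function2.py -o OBJECT123 -s 1600000000 -e 1700000000
As a side note, if your PyHive version supports DB-API parameter binding (it uses pyformat-style %(name)s placeholders), you can let the driver substitute the values instead of building the SQL with an f-string; a simplified, hedged sketch against the same table:

# Hedged sketch: parameter binding instead of string interpolation.
query = """
    SELECT epoch, timestamp, object, date
    FROM hive.schema.test1
    WHERE object = %(object)s
      AND epoch >= %(start)s AND epoch <= %(end)s
"""
cur.execute(query, {"object": args.object, "start": args.start, "end": args.end})
rows = cur.fetchall()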

Read a csv file, clean it, then write out the result as a csv using Apache Beam dataflow

I would like to read a csv file, clean it, then write out the result as a csv using Apache Beam dataflow. The purpose is to make the file loadable into BigQuery. The cleaning rule is to simply escape a double quote with a double quote.
My cleaning rule works. I’m having trouble incorporating it into a pipeline. I am seeking advice on what my cleaning function should return and how to call it through the pipeline.
import apache_beam as beam
import csv
import logging
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import WriteToText
lines = p | ReadFromText(file_pattern="gs://dev/clean_input/input01.csv")
def parse_method(line):
    CSV_PARSING_KWARGS = {
        'doublequote': True,
        'escapechar': '\\',
        'quotechar': '"',
        'delimiter': ','
    }
    reader = csv.reader(csv_file, CSV_PARSING_KWARGS)
    for rec in reader:
        cw = csv.writer(out_file, escapechar='"', quoting=csv.QUOTE_MINIMAL)
        cw.writerow(rec)
    return rec

def run(region, project, bucket, temploc):
    argv = [
        # Passed in args
        '--region={}'.format(region),
        '--project={}'.format(project),
        '--temp_location={}'.format(temploc),
        # Constructs
        '--staging_location=gs://{}/clean_input/stg/'.format(bucket),
        # Mandatory constants
        '--job_name=cleammycsv',
        '--runner=DataflowRunner'
    ]
    options = PipelineOptions(
        flags=argv
    )
    pipeline = beam.Pipeline(options=options)
    clean_csv = (pipeline
        lines = lines | 'Read' >> beam.Map(parse_method)
        line = lines | 'Output to file' >> WriteToText(file_pattern="gs://dev/clean_output/output_file.csv")
    )
    pipeline.run()

if __name__ == '__main__':
    import argparse
    # Create the parser
    parser = argparse.ArgumentParser(description='Run the CSV cleaning pipeline')
    parser.add_argument('-r', '--region', help='Region ID where data flow job to run', default='australia-southeast1')
    parser.add_argument('-p', '--project', help='Unique project ID', required=True)
    parser.add_argument('-b', '--bucket', help='Bucket name', required=True)
    parser.add_argument('-t', '--temploc', help='Bucket name and folder', required=True)
    # Execute the parse_args() method
    args = vars(parser.parse_args())
    run(project=args['project'], bucket=args['bucket'], region=args['region'], temploc=args['temploc'])
I finally got something working which does the job.
import apache_beam as beam
import csv
import logging

from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import WriteToText

def parse_file(element):
    for line in csv.reader([element], quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL):
        line = [s.replace('\"', '') for s in line]
        clean_line = '","'.join(line)
        final_line = '"' + clean_line + '"'
        return final_line

def run(region, project, bucket, temploc):
    argv = [
        # Passed in args
        '--region={}'.format(region),
        '--project={}'.format(project),
        '--temp_location={}'.format(temploc),
        # Constructs
        '--staging_location=gs://{}/clean_input/stg/'.format(bucket),
        # Mandatory constants
        '--job_name=cleammycsv',
        '--runner=DataflowRunner'
    ]

    filename_in = 'gs://{}/clean_input/IN_FILE.csv'.format(bucket)
    files_output = 'gs://{}/clean_output/OUT_FILE.csv'.format(bucket)

    options = PipelineOptions(
        flags=argv
    )

    pipeline = beam.Pipeline(options=options)

    clean_csv = (pipeline
        | 'Read input file' >> beam.io.ReadFromText(filename_in)
        | 'Parse file' >> beam.Map(parse_file)
        | 'writecsv' >> beam.io.WriteToText(files_output, num_shards=10)
    )

    pipeline.run()

if __name__ == '__main__':
    import argparse
    # Create the parser
    parser = argparse.ArgumentParser(description='Run the CSV cleaning pipeline')
    parser.add_argument('-r', '--region', help='Region ID where data flow job to run', required=True)
    parser.add_argument('-p', '--project', help='Unique project ID', required=True)
    parser.add_argument('-b', '--bucket', help='Bucket name', required=True)
    parser.add_argument('-t', '--temploc', help='Bucket name and folder', required=True)
    # Execute the parse_args() method
    args = vars(parser.parse_args())
    run(project=args['project'], bucket=args['bucket'], region=args['region'], temploc=args['temploc'])
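Usage (the script name and values are placeholders):
python clean_csv.py -r australia-southeast1 -p my-gcp-project -b my-bucket -t gs://my-bucket/tmp/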

Calling Argument in Python code

I have the following code that demands a -p argument when called. However, how do I use the -p argument in the SQL query? I am also looking to use this -p argument's text in the output file name.
#!/usr/bin/python
import argparse
import psycopg2
import csv
parser = argparse.ArgumentParser(description='insert the project ID as an argument')
parser.add_argument('-p', '--project_id', help='project_id to pull files from ERAPRO', required=True)
args = parser.parse_args()
conn = psycopg2.connect(database="XXX", user="XXX", password="XXX",
host="XXX", port="5432")
cur = conn.cursor()
cur.execute("""SELECT project_analysis.project_accession,
analysis.analysis_accession, file.filename, file.file_md5, file.file_location
FROM project_analysis
LEFT JOIN analysis on project_analysis.analysis_accession = analysis.analysis_accession
LEFT JOIN analysis_file on analysis.analysis_accession = analysis_file.analysis_accession
LEFT JOIN file on analysis_file.file_id = file.file_id
WHERE project_accession = <INSERT -p ARGUMENT HERE> and analysis.hidden_in_eva = '0';""")
records = cur.fetchall()
with open('/nfs/production3/eva/user/gary/evapro_ftp/<INSERT -p ARGUMENT HERE>.csv', 'w') as f:
writer = csv.writer (f, delimiter = ',')
for row in records:
writer.writerow(row)
conn.close()
All help appreciated.
Thanks
First assign your argument to a variable using the dest argument of add_argument(). Let's say we assign the input to the project_id variable.
This way we can reference it in the code.
parser.add_argument('-p', '--project_id',
                    help='project_id to pull files from ERAPRO',
                    required=True,
                    dest='project_id')  # notice the dest argument
cur.execute("""SELECT project_analysis.project_accession,
analysis.analysis_accession, file.filename, file.file_md5, file.file_location
FROM project_analysis
LEFT JOIN analysis on project_analysis.analysis_accession = analysis.analysis_accession
LEFT JOIN analysis_file on analysis.analysis_accession = analysis_file.analysis_accession
LEFT JOIN file on analysis_file.file_id = file.file_id
WHERE project_accession = %s and analysis.hidden_in_eva = '0';""", (args.project_id,))
Notice the use of execute(' ... %s ...', (args.project_id,)): by doing this, psycopg2 safely interpolates the value referenced by project_id into the query.
After calling args = parser.parse_args() you can obtain the value of the arguments like this:
pid = args.project_id
Then you can use that value of pid in your code by using normal string substitution. However, it's better to use psycopg2's inbuilt method for passing parameters to SQL queries to prevent SQL injection.
Normal string substitution:
'hello world {}'.format(var_name)
psycopg2:
cur.execute('SELECT * FROM project_analysis WHERE project_accession = %s', (var_name,))
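Putting both parts together for the original question, the same args.project_id value can drive the query and the output file name; a minimal sketch (the connection details and NFS path are the placeholders from the question):

#!/usr/bin/python
import argparse
import csv
import psycopg2

parser = argparse.ArgumentParser(description='insert the project ID as an argument')
parser.add_argument('-p', '--project_id', help='project_id to pull files from ERAPRO', required=True)
args = parser.parse_args()

conn = psycopg2.connect(database="XXX", user="XXX", password="XXX", host="XXX", port="5432")
cur = conn.cursor()

# The %s placeholder is filled in safely by psycopg2 from the parameter tuple.
cur.execute("""SELECT project_analysis.project_accession,
       analysis.analysis_accession, file.filename, file.file_md5, file.file_location
FROM project_analysis
LEFT JOIN analysis ON project_analysis.analysis_accession = analysis.analysis_accession
LEFT JOIN analysis_file ON analysis.analysis_accession = analysis_file.analysis_accession
LEFT JOIN file ON analysis_file.file_id = file.file_id
WHERE project_accession = %s AND analysis.hidden_in_eva = '0';""", (args.project_id,))
records = cur.fetchall()

# Reuse the same argument in the output file name.
out_path = '/nfs/production3/eva/user/gary/evapro_ftp/{}.csv'.format(args.project_id)
with open(out_path, 'w') as f:
    writer = csv.writer(f, delimiter=',')
    for row in records:
        writer.writerow(row)

conn.close()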
