Passing command line argument to Presto Query - python

I'm a newbie to Python. I want to pass command-line arguments to my Presto query, which is inside a function, and then write the result to a CSV file. But when I try to run it in the terminal it says:
Traceback (most recent call last): File "function2.py", line 3, in <module> from pyhive import presto ModuleNotFoundError: No module named 'pyhive'
The pyhive requirement is already satisfied. Please find my code below:
from sys import argv
import argparse
from pyhive import presto
import prestodb
import csv
import sys
import pandas as pd
connection = presto.connect(host='xyz',port=8889,username='test')
cur = connection.cursor()
print('Connection Established')
def func1(object,start,end):
    object = argv[1]
    start = argv[2]
    end = argv[3]
    result = cur.execute("""
    with map_date as
    (
    SELECT
        object,
        epoch,
        timestamp,
        date,
        map_agg(name, value) as map_values
    from hive.schema.test1
    where object = '${object}'
    and (epoch >= '${start}' and epoch <= '${end}')
    and name in ('x','y')
    GROUP BY object,epoch,timestamp,date
    order by timestamp asc
    )
    SELECT
        epoch
        , timestamp
        , CASE WHEN element_at(map_values, 'x') IS NOT NULL THEN map_values['x'] ELSE NULL END AS x
        , CASE WHEN element_at(map_values, 'y') IS NOT NULL THEN map_values['y'] ELSE NULL END AS y
        , object
        , date AS date
    from map_date
    """)
    rows = cur.fetchall()
    print('Query Finished')  # Returns the list with one entry for each record
    fp = open('/Users/xyz/Desktop/Python/function.csv', 'w')
    print('File Created')
    myFile = csv.writer(fp)
    colnames = [desc[0] for desc in cur.description]  # store the headers in variable called 'colnames'
    myFile.writerow(colnames)  # write the header to the file
    myFile.writerows(rows)
    fp.close()
func1(object,start,end)
cur.close()
connection.close()
How can I pass the command-line arguments to my Presto query, which is written inside a function?
Any help is much appreciated. Thank you in advance!

I only describe how to pass command line arguments to the function and the query.
If you define the function
def func1(object, start, end):
    # code
then you have to send the values in as variables, and you have to use sys.argv outside the function:
connection = presto.connect(host='xyz', port=8889, username='test') # PEP8: spaces after commas
cur = connection.cursor()
print('Connection Established')
object_ = sys.argv[1] # PEP8: there is class `object` so I add `_` to create different name
start = sys.argv[2]
end = sys.argv[3]
func1(object_, start, end)
cur.close()
connection.close()
You don't have to use the same names outside the function
args1 = sys.argv[1]
args2 = sys.argv[2]
args3 = sys.argv[3]
func1(args1, args2, args3)
and you can even do
func1(sys.argv[1], sys.argv[2], sys.argv[3])
because when you run this line, Python takes the definition def func1(object, start, end):, creates local variables with the names object, start, end inside func1, and assigns the external values to these local variables:
object=object_, start=start, end=end
or
object=args1, start=args2, end=args3
or
object=sys.argv[1], start=sys.argv[2], end=sys.argv[3]
It would also be good to pass cur to the function explicitly
def func1(cur, object_, start, end):
    # code
and
func1(cur, sys.argv[1], sys.argv[2], sys.argv[3])
I don't know what you are trying to do in the SQL query, but Python uses {start} (without $) to put a value in a string (Bash uses ${start}), and the string needs the prefix f to be an f-string: f"""... {start} ...""". Without f you have to use normal string formatting: """... {start} ...""".format(start=start).
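For example, these two lines build the same SQL text (object_ and start are the variables used below):
query = f"... WHERE object = '{object_}' AND epoch >= '{start}' ..."  # f-string (Python 3.6+)
query = "... WHERE object = '{object_}' AND epoch >= '{start}' ...".format(object_=object_, start=start)  # str.format
The full script with these changes: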
import sys
import csv

from pyhive import presto

# --- functions ----

def func1(cur, object_, start, end):  # PEP8: spaces after commas
    # Python uses `{start} {end}`, Bash uses `${start} ${end}`
    # The string needs the prefix `f` to use `{start} {end}` as an f-string,
    # or you have to use `"{start} {end}".format(start=value1, end=value2)`
    result = cur.execute(f"""
    WITH map_date AS
    (
        SELECT
            object,
            epoch,
            timestamp,
            date,
            map_agg(name, value) AS map_values
        FROM hive.schema.test1
        WHERE object = '{object_}'
          AND (epoch >= '{start}' AND epoch <= '{end}')
          AND name IN ('x','y')
        GROUP BY object, epoch, timestamp, date
        ORDER BY timestamp ASC
    )
    SELECT
        epoch,
        timestamp,
        CASE WHEN element_at(map_values, 'x') IS NOT NULL THEN map_values['x'] ELSE NULL END AS x,
        CASE WHEN element_at(map_values, 'y') IS NOT NULL THEN map_values['y'] ELSE NULL END AS y,
        object,
        date AS date
    FROM map_date
    """)

    rows = cur.fetchall()
    colnames = [desc[0] for desc in cur.description]  # store the headers in a variable called 'colnames'
    print('Query Finished')  # returns the list with one entry for each record

    fp = open('/Users/xyz/Desktop/Python/function.csv', 'w')
    my_file = csv.writer(fp)  # PEP8: lower_case_names for variables
    my_file.writerow(colnames)  # write the header to the file
    my_file.writerows(rows)
    fp.close()
    print('File Created')

# --- main ---

connection = presto.connect(host='xyz', port=8889, username='test')  # PEP8: spaces after commas
cur = connection.cursor()
print('Connection Established')

#object_ = sys.argv[1]  # PEP8: there is a class `object`, so I add `_` to create a different name
#start = sys.argv[2]
#end = sys.argv[3]
#func1(cur, object_, start, end)
func1(cur, sys.argv[1], sys.argv[2], sys.argv[3])

cur.close()
connection.close()
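Side note (an assumption to verify against your installed pyhive version): pyhive's DB-API cursors generally also accept query parameters in pyformat style, which lets the driver do the quoting instead of you building the string yourself. A minimal sketch:
# assumption: the installed pyhive supports DB-API parameters (paramstyle 'pyformat')
cur.execute(
    "SELECT * FROM hive.schema.test1 WHERE object = %(object)s AND epoch >= %(start)s AND epoch <= %(end)s",
    {'object': object_, 'start': start, 'end': end},
)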
If you plan to use argparse
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--object', help='object to search')
parser.add_argument('-s', '--start', help='epoch start')
parser.add_argument('-e', '--end', help='epoch end')
args = parser.parse_args()
and then
func1(cur, args.object, args.start, args.end)
import argparse
# ... imports and functions ...
# --- main ---
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--object', help='object to search')
parser.add_argument('-s', '--start', help='epoch start')
parser.add_argument('-e', '--end', help='epoch end')
#parser.add_argument('-D', '--debug', action='store_true', help='debug (display extra info)')
args = parser.parse_args()
#if args.debug:
#    print(args)
connection = presto.connect(host='xyz', port=8889, username='test') # PEP8: spaces after commas
cur = connection.cursor()
print('Connection Established')
func1(cur, args.object, args.start, args.end)
cur.close()
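With the argparse version you would call the script like this (the values here are just placeholders):
python function2.py -o my_object -s 1599000000 -e 1599999999
and argparse fills args.object, args.start and args.end with those strings.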


Python for/while loop

Today I am working on a project about incoming phone calls being transcribed and saved into text files, but I am also kind of new to Python and Python loops.
I want to loop over a SQL Server column and run each row through the Azure Speech to Text service I use (all of the phone call OIDs). I have been stuck on this problem for a couple of days now, so I thought I might find some help here.
import azure.cognitiveservices.speech as speechsdk
import time
from os import path
from pydub import AudioSegment
import requests
import hashlib
import sys
import os.path
import pyodbc

databaseName = '*'
username = '*'
password = '*'
server = '*'
driver = '*'

try:
    CONNECTION_STRING = 'DRIVER='+driver+';SERVER='+server+';DATABASE='+databaseName+';UID='+username+';PWD='+ password
    conn = pyodbc.connect(CONNECTION_STRING)
    cursor = conn.cursor()
    storedproc = "* = *'"
    cursor.execute(storedproc)
    row = cursor.fetchone()
    while row:
        array = [(int(row[1]))]
        row = cursor.fetchone()

    i = 0
    while i < len(array):
        OID = (array[i])
        i = i + 1
        print(OID)

    string = f"{OID}*"
    encoded = string.encode()
    result = hashlib.sha256(encoded)
    resultHash = (result.hexdigest())

    Telefoongesprek = requests.get(f"*{OID}", headers={f"api-key":f"{resultHash}"})
    with open("Telefoongesprek.mp3", "wb") as f:
        f.write(Telefoongesprek.content)

    src = "Telefoongesprek.mp3"
    dst = "Telefoongesprek.wav"
    sound = AudioSegment.from_file(src)
    sound.export(dst, format="wav")

    def speech_recognize_continuous_from_file():
        speech_config = speechsdk.SpeechConfig(subscription="*", region="*")
        speech_config.speech_recognition_language = "nl-NL"
        audio_config = speechsdk.audio.AudioConfig(filename="Telefoongesprek.wav")
        speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

        done = False

        def stop_cb(evt):
            print('CLOSING on {}'.format(evt))
            nonlocal done
            done = True

        all_results = []

        def handle_final_result(evt):
            all_results.append(evt.result.text)

        speech_recognizer.recognized.connect(handle_final_result)
        speech_recognizer.session_started.connect(handle_final_result)
        speech_recognizer.session_stopped.connect(handle_final_result)
        speech_recognizer.canceled.connect(handle_final_result)
        speech_recognizer.session_stopped.connect(stop_cb)
        speech_recognizer.canceled.connect(stop_cb)

        speech_recognizer.start_continuous_recognition()
        while not done:
            time.sleep(.5)
        speech_recognizer.stop_continuous_recognition()

        print(all_results)
        telefoongesprek = str(all_results)
        filename = f"C:\\Users\\Beau\\Contact-verkeer\\contact-verkeer\\telefoon\\STT Transcriptions\\Telefoongesprek#{OID}.txt"
        file = open(filename, "w")
        file.write(telefoongesprek)
        file.close()

    speech_recognize_continuous_from_file()

    cursor.close()
    del cursor
    conn.close()

except Exception as e:
    print("Error: %s" % e)
Each part works on its own, but I just don't know where to place the loop and which one I should use (for/while loop). Right here I'm trying to loop over an array, but I don't think this is correct.
Error message: Decoding failed. ffmpeg returned error code: 1
[mp3 @ 000001cb8c57e0o0] Failed to read frame size: could not seek to 1073.
which I am pretty sure means that my Azure function can't find an mp3 file, which means that the "MP3 to WAV" conversion doesn't work.
Thanks in advance!
If I understand your question, you have a database with lots of phone call details. One of the field values in each row is used to create the associated mp3 file. You want to do speech to text using Azure on each of the mp3 files you have in your database.
So you can do it in two ways:
Iterate through all rows in the database and create all the associated files in a folder on the local disk, with the OID as the filename.
Then write another loop to iterate through this folder and send the files for transcription to the Azure Speech to Text service.
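A rough sketch of this first, two-phase approach (untested; it reuses the pyodbc/requests/hashlib setup from your code, the masked URLs, and a transcribe function that takes a filename like the one defined further below) could look like this:
import os

# phase 1: save every call as <OID>.mp3 on the local disk
row = cursor.fetchone()
while row:
    OID = int(row[1])
    resultHash = hashlib.sha256(f"{OID}*".encode()).hexdigest()
    response = requests.get(f"*{OID}", headers={"api-key": resultHash})
    with open(f"{OID}.mp3", "wb") as f:
        f.write(response.content)
    row = cursor.fetchone()

# phase 2: send every saved mp3 to the Speech to Text service
for name in os.listdir("."):
    if name.endswith(".mp3"):
        speech_recognize_continuous_from_file(name)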
The other technique is to do everything in a single loop, like the way you have shown, which will require some corrections.
OK, so now that part is clear, we can go into the speech to text part. Azure allows you to send the compressed format for transcription, which means you actually don't need to convert it into a wav file.
Please have a look at the modified code below with the changes:
# code snippet borrowed from azure samples
def speech_recognize_continuous_from_file(filename):

    class BinaryFileReaderCallback(speechsdk.audio.PullAudioInputStreamCallback):
        def __init__(self, filename: str):
            super().__init__()
            self._file_h = open(filename, "rb")

        def read(self, buffer: memoryview) -> int:
            try:
                size = buffer.nbytes
                frames = self._file_h.read(size)
                buffer[:len(frames)] = frames
                return len(frames)
            except Exception as ex:
                print('Exception in `read`: {}'.format(ex))
                raise

        def close(self) -> None:
            print('closing file')
            try:
                self._file_h.close()
            except Exception as ex:
                print('Exception in `close`: {}'.format(ex))
                raise

    # Creates an audio stream format. For an example we are using MP3 compressed file here
    compressed_format = speechsdk.audio.AudioStreamFormat(compressed_stream_format=speechsdk.AudioStreamContainerFormat.MP3)
    callback = BinaryFileReaderCallback(filename=filename)
    stream = speechsdk.audio.PullAudioInputStream(stream_format=compressed_format, pull_stream_callback=callback)

    speech_config = speechsdk.SpeechConfig(subscription="*", region="*")
    speech_config.speech_recognition_language = "nl-NL"
    audio_config = speechsdk.audio.AudioConfig(stream=stream)

    # Creates a speech recognizer using a file as audio input, also specify the speech language
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config, audio_config)

    done = False

    def stop_cb(evt):
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    all_results = []

    def handle_final_result(evt):
        all_results.append(evt.result.text)

    speech_recognizer.recognized.connect(handle_final_result)
    speech_recognizer.session_started.connect(handle_final_result)
    speech_recognizer.session_stopped.connect(handle_final_result)
    speech_recognizer.canceled.connect(handle_final_result)
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
    speech_recognizer.stop_continuous_recognition()

    print(all_results)
    telefoongesprek = str(all_results)
    filename = f"C:\\Users\\Beau\\Contact-verkeer\\contact-verkeer\\telefoon\\STT Transcriptions\\Telefoongesprek#{OID}.txt"
    file = open(filename, "w")
    file.write(telefoongesprek)
    file.close()


try:
    CONNECTION_STRING = 'DRIVER='+driver+';SERVER='+server+';DATABASE='+databaseName+';UID='+username+';PWD='+ password
    conn = pyodbc.connect(CONNECTION_STRING)
    cursor = conn.cursor()
    storedproc = "* = *'"
    cursor.execute(storedproc)
    row = cursor.fetchone()
    # loop through the rows
    while row:
        array = [(int(row[1]))]
        i = 0
        while i < len(array):
            OID = (array[i])
            i = i + 1
            print(OID)

        string = f"{OID}*"
        encoded = string.encode()
        result = hashlib.sha256(encoded)
        resultHash = (result.hexdigest())

        telefoongesprek_response = requests.get(f"*{OID}", headers={f"api-key":f"{resultHash}"})
        # save the file to local disk as mp3
        with open("Telefoongesprek.mp3", "wb") as f:
            f.write(telefoongesprek_response.content)
        # do the speech to text on the mp3 file
        speech_recognize_continuous_from_file(f.name)
        # fetch the next row
        row = cursor.fetchone()

    cursor.close()
    del cursor
    conn.close()

except Exception as e:
    print("Error: %s" % e)
I haven't tested this full code as I don't have the DB connections with me. Please feel free to modify it for your use case and let me know if you have any issues.

How to add current date in filename in python script

I'm trying to unload data from Snowflake to GCS; for that I'm using the Snowflake Python connector and a Python script. In the Python script below, the file name is 'LH_TBL_FIRST20200908'. If the script runs today then the name will stay the same; if the script runs tomorrow then the file name should be 'LH_TBL_FIRST20200909'; similarly, if it runs the day after, then 'LH_TBL_FIRST20200910'.
Also, please tell me if the code has any mistakes in it. The code is below:
import snowflake.connector
# Gets the version
ctx = snowflake.connector.connect(
    user='*****',
    password='*******',
    account='********',
    warehouse='*******',
    database='********',
    schema='********'
)
cs = ctx.cursor()
sql = "copy into #unload_gcs/LH_TBL_FIRST20200908.csv.gz
from ( select * from TEST_BASE.LH_TBL_FIRST )
file_format =
( type=csv compression='gzip'
FIELD_DELIMITER = ','
field_optionally_enclosed_by='"'
NULL_IF=()
EMPTY_FIELD_AS_NULL = FALSE
)
single = false
max_file_size = 5300000000
header = false;"
cur.execute(sql)
cur.close()
conn.close()
You can use f-strings to fill in (part of) your filename. Python has the datetime module to handle dates and times.
from datetime import datetime
date = datetime.now().strftime('%Y%m%d')
myFileName = f'LH_TBL_FIRST{date}.csv.gz'
print(myFileName)
>>> LH_TBL_FIRST20200908.csv.gz
As for errors in your code:
you declare your cursor as cs = ctx.cursor(), but further along you use cur.execute(...) and cur.close(...). These won't work. Run your code to find the errors and fix them.
Edit suggested by @Lysergic:
If your python version is too old, you could use str.format().
myFileName = 'LH_TBL_FIRST{0}.csv.gz'.format(date)
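Putting the pieces together, a corrected skeleton could look like this (untested; credentials masked as in your code, multi-line SQL moved into a triple-quoted f-string, and I'm assuming the stage is referenced with @):
import snowflake.connector
from datetime import datetime

date = datetime.now().strftime('%Y%m%d')
file_name = f'LH_TBL_FIRST{date}.csv.gz'   # e.g. LH_TBL_FIRST20200908.csv.gz

ctx = snowflake.connector.connect(
    user='*****',
    password='*******',
    account='********',
    warehouse='*******',
    database='********',
    schema='********'
)
cs = ctx.cursor()

sql = f"""copy into @unload_gcs/{file_name}
from ( select * from TEST_BASE.LH_TBL_FIRST )
file_format = ( type=csv compression='gzip'
                FIELD_DELIMITER = ','
                field_optionally_enclosed_by='"'
                NULL_IF=()
                EMPTY_FIELD_AS_NULL = FALSE )
single = false
max_file_size = 5300000000
header = false;"""

cs.execute(sql)
cs.close()
ctx.close()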
from datetime import datetime

class FileNameWithDateTime(object):
    def __init__(self, fileNameAppender, fileExtension="txt"):
        self.fileNameAppender = fileNameAppender
        self.fileExtension = fileExtension

    def appendCurrentDateTimeInFileName(self, filePath):
        currentTime = self.fileNameAppender
        print(currentTime.strftime("%Y%m%d"))
        filePath += currentTime.strftime("%Y%m%d")
        filePath += "." + self.fileExtension
        try:
            with open(filePath, "a") as fwrite1:
                fwrite1.write(filePath)
        except OSError as oserr:
            print("Error while writing ", oserr)
I take the following approach
from datetime import datetime

# defining what time/date-related values your variable will contain
date_id = (datetime.today()).strftime('%Y%m%d')
Write the output file.
# Creating the filename
with open(date_id + "_" + "LH_TBL.csv.gz", 'w') as gzip:
Output: YYYYMMDD_filename, e.g.
20200908_filename

Calling Argument in Python code

I have the following code that demands a -p argument when calling. However, how do I use the -p argument in the SQL query? I am also looking to use this -p argument text in the output file name.
#!/usr/bin/python

import argparse
import psycopg2
import csv

parser = argparse.ArgumentParser(description='insert the project ID as an argument')
parser.add_argument('-p', '--project_id', help='project_id to pull files from ERAPRO', required=True)
args = parser.parse_args()

conn = psycopg2.connect(database="XXX", user="XXX", password="XXX", host="XXX", port="5432")
cur = conn.cursor()
cur.execute("""SELECT project_analysis.project_accession,
analysis.analysis_accession, file.filename, file.file_md5, file.file_location
FROM project_analysis
LEFT JOIN analysis on project_analysis.analysis_accession = analysis.analysis_accession
LEFT JOIN analysis_file on analysis.analysis_accession = analysis_file.analysis_accession
LEFT JOIN file on analysis_file.file_id = file.file_id
WHERE project_accession = <INSERT -p ARGUMENT HERE> and analysis.hidden_in_eva = '0';""")
records = cur.fetchall()

with open('/nfs/production3/eva/user/gary/evapro_ftp/<INSERT -p ARGUMENT HERE>.csv', 'w') as f:
    writer = csv.writer(f, delimiter=',')
    for row in records:
        writer.writerow(row)

conn.close()
All help appreciated.
Thanks
First, assign your argument to a variable using the dest argument of add_argument(). Let's say we assign the input to the project_id variable.
This way we can reference it in the code:
parser.add_argument('-p', '--project_id',
                    help='project_id to pull files from ERAPRO',
                    required=True,
                    dest='project_id')  # notice the dest argument

cur.execute("""SELECT project_analysis.project_accession,
analysis.analysis_accession, file.filename, file.file_md5, file.file_location
FROM project_analysis
LEFT JOIN analysis on project_analysis.analysis_accession = analysis.analysis_accession
LEFT JOIN analysis_file on analysis.analysis_accession = analysis_file.analysis_accession
LEFT JOIN file on analysis_file.file_id = file.file_id
WHERE project_accession = %s and analysis.hidden_in_eva = '0';""", (args.project_id,))

Notice the use of execute(' ... %s ...', (args.project_id,)): by doing this we interpolated the value referenced by project_id into the query. The trailing comma makes the second argument a one-element tuple, which is what psycopg2 expects.
After calling args = parser.parse_args() you can obtain the value of the arguments like this:
pid = args.project_id
Then you can use that value of pid in your code by using normal string substitution. However, it's better to use psycopg2's inbuilt method for passing parameters to SQL queries to prevent SQL injection.
Normal string substitution:
'hello world {}'.format(var_name)
psycopg2:
cur.execute('SELECT * FROM project_analysis WHERE project_accession = %s', (var_name,))
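To also use the -p value in the output file name (the second part of your question), you can format it into the path from your script:
output_path = '/nfs/production3/eva/user/gary/evapro_ftp/{}.csv'.format(args.project_id)
with open(output_path, 'w') as f:
    writer = csv.writer(f, delimiter=',')
    for row in records:
        writer.writerow(row)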

How to JSON dump to a rotating file object

I'm writing a program which periodically dumps old data from a RethinkDB database into a file and removes it from the database. Currently, the data is dumped into a single file which grows without limit. I'd like to change this so that the maximum file size is, say, 250 Mb, and the program starts to write to a new output file just before this size is exceeded.
It seems like Python's RotatingFileHandler class for loggers does approximately what I want; however, I'm not sure whether logging can be applied to any JSON-dumpable object or just to strings.
Another possible approach would be to use (a variant of) Mike Pennington's
RotatingFile class (see python: outfile to another text file if exceed certain file size).
Which of these approaches is likely to be the most fruitful?
For reference, my current program is as follows:
import os
import sys
import json
import rethinkdb as r
import pytz
from datetime import datetime, timedelta
import schedule
import time
import functools
from iclib import RethinkDB
import msgpack

''' The purpose of the Controller is to periodically archive data from the "sensor_data" table so that it does not grow without limit.'''

class Controller(RethinkDB):
    def __init__(self, db_address=(os.environ['DB_ADDR'], int(os.environ['DB_PORT'])), db_name=os.environ['DB_NAME']):
        super(Controller, self).__init__(db_address=db_address, db_name=db_name)  # Initialize the IperCronComponent with the default logger name (in this case, "Controller")
        self.db_table = RethinkDB.SENSOR_DATA_TABLE  # The table name is "sensor_data" and is stored as a class variable in RethinkDBMixIn

    def generate_archiving_query(self, retention_period=timedelta(days=3)):
        expiry_time = r.now() - retention_period.total_seconds()  # Timestamp before which data is to be archived
        if "timestamp" in r.table(self.db_table).index_list().run(self.db):  # If "timestamp" is a secondary index
            beginning_of_time = r.time(1400, 1, 1, 'Z')  # The minimum time of a ReQL time object (i.e., the year 1400 in the UTC timezone)
            data_to_archive = r.table(self.db_table).between(beginning_of_time, expiry_time, index="timestamp")  # Generate query using "between" (faster)
        else:
            data_to_archive = r.table(self.db_table).filter(r.row['timestamp'] < expiry_time)  # Generate the same query using "filter" (slower, but does not require "timestamp" to be a secondary index)
        return data_to_archive

    def archiving_job(self, data_to_archive=None, output_file="archived_sensor_data.json"):
        if data_to_archive is None:
            data_to_archive = self.generate_archiving_query()  # By default, call the "generate_archiving_query" function to generate the query
        old_data = data_to_archive.run(self.db, time_format="raw")  # Without time_format="raw" the output does not dump to JSON
        with open(output_file, 'a') as f:
            ids_to_delete = []
            for item in old_data:
                print item
                # msgpack.dump(item, f)
                json.dump(item, f)
                f.write('\n')  # Separate each document by a new line
                ids_to_delete.append(item['id'])
        r.table(self.db_table).get_all(r.args(ids_to_delete)).delete().run(self.db)  # Delete based on ID. It is preferred to delete the entire batch in a single operation rather than to delete them one by one in the for loop.

def test_job_1():
    db_name = "ipercron"
    table_name = "sensor_data"
    port_offset = 1  # To avoid interference of this testing program with the main program, all ports are initialized at an offset of 1 from the default ports using "rethinkdb --port_offset 1" at the command line.
    conn = r.connect("localhost", 28015 + port_offset)
    r.db(db_name).table(table_name).delete().run(conn)
    import rethinkdb_add_data
    controller = Controller(db_address=("localhost", 28015 + port_offset))
    archiving_job = functools.partial(controller.archiving_job, data_to_archive=controller.generate_archiving_query())
    return archiving_job

if __name__ == "__main__":
    archiving_job = test_job_1()
    schedule.every(0.1).minutes.do(archiving_job)
    while True:
        schedule.run_pending()
It is not completely 'runnable' from the part shown, but the key point is that I would like to replace the line
json.dump(item, f)
with a similar line in which f is a rotating, and not fixed, file object.
Following Stanislav Ivanov, I used json.dumps to convert each RethinkDB document to a string and wrote this to a RotatingFileHandler:
import os
import sys
import json
import rethinkdb as r
import pytz
from datetime import datetime, timedelta
import schedule
import time
import functools
from iclib import RethinkDB
import msgpack
import logging
from logging.handlers import RotatingFileHandler
from random_data_generator import RandomDataGenerator

''' The purpose of the Controller is to periodically archive data from the "sensor_data" table so that it does not grow without limit.'''

os.environ['DB_ADDR'] = 'localhost'
os.environ['DB_PORT'] = '28015'
os.environ['DB_NAME'] = 'ipercron'

class Controller(RethinkDB):
    def __init__(self, db_address=None, db_name=None):
        if db_address is None:
            db_address = (os.environ['DB_ADDR'], int(os.environ['DB_PORT']))  # The default host ("rethinkdb") and port (28015) are stored as environment variables
        if db_name is None:
            db_name = os.environ['DB_NAME']  # The default database is "ipercron" and is stored as an environment variable
        super(Controller, self).__init__(db_address=db_address, db_name=db_name)  # Initialize the instance of the RethinkDB class. IperCronComponent will be initialized with its default logger name (in this case, "Controller")
        self.db_name = db_name
        self.db_table = RethinkDB.SENSOR_DATA_TABLE  # The table name is "sensor_data" and is stored as a class variable of RethinkDBMixIn
        self.table = r.db(self.db_name).table(self.db_table)
        self.archiving_logger = logging.getLogger("archiving_logger")
        self.archiving_logger.setLevel(logging.DEBUG)
        self.archiving_handler = RotatingFileHandler("archived_sensor_data.log", maxBytes=2000, backupCount=10)
        self.archiving_logger.addHandler(self.archiving_handler)

    def generate_archiving_query(self, retention_period=timedelta(days=3)):
        expiry_time = r.now() - retention_period.total_seconds()  # Timestamp before which data is to be archived
        if "timestamp" in self.table.index_list().run(self.db):
            beginning_of_time = r.time(1400, 1, 1, 'Z')  # The minimum time of a ReQL time object (namely, the year 1400 in UTC)
            data_to_archive = self.table.between(beginning_of_time, expiry_time, index="timestamp")  # Generate query using "between" (faster, requires "timestamp" to be a secondary index)
        else:
            data_to_archive = self.table.filter(r.row['timestamp'] < expiry_time)  # Generate query using "filter" (slower, but does not require "timestamp" to be a secondary index)
        return data_to_archive

    def archiving_job(self, data_to_archive=None):
        if data_to_archive is None:
            data_to_archive = self.generate_archiving_query()  # By default, call the "generate_archiving_query" function to generate the query
        old_data = data_to_archive.run(self.db, time_format="raw")  # Without time_format="raw" the output does not dump to JSON or msgpack
        ids_to_delete = []
        for item in old_data:
            print item
            self.dump(item)
            ids_to_delete.append(item['id'])
        self.table.get_all(r.args(ids_to_delete)).delete().run(self.db)  # Delete based on ID. It is preferred to delete the entire batch in a single operation rather than to delete them one by one in the for-loop.

    def dump(self, item, mode='json'):
        if mode == 'json':
            dump_string = json.dumps(item)
        elif mode == 'msgpack':
            dump_string = msgpack.packb(item)
        self.archiving_logger.debug(dump_string)

def populate_database(db_name, table_name, conn):
    if db_name not in r.db_list().run(conn):
        r.db_create(db_name).run(conn)  # Create the database if it does not yet exist
    if table_name not in r.db(db_name).table_list().run(conn):
        r.db(db_name).table_create(table_name).run(conn)  # Create the table if it does not yet exist
    r.db(db_name).table(table_name).delete().run(conn)  # Empty the table to start with a clean slate
    # Generate random data with timestamps uniformly distributed over the past 6 days
    random_data_time_interval = timedelta(days=6)
    start_random_data = datetime.utcnow().replace(tzinfo=pytz.utc) - random_data_time_interval
    random_generator = RandomDataGenerator(seed=0)
    packets = random_generator.packets(N=100, start=start_random_data)
    # print packets
    print "Adding data to the database..."
    r.db(db_name).table(table_name).insert(packets).run(conn)

if __name__ == "__main__":
    db_name = "ipercron"
    table_name = "sensor_data"
    port_offset = 1  # To avoid interference of this testing program with the main program, all ports are initialized at an offset of 1 from the default ports using "rethinkdb --port_offset 1" at the command line.
    host = "localhost"
    port = 28015 + port_offset
    conn = r.connect(host, port)  # RethinkDB connection object
    populate_database(db_name, table_name, conn)
    # import rethinkdb_add_data
    controller = Controller(db_address=(host, port))
    archiving_job = functools.partial(controller.archiving_job, data_to_archive=controller.generate_archiving_query())  # This ensures that the query is only generated once. (This is sufficient since r.now() is re-evaluated every time a connection is made).
    schedule.every(0.1).minutes.do(archiving_job)
    while True:
        schedule.run_pending()
In this context the RethinkDB class does little other than define the class variable SENSOR_DATA_TABLE and the RethinkDB connection, self.db = r.connect(self.address[0], self.address[1]). This is run together with a module for generating fake data, random_data_generator.py:
import random
import faker
from datetime import datetime, timedelta
import pytz
import rethinkdb as r

class RandomDataGenerator(object):
    def __init__(self, seed=None):
        self._seed = seed
        self._random = random.Random()
        self._random.seed(seed)
        self.fake = faker.Faker()
        self.fake.random.seed(seed)

    def __getattr__(self, x):
        return getattr(self._random, x)

    def name(self):
        return self.fake.name()

    def datetime(self, start=None, end=None):
        if start is None:
            start = datetime(2000, 1, 1, tzinfo=pytz.utc)  # Jan 1st 2000
        if end is None:
            end = datetime.utcnow().replace(tzinfo=pytz.utc)
        if isinstance(end, datetime):
            dt = end - start
        elif isinstance(end, timedelta):
            dt = end
        assert isinstance(dt, timedelta)
        random_dt = timedelta(microseconds=self._random.randrange(int(dt.total_seconds() * (10 ** 6))))
        return start + random_dt

    def packets(self, N=1, start=None, end=None):
        return [{'name': self.name(), 'timestamp': self.datetime(start=start, end=end)} for _ in range(N)]
When I run the controller it produces several rolled-over output logs, each at most 2 kB in size, as expected.
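Stripped of the RethinkDB details, the core pattern is simply a logger with a RotatingFileHandler that receives the json.dumps output (the sample document here is made up):
import json
import logging
from logging.handlers import RotatingFileHandler

archiving_logger = logging.getLogger("archiving_logger")
archiving_logger.setLevel(logging.DEBUG)
archiving_logger.addHandler(RotatingFileHandler("archived_sensor_data.log", maxBytes=2000, backupCount=10))

document = {"id": "abc123", "name": "example", "timestamp": 1478000000}  # hypothetical document
archiving_logger.debug(json.dumps(document))  # one JSON document per log record, i.e. per line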

Copy parameters into list

I am trying to copy parameters passed into a Python script to a file. Here are the parameters:
["0013","1","1","\"john.dow#gmail.com\"","1","P123-ND 10Q","10Q H??C"]
I understand that there is a buffer problem and I am getting bad data in my parameters. However, I do not have control over what is being passed in. I am trying to copy the parameters into a file, starting at the 5th parameter.
f = open(in_file_name, 'w')
for x in range(5, len(arg_list)):
    f.write(arg_list[x] + '\n')
f.close()
The result of the file is below:
P123-ND 10Q
10Q H??C
Here is what it should be:
P123-ND
10Q
How can I not include the bad data? What is happening to the spaces between the valid information and the bad information?
As requested, here is the full program:
#!/bin/python

class Argument_Indices:
    PRINTER_INDEX = 0
    AREA_INDEX = 1
    LABEL_INDEX = 2
    EMAIL_INDEX = 3
    RUN_TYPE_INDEX = 4

import argparse
import json
import os
from subprocess import call
import sys
from time import strftime

def _handle_args():
    ''' Setup and run argparse '''
    parser = argparse.ArgumentParser(description='Set environment variables for and to call Program')
    parser.add_argument('time_to_run', default='NOW', choices=['NOW', 'EOP'], help='when to run the report')
    parser.add_argument('arguments', nargs='+', help='the remaining command line arguments')
    return parser.parse_args()

def _process_program(arg_list):
    time_stamp = strftime("%d_%b_%Y_%H_%M_%S")
    printer = arg_list[Argument_Indices.PRINTER_INDEX]
    area = arg_list[Argument_Indices.AREA_INDEX]
    label = arg_list[Argument_Indices.LABEL_INDEX]
    in_file_name = "/tmp/program{0}.inp".format(time_stamp)
    os.environ['INPUT_FILE'] = in_file_name
    f = open(in_file_name, 'w')
    for x in range(5, len(arg_list)):
        f.write(arg_list[x])
    f.close()
    call(['./Program.bin', printer, area, label])
    os.remove(in_file_name)

def main():
    ''' Main Function '''
    arg_list = None
    args = _handle_args()
    if len(args.arguments) < 1:
        print('Missing name of input file')
        return -1
    with open(args.arguments[0]) as input_file:
        arg_list = json.load(input_file)
    _process_program(arg_list)
    return 0

if __name__ == '__main__':
    if main() != 0:
        print('Program run failed')
        sys.exit()
For your exact case (where you're getting duplicated parameters with some spaces in between), this would work:
received_param_list = ["0013","1","1","\"john.dow#gmail.com\"","1","P123-ND 10Q","10Q H??C"]
arg_list = [i.split(" ")[0] for i in received_param_list]
last_param = received_param_list[-1].split()[-1]
if last_param != arg_list[-1]:
    arg_list.append(last_param)

for x in range(5, len(arg_list)):
    print(arg_list[x])
Although there might be another, simpler way.
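For instance, a shorter sketch (assuming the duplicated tail always starts after the first space) keeps only the first token of each original parameter from index 5 onward and writes it to the file:
# arg_list here is the original parameter list from the question, in_file_name the original output path
with open(in_file_name, 'w') as f:
    for param in arg_list[5:]:
        f.write(param.split()[0] + '\n')
This writes "P123-ND" and "10Q", matching the expected file contents.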
