This Non-Threaded script unexpectedly runs faster than the Threaded version - python

I have a python script which validates data fetched from some rows in a database and then logs the errors in a different table in the same database.
The script validates each row and marks it as validated with has_error = True/False depending on the validation outcome. This process is repeated for each row. With that, I thought I'd add some steroids by creating threads so that the validation of each row is done by an independent thread, thus reducing the time it takes to validate a batch of rows.
To my surprise, I find that the threaded script is taking slightly longer than the non-threaded one. On average, to validate 1502 rows of data the non-threaded script takes 1.5 seconds while the threaded script takes 2.27 seconds. That might not be much, but ideally I'll be running through 2 million records at a go, so that time overhead will be significant. That, plus I would assume that threaded apps would finish faster! :-)
The two scripts clock the same time of about 0.01 seconds up to the point of creating threads. By this point the SQLAlchemy session is created and all the data to be validated and its relations, i.e. foreign keys etc., are fetched. From there though, the non-threaded script finishes faster. Below is my code.
1.0 Non-Threaded Script
#A lot of code goes above this to fetch the data that is passed on to the validator function.
#However, the two scripts are the same up to this point in regards to time taken, so I didn't see the need to post it.
for lf_detail_id in load_file_detail_id:
    params = lf_detail_id, load_file_id, entry_number[lf_detail_counter], \
             data[lf_detail_counter], template_version[lf_counter], \
             load_file_detail, error, dt_file, dt_columns
    data_list.append(params)
    lf_detail_counter += 1
    no_of_records += 1
validator = Validate()
validator.validator(no_of_records, data_list)
record_counter += lf_detail_counter
data_list = None
no_of_records = 0
print("Validated '%s': seconds %s" % (filename[lf_counter], time.time() - file_start_time))  #print time it took to run
#Mark the load file as validated
is_done = load_file.set_validation(load_file_id, True)
if is_done == False:
    raise Exception("Can't update load_file's is_validated parameter: ", lf_detail_id)
#Reset counters
lf_detail_counter = 0
lf_counter += 1
#Commit the entire transaction.
session.commit()
print("NoThread:Finished validating %s file(s) with %s record(s) in %s seconds\n" % (lf_counter, record_counter, time.time() - process_start_time))
1.1. Validation Function for Non-Threaded Script
class Validate():
    has_error = None

    def validator(self, loop_length, job):
        '''Validate data'''
        for row_counter in range(loop_length):
            load_file_detail_id, load_file_id, entry_number, data, \
                template_version, load_file_detail, error, dt_file, dt_columns = job[row_counter]
            error_detail = ErrorLogDetail()
            if data.strip() == "":
                error_detail.errorlog = error
                error_detail.load_file_detail_id = load_file_detail_id
                error_detail.pos_row = entry_number
                error_detail.pos_col = None
                error_detail.column_name = None
                error_detail.value_provided = None
                error_detail.description = "error message 1"
                session.add(error_detail)
                error_detail = ErrorLogDetail()
                self.has_error = True
                self.set_validation(load_file_detail, load_file_detail_id, True, False)
                continue
            elif len(data) != int(dt_file.data_length):
                error_detail.errorlog = error
                error_detail.load_file_detail_id = load_file_detail_id
                error_detail.pos_row = entry_number
                error_detail.pos_col = None
                error_detail.column_name = None
                error_detail.value_provided = None
                error_detail.description = "error message 2"
                session.add(error_detail)
                error_detail = ErrorLogDetail()
                self.has_error = True
                self.set_validation(load_file_detail, load_file_detail_id, True, False)
                continue
            else:
                #Continue with extra validation
                #If the record passes all validation then mark it as has_error = False
                if self.has_error == False:
                    self.set_validation(load_file_detail, load_file_detail_id, False, True)
                else:
                    self.has_error = False
            #jobs.task_done() appears only in the threaded script; it is not used here.
2.0 Threaded Script
#A lot of code goes above this to fetch the data that is passed on to the validator function.
#However, the two scripts are the same up to this point in regards to time taken, so I didn't see the need to post it.
for lf_detail_id in load_file_detail_id:
    params = lf_detail_id, load_file_id, entry_number[lf_detail_counter], \
             data[lf_detail_counter], template_version[lf_counter], \
             load_file_detail, error, dt_file, dt_columns
    data_list.append(params)
    lf_detail_counter += 1
    queue_size += 1
    if queue_size == THREAD_LIMIT:
        myqueuing(queue_size, data_list)
        queue_size = 0
#Spawn a pool of threads for the remainder and pass them the queue instance
if queue_size > 0:
    myqueuing(queue_size, data_list)
#Keep record of rows processed
record_counter += lf_detail_counter
print("Validated '%s': seconds- %s " % (filename[lf_counter], time.time() - file_start_time))  #print time it took to run
#Mark the load file as validated
is_done = load_file.set_validation(load_file_id, True)
if is_done == False:
    raise Exception("Can't update load_file's is_validated parameter: ", lf_detail_id)
#Commit the entire transaction.
session.commit()
#Reset counters
lf_detail_counter = 0
lf_counter += 1
data_list = None
queue_size = 0
print("HasThread:Finished loading %s file(s) with %s record(s) in %s seconds\n" % (lf_counter, record_counter, time.time() - process_start_time))  #print time it took to run
2.1. Threaded Validation Function
THREAD_LIMIT = 50              # This is how many threads we want
jobs = queue.Queue()           # This sets up the (unbounded) queue object that holds the jobs
singlelock = threading.Lock()  # This is a lock so threads don't print through each other (and other reasons)

def myqueuing(queuesize, data):
    '''Put the fetched data in a queue and instantiate threads to
    process the queue'''
    # Spawn the threads
    is_valid_date("20131212", True)  #Calling this here to avoid a bug in time.strptime() when threading
    for x in range(queuesize):
        # This is the thread class that we instantiate.
        workerbee().start()
    # Put stuff in queue
    for i in range(queuesize):
        # Block if the queue is full and wait 2 seconds; after that a queue.Full error is raised.
        try:
            jobs.put(data[i], block=True, timeout=2)
        except queue.Full:
            singlelock.acquire()
            print("The queue is full !")
            singlelock.release()
    # Wait for the threads to finish
    singlelock.acquire()   # Acquire the lock so we can print
    print("Waiting for threads to finish.")
    singlelock.release()   # Release the lock
    jobs.join()            # This command waits for all threads to finish.
class workerbee(threading.Thread):

    def __init__(self):
        threading.Thread.__init__(self)
        self.lock = threading.Lock()
        self.has_error = False

    def run(self):
        '''Validates the data.'''
        #try:
        job = jobs.get(True, 1)
        load_file_detail_id, load_file_id, entry_number, data, \
            template_version, load_file_detail, error, dt_file, dt_columns = job
        error_detail = ErrorLogDetail()
        #Again please note that this part is identical for both the non-threaded and the threaded script.
        #After each pass on a record, the record is marked as validated and whether has_error = True/False
        if data.strip() == "":
            error_detail.errorlog = error
            error_detail.load_file_detail_id = load_file_detail_id
            error_detail.pos_row = entry_number
            error_detail.pos_col = None
            error_detail.column_name = None
            error_detail.value_provided = None
            error_detail.description = "error message 1"
            session.add(error_detail)
            error_detail = ErrorLogDetail()
            self.has_error = True
            self.set_validation(load_file_detail, load_file_detail_id, True, True)
        elif len(data) != int(dt_file.data_length):
            error_detail.errorlog = error
            error_detail.load_file_detail_id = load_file_detail_id
            error_detail.pos_row = entry_number
            error_detail.pos_col = None
            error_detail.column_name = None
            error_detail.value_provided = None
            error_detail.description = "error message 2"
            session.add(error_detail)
            error_detail = ErrorLogDetail()
            self.has_error = True
            self.set_validation(load_file_detail, load_file_detail_id, True, True)
        else:
            #Continue with further validation - about 5 other validation checks
            #If the record passes all validation then mark it as has_error = False
            if self.has_error == False:
                self.set_validation(load_file_detail, load_file_detail_id, False, True)
            else:
                self.has_error = False
        jobs.task_done()  #For the threaded script the job is marked as done; this line does not appear in the non-threaded script.
3.0. Common function for setting validation in both threaded and non-threaded
def set_validation(self, load_file_detail, load_file_detail_id, has_error, can_be_loaded):
    '''Mark the record as having been validated and whether has_error = True or False'''
    #print("haserror and canbeloaded ", has_error, can_be_loaded)
    is_done = load_file_detail.set_validation_and_error(load_file_detail_id, True, has_error, can_be_loaded)
    if is_done == False:
        raise Exception("Can't update load_file_detail's is_validated parameter: ", load_file_detail_id)
3.1. Actual SQLAlchemy session for saving the validation status
def set_validation_and_error(self, load_file_detail_id, is_validated, has_error, can_be_loaded):
    result = session.execute('UPDATE load_file_detail SET is_validated=%s, has_error=%s, can_be_loaded=%s WHERE id=%s' \
                             % (is_validated, has_error, can_be_loaded, load_file_detail_id))
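(As an aside, the same statement can be written with bound parameters instead of string interpolation; a sketch using SQLAlchemy's text() construct, same names as above, with a return value added as an assumption for the is_done check:)

from sqlalchemy import text

def set_validation_and_error(self, load_file_detail_id, is_validated, has_error, can_be_loaded):
    result = session.execute(
        text("UPDATE load_file_detail "
             "SET is_validated=:is_validated, has_error=:has_error, can_be_loaded=:can_be_loaded "
             "WHERE id=:id"),
        {"is_validated": is_validated, "has_error": has_error,
         "can_be_loaded": can_be_loaded, "id": load_file_detail_id})
    return result.rowcount > 0   # assumption: treat "no row updated" as failure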
So, the fetching of the data to be validated is the same, and both scripts take the same amount of time up to that point. The validation process is identical in both scripts, and saving to the DB is the same, i.e. sections 3.0 and 3.1 are shared by both scripts. The only difference is that one validates with multiple threads. So I'm wondering: is there something about multiple threads and SQLAlchemy that makes the app slower in threaded mode? Have I implemented the threaded function properly? Or is threading simply not suitable in this scenario? Suggestions welcome.

You should create a Queue for logging and add a dedicated "logger" thread; once you remove the locks the code should be faster.
Also create a DB connection in each thread so that data can be fetched in parallel.
Because of the GIL, threads only parallelize C-library calls (blocking I/O, database drivers and the like).
To parallelize Python code you must use multiprocessing.
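For the logger-thread idea, here is a minimal sketch (ErrorLogDetail comes from the question; session_factory, the dicts put on the queue and the Session name are assumptions): the validation threads only put plain dicts on the queue, and a single writer thread owns the one session that touches the error table.

import queue
import threading

error_queue = queue.Queue()

def error_logger(session_factory):
    '''Single writer thread: drains error dicts and owns its own DB session.'''
    session = session_factory()            # e.g. a sessionmaker(); every thread gets its own session
    while True:
        item = error_queue.get()
        if item is None:                   # sentinel: stop the logger
            error_queue.task_done()
            break
        session.add(ErrorLogDetail(**item))
        error_queue.task_done()
    session.commit()

threading.Thread(target=error_logger, args=(Session,), daemon=True).start()  # Session is an assumed sessionmaker
# Validation threads never touch a shared session; they only do e.g.
#   error_queue.put({"load_file_detail_id": lf_id, "pos_row": n, "description": "error message 1"})
# and the main thread finishes with:
#   error_queue.put(None); error_queue.join()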
I wrote a test for you that shows the different ways to process an iterable:
def produce_data(amount=100000, invalid=1, valid=10):
    # produce_data = sql('request').getall()
    import random
    id = 0
    data = [True]*valid + [False]*invalid
    while id < amount:
        id += 1
        yield (id, random.choice(data))

def validate(row):
    if row[1]:
        time.sleep(0.001)  #valid sql request emulation.
        return True
    else:
        time.sleep(0.001)  #invalid sql request emulation.
        return False

def single():
    for row in produce_data():
        validate(row)

def targeted():
    import threading
    for row in produce_data():
        threading.Thread(target=validate, args=(row,)).start()

Uley = 50

class Bee(object):
    error = False
    running = True

    def __init__(self, queue, *args, **kwargs):
        self.queue = queue  #don't use any global variable!
        # every bee must have a unique db connection and session.
        #self.session = db.connection().session()
        # initialize it there.
        return super(Bee, self).__init__(*args, **kwargs)

    def run(self):
        while self.running:
            data = self.queue.get()
            if data:
                self.error = validate(data)  # refactor to self.validate(data) to be able to get a cursor from self.session.
                self.queue.task_done()
            else:
                self.queue.task_done()
                break
        #self.session.commit()

def threaded():
    import threading, Queue
    class ThreadedBee(Bee, threading.Thread): pass
    q = Queue.Queue()
    for i in range(Uley):  #bees started before data was provided.
        bee = ThreadedBee(q)
        bee.daemon = True
        bee.start()
    for row in produce_data():  #you don't need to fetch all the data before processing; at this point there should be a cursor over the response.
        q.put(row)
    q.join()
    for i in range(Uley):
        q.put(None)

def forked():
    from multiprocessing import Process, JoinableQueue
    class ForkedBee(Bee, Process): pass
    q = JoinableQueue()
    for i in range(Uley):
        bee = ForkedBee(q)
        bee.start()
    for row in produce_data():
        q.put(row)
    q.join()
    #at this point you need to kill the zomBees :-)
    for i in range(Uley):
        q.put(None)
    q.close()

def pool():
    from multiprocessing import Pool
    pool = Pool(processes=Uley)
    pool.map(validate, produce_data())

if __name__ == "__main__":
    import time
    s = time.time()
    single()
    print(time.time() - s)    #109
    s = time.time()
    targeted()
    print(time.time() - s)    #6
    s = time.time()
    threaded()
    print(time.time() - s)    #12
    s = time.time()
    forked()
    print(time.time() - s)    #6
    s = time.time()
    pool()
    print(time.time() - s)    #4
test result:
$ python2 tgreads.py
109.779700994
5.84457302094
12.3814198971
5.97618508339
3.69856286049
targeted will flood the CPU and memory, and you can't give it individual DB connections; using a shared connection is not safe. If you want to go that way you need to provide an output queue and implement a collector that communicates with the DB. pool is the shortest code and the fastest, but it is not friendly to initializing per-worker connections.
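If you do want per-worker DB connections with Pool, one option is its initializer hook (a sketch; the object() placeholder below just marks where your engine/session would be created):

from multiprocessing import Pool

worker_session = None

def init_worker():
    # Runs once in every worker process, so each worker owns its connection/session.
    global worker_session
    worker_session = object()    # placeholder: e.g. sessionmaker(bind=create_engine(...))()

def validate_row(row):
    # use worker_session for the real SQL; here we just reuse validate() from the test above
    return validate(row)

def pooled():
    p = Pool(processes=Uley, initializer=init_worker)
    try:
        return p.map(validate_row, produce_data())
    finally:
        p.close()
        p.join()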

Related

Multiprocessing In Django Function

Is it possible to use multiprocessing in Django within a request?
#So if I send a request to http://127.0.0.1:8000/wallet_verify
def wallet_verify(request):
    walelts = botactive.objects.all()
    #Here I check if the user wants to be included in the process; if they set it to True I include them, else I ignore them.
    for active in walelts:
        check_active = active.active
        if check_active == True:
            user_is_active = active.user
            #For the ones that want to be included I then go and get their key data.
            #I need both the api key and secret, so I loop through to get the data for the active users.
            database = Bybitapidatas.objects.filter(user=user_is_active)
            for apikey in database:
                apikey = apikey.apikey
            for apisecret in database:
                apisecret = apisecret.apisecret
            #Since I am making a request to an exchange endpoint I can only include one API key and secret at a time.
            #So it's one person at a time; this is why I want to run in parallel.
            for a, b in zip(list(Bybitapidatas.objects.filter(user=user_is_active).values("apikey")), list(Bybitapidatas.objects.filter(user=user_is_active).values("apisecret"))):
                session = spot.HTTP(endpoint='https://api-testnet.bybit.com/', api_key=a['apikey'], api_secret=b['apisecret'])
            #Here I check to see if they have balance to open trades if they have selected to be included.
            GET_USDT_BALANCE = session.get_wallet_balance()['result']['balances']
            for i in GET_USDT_BALANCE:
                if 'USDT' in i.values():
                    GET_USDT_BALANCE = session.get_wallet_balance()['result']['balances']
                    idx_USDT = GET_USDT_BALANCE.index(i)
                    GET_USDTBALANCE = session.get_wallet_balance()['result']['balances'][idx_USDT]['free']
                    print(round(float(GET_USDTBALANCE), 2))
            #If they don't have enough balance I skip the user.
            if round(float(GET_USDTBALANCE), 2) < 11:
                pass
            else:
                session.place_active_order(
                    symbol="BTCUSDT",
                    side="Buy",
                    type="MARKET",
                    qty=10,
                    timeInForce="GTC"
                )
How can I run this process in parallel while looping through the database to also get the data for each individual user?
I am still new to coding, so I hope I explained it in a way that makes sense.
I have tried multiprocessing and pools, but then I get an error that the app has not started yet and that I have to run it outside of wallet_verify. Is there a way to do it inside wallet_verify, when I send the POST request?
Any help appreciated.
Filter the database to get the users who have set it to True (listi ends up as e.g. [1, 3], the returned user IDs):
processess = botactive.objects.filter(active=True).values_list('user')
listi = [row[0] for row in processess]
Get the users from listi and perform the action:
def wallet_verify(listi):
    # print(listi)
    database = Bybitapidatas.objects.filter(user=listi)
    print("---------------------------------------------------- START")
    for apikey in database:
        apikey = apikey.apikey
        print(apikey)
    for apisecret in database:
        apisecret = apisecret.apisecret
        print(apisecret)
    start_time = time.time()
    session = spot.HTTP(endpoint='https://api-testnet.bybit.com/', api_key=apikey, api_secret=apisecret)
    GET_USDT_BALANCE = session.get_wallet_balance()['result']['balances']
    for i in GET_USDT_BALANCE:
        if 'USDT' in i.values():
            GET_USDT_BALANCE = session.get_wallet_balance()['result']['balances']
            idx_USDT = GET_USDT_BALANCE.index(i)
            GET_USDTBALANCE = session.get_wallet_balance()['result']['balances'][idx_USDT]['free']
            print(round(float(GET_USDTBALANCE), 2))
    if round(float(GET_USDTBALANCE), 2) < 11:
        pass
    else:
        session.place_active_order(
            symbol="BTCUSDT",
            side="Buy",
            type="MARKET",
            qty=10,
            timeInForce="GTC"
        )
    print("My program took", time.time() - start_time, "to run")
    print("---------------------------------------------------- END")
    return HttpResponse("Wallets verified")
verifyt is what I use for the multiprocessing, since I don't want it to run without being requested. The initializer starts the Django apps in each worker process.
def verifyt(request):
    with ProcessPoolExecutor(max_workers=4, initializer=django.setup) as executor:
        results = executor.map(wallet_verify, listi)
    return HttpResponse("done")

Fetching data in realtime from database in python

I have a class for multiprocessing in Python which creates 3 different processes. The first process checks whether there is any signal from my hardware and pushes it into a Queue, the second process gets the data out of the Queue and pushes it into a database, and the third process gets the data out of the database and pushes it to a server.
obj = QE()
stdFunct = standardFunctions()
watchDogProcess = multiprocessing.Process(target=obj.watchDog)
watchDogProcess.start()
pushToDBSProcess = multiprocessing.Process(target=obj.pushToDBS)
pushToDBSProcess.start()
pushToCloud = multiprocessing.Process(target=stdFunct.uploadCycleTime)
pushToCloud.start()
watchDogProcess.join()
pushToDBSProcess.join()
pushToCloud.join()
My first two processes are running perfectly as desired; however, I am struggling with the third one. The following is the code of my third process:
def uploadCycleTime(self):
    while True:
        uploadCycles = []
        lastUpPointer = "SELECT id FROM lastUploaded"
        lastUpPointer = self.dbFetchone(lastUpPointer)
        lastUpPointer = lastUpPointer[0]
        # print("lastUploaded :"+str(lastUpPointer))
        cyclesToUploadSQL = "SELECT id,machineId,startDateTime,endDateTime,type FROM cycletimes WHERE id > "+str(lastUpPointer)
        cyclesToUpload = self.dbfetchMany(cyclesToUploadSQL,15)
        cyclesUploadLength = len(cyclesToUpload)
        if(cyclesUploadLength>0):
            for cycles in cyclesToUpload:
                uploadCycles.append({"dataId":cycles[0],"machineId":cycles[1],"startDateTime":cycles[2].strftime('%Y-%m-%d %H:%M:%S.%f'),"endDateTime":cycles[3].strftime('%Y-%m-%d %H:%M:%S.%f'),"type":cycles[4]})
            # print("length : "+str(cyclesUploadLength))
            lastUpPointer = uploadCycles[cyclesUploadLength-1]["dataId"]
            uploadCycles = json.dumps(uploadCycles)
            api = self.dalUrl+"/cycle-times"
            uploadResponse = self.callPostAPI(api,str(uploadCycles))
            print(lastUpPointer)
            changePointerSQL = "UPDATE lastUploaded SET id="+str(lastUpPointer)
            try:
                changePointerSQL = self.dbAbstraction(changePointerSQL)
            except Exception as errorPointer:
                print("Pointer change Error : "+str(errorPointer))
        time.sleep(2)
I am saving a pointer to remember the last id uploaded, and from there on I keep uploading 15 packets at a time. When there is already data in the DB when the process starts, the code works well; however, if there is no data when the process is started and data is inserted afterwards, it fails to fetch the new rows from the DB.
I tried printing the length in real time and it keeps giving me 0, in spite of data being continuously pushed into the DB.
In my fetch helper, I had missed out on a commit():
def dbFetchAll(self,dataString):
    # dataToPush = self.cycletimeQueue.get()
    # print(dataToPush)
    dbTry = 1
    try:
        while(dbTry == 1): # This while is to ensure the data has been pushed
            sql = dataString
            self.conn.execute(sql)
            response = self.conn.fetchall()
            dbTry = 0
            return response
            # print(self.conn.rowcount, "record inserted.")
    except Exception as error:
        print("Error : "+str(error))
        return dbTry
    finally:
        self.mydb.commit()   # <-- this was the missing commit
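The likely reason the SELECT kept returning nothing: with MySQL/InnoDB's default REPEATABLE READ isolation (an assumption about this setup), all reads inside one transaction see the snapshot taken at the first read, so rows committed later by another process stay invisible until the reading transaction ends. Committing (or rolling back) before each poll starts a fresh transaction. A minimal sketch of that polling pattern, with hypothetical names:

def poll_new_rows(conn, last_id, batch=15):
    '''One poll: conn is a DB-API connection (e.g. mysql.connector).
    Committing first ends the previous read transaction so this SELECT
    can see rows committed by other processes in the meantime.'''
    conn.commit()                      # refresh the read snapshot
    cur = conn.cursor()
    cur.execute("SELECT id, machineId, startDateTime, endDateTime, type "
                "FROM cycletimes WHERE id > %s LIMIT %s", (last_id, batch))
    rows = cur.fetchall()
    cur.close()
    return rows
    # called every 2 seconds from the upload loop, as in uploadCycleTime above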

Asynchronous error handling and response processing of an unbounded list of tasks using zeep

So here is my use case:
I read from a database rows containing information to make a complex SOAP call (I'm using zeep to do these calls).
One row from the database corresponds to a request to the service.
There can be up to 20 thousand lines, so I don't want to read everything in memory before making the calls.
I need to process the responses: when a response is OK, I need to store some returned information back into my database, and when there is an exception I need to process the exception for that particular request/response pair.
I also need to capture some external information at the time of the request creation, so that I know where to store the response from the request. In my current code I'm using the delightful property of gather() that makes the results come back in the same order.
I read the relevant PEPs and Python documentation but I'm still very confused, as there seems to be multiple ways to solve the same problem.
I also went through countless exercises on the web, but the examples are all trivial - it's either asyncio.sleep() or some webscraping with a finite list of urls.
The solution I have come up with so far kinda works - the asyncio.gather() method is very, very useful, but I have not been able to 'feed' it from a generator. I'm currently just counting up to an arbitrary size and then starting a .gather() operation. I've transcribed the code below, with the boring parts left out, and tried to anonymise it.
I've tried solutions involving semaphores, queues, different event loops, but I'm failing every time. Ideally I'd like to be able to create Futures 'continuously' - I think I'm missing the logic of 'convert this awaitable call to a future'
I'd be grateful for any help!
import asyncio
from asyncio import Future
import zeep
from zeep.plugins import HistoryPlugin

history = HistoryPlugin()
max_concurrent_calls = 5
provoke_errors = True

def export_data_async(db_variant: str, order_nrs: set):
    st = time.time()
    results = []
    loop = asyncio.get_event_loop()

    def get_client1(service_name: str, system: Systems = Systems.ACME) -> Tuple[zeep.Client, zeep.client.Factory]:
        client1 = zeep.Client(wsdl=system.wsdl_url(service_name=service_name),
                              transport=transport,
                              plugins=[history],
                              )
        factory_ns2 = client1.type_factory(namespace='ns2')
        return client1, factory_ns2

    table = 'ZZZZ'
    moveback_table = 'EEEEEE'
    moveback_dict = create_default_empty_ordered_dict('attribute1 attribute2 attribute3 attribute3')
    client, factory = get_client1(service_name='ACMEServiceName')
    if log.isEnabledFor(logging.DEBUG):
        client.wsdl.dump()
        zeep_log = logging.getLogger('zeep.transports')
        zeep_log.setLevel(logging.DEBUG)
    with Db(db_variant) as db:
        db.open_db(CON_STRING[db_variant])
        db.init_table_for_read(table, order_list=order_nrs)
        counter_failures = 0
        tasks = []
        sids = []
        results = []

        def handle_future(future: Future) -> None:
            results.extend(future.result())

        def process_tasks_concurrently() -> None:
            nonlocal tasks, sids, counter_failures, results
            futures = asyncio.gather(*tasks, return_exceptions=True)
            futures.add_done_callback(handle_future)
            loop.run_until_complete(futures)
            for i, response_or_fault in enumerate(results):
                if type(response_or_fault) in [zeep.exceptions.Fault, zeep.exceptions.TransportError]:
                    counter_failures += 1
                    log_webservice_fault(sid=sids[i], db=db, err=response_or_fault, object=table)
                else:
                    db.write_dict_to_table(
                        moveback_table,
                        {'sid': sids[i],
                         'attribute1': response_or_fault['XXX']['XXX']['xxx'],
                         'attribute2': response_or_fault['XXX']['XXX']['XXXX']['XXX'],
                         'attribute3': response_or_fault['XXXX']['XXXX']['XXX'],
                         }
                    )
            db.commit_db_con()
            tasks = []
            sids = []
            results = []
            return

        for row in db.rows(table):
            if int(row.id) % 2 == 0 and provoke_errors:
                payload = faulty_message_payload(row=row,
                                                 factory=factory,
                                                 )
            else:
                payload = message_payload(row=row,
                                          factory=factory,
                                          )
            tasks.append(client.service.myRequest(
                MessageHeader=factory.MessageHeader(**message_header_arguments(row=row)),
                myRequestPayload=payload,
                _soapheaders=[security_soap_header],
            ))
            sids.append(row.sid)
            if len(tasks) == max_concurrent_calls:
                process_tasks_concurrently()
        if tasks:  # this is the remainder of len(db.rows) % max_concurrent_calls
            process_tasks_concurrently()
        loop.run_until_complete(transport.session.close())
        db.execute_this_statement(statement=update_sql)
        db.commit_db_con()
        log.info(db.activity_log)
        if counter_failures:
            log.info(f"{table :<25} Count failed: {counter_failures}")
    print("time async: %.2f" % (time.time() - st))
    return results
Failed attempt with Queue: (blocks at await client.service)
loop = asyncio.get_event_loop()
counter = 0
results = []

async def payload_generator(db_variant: str, order_nrs: set):
    # code that generates the data for the request
    yield counter, row, payload

async def service_call_worker(queue, results):
    while True:
        counter, row, payload = await queue.get()
        results.append(await client.service.myServicename(
            MessageHeader=calculate_message_header(row=row),
            myPayload=payload,
            _soapheaders=[security_soap_header],
        ))
        print(colorama.Fore.BLUE + f'after result returned {counter}')
        # Here do the relevant processing of response or error
        queue.task_done()

async def main_with_q():
    n_workers = 3
    queue = asyncio.Queue(n_workers)
    e = pprint.pformat(queue)
    p = payload_generator(DB_VARIANT, order_list_from_args())
    results = []
    workers = [asyncio.create_task(service_call_worker(queue, results))
               for _ in range(n_workers)]
    async for c in p:
        await queue.put(c)
    await queue.join()  # wait for all tasks to be processed
    for worker in workers:
        worker.cancel()

if __name__ == '__main__':
    try:
        loop.run_until_complete(main_with_q())
        loop.run_until_complete(transport.session.close())
    finally:
        loop.close()
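Independent of zeep, here is a minimal sketch (Python 3.7+) of the bounded producer/consumer shape the failed attempt is aiming for; do_call is a stand-in for the awaitable SOAP call and rows stands for the database cursor or generator:

import asyncio

async def do_call(payload):
    # stand-in for the real awaitable request
    await asyncio.sleep(0.01)
    return {"ok": payload}

async def worker(queue, results):
    while True:
        sid, payload = await queue.get()
        try:
            results.append((sid, await do_call(payload)))
        except Exception as exc:             # per-request fault handling
            results.append((sid, exc))
        finally:
            queue.task_done()

async def main(rows, n_workers=5):
    queue = asyncio.Queue(n_workers)         # bounded: the producer waits while workers are busy
    results = []
    workers = [asyncio.create_task(worker(queue, results)) for _ in range(n_workers)]
    for sid, payload in rows:                # rows can be a plain generator over the DB cursor
        await queue.put((sid, payload))
    await queue.join()                       # every queued item has been processed
    for w in workers:
        w.cancel()
    await asyncio.gather(*workers, return_exceptions=True)
    return results

if __name__ == '__main__':
    print(asyncio.run(main((i, "payload-%d" % i) for i in range(20))))

The main difference from the failed attempt above is that each worker wraps the awaited call in try/except, so a single fault does not kill the worker, and the producer is an ordinary loop rather than an async generator.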

Grinder JDBC test script error: "The result of 'TestRunner()' is not callable"

I use a JDBC.py script to run performance testing. Grinder log info:
2015-10-14 18:42:40,132 ERROR com-0 thread-24: aborting thread - {}The result of 'TestRunner()' is not callable
net.grinder.scriptengine.jython.JythonScriptExecutionException: The result of 'TestRunner()' is not callable
at net.grinder.scriptengine.jython.JythonScriptEngine.createWorkerRunnable(JythonScriptEngine.java:183) ~[grinder-core-3.11.jar:na]
at net.grinder.engine.process.GrinderProcess$ThreadStarterImplementation$2.create(GrinderProcess.java:784) ~[grinder-core-3.11.jar:na]
at net.grinder.engine.process.GrinderThread.run(GrinderThread.java:90) ~[grinder-core-3.11.jar:na]
at java.lang.Thread.run(Thread.java:744) [na:1.7.0_45]
2015-10-14 18:42:40,132 ERROR com-0 thread-3: aborting thread - {}The result of 'TestRunner()' is not callable
net.grinder.scriptengine.jython.JythonScriptExecutionException: The result of 'TestRunner()' is not callable
at net.grinder.scriptengine.jython.JythonScriptEngine.createWorkerRunnable(JythonScriptEngine.java:183) ~[grinder-core-3.11.jar:na]
at net.grinder.engine.process.GrinderProcess$ThreadStarterImplementation$2.create(GrinderProcess.java:784) ~[grinder-core-3.11.jar:na]
at net.grinder.engine.process.GrinderThread.run(GrinderThread.java:90) ~[grinder-core-3.11.jar:na]
at java.lang.Thread.run(Thread.java:744) [na:1.7.0_45]
I modified the script, but the error remains. Please help me check it.
My test script:
# The sorting test supports a configurable array length.
# It runs the JavaTest.sort method of the JavaTest class.
from net.grinder.script.Grinder import grinder
from net.grinder.script import Test
from datetime import datetime
from datetime import timedelta
from java.sql import DriverManager
from oracle.jdbc import OracleDriver

########################################
#
# main body of test script starts here
#
########################################

# Get the properties to access test configuration information
properties = grinder.getProperties()

# The description is a property (instead of a hardcoded string in this script)
#test = Test(1, properties.get("javatest.description"))
test = Test(2, properties.get("javatest.description"))

# select the method for which to collect information
# test.record(WriteMulitpleLittleFile.write)

# initialize data for compressing
# fileName = properties.get("javatest.fileToCompress")
# grinder.logger.info("data file to compress is " + fileName)
# JavaTest.initializeCompression(fileName)

# If the run mode is runOnce, the TestRunner class will
# run once. Otherwise, if the run mode is continuous,
# the TestRunner class will run the test for at least
# the specified duration (but possibly longer)
runMode = properties.get("javatest.runMode")
#WriteMulitpleLittleFile.setParameters(dir, fileSize...)
if runMode == "continuous":
    # figure out how long to run the test
    m = int(properties.getProperty("javatest.durationMinutes", "0"))
    h = int(properties.getProperty("javatest.durationHours", "0"))
    d = int(properties.getProperty("javatest.durationDays", "0"))
    duration = timedelta(minutes=m, hours=h, days=d)
    grinder.logger.info("run mode is continuous, duration is " + str(duration))
elif runMode == "runOnce":
    grinder.logger.info("run mode is run once")
    duration = timedelta(minutes=0)
else:
    grinder.logger.info("run mode not set or not recognized, default to run once")
    duration = timedelta(minutes=0)

########################################
#
# The TestRunner class is used by The Grinder to perform the test
#
########################################
#test1 = Test(1, "Database insert")
test2 = Test(2, "Database query")

# Load the Oracle JDBC driver.
DriverManager.registerDriver(OracleDriver())

def getConnection():
    return DriverManager.getConnection(
        "jdbc:oracle:thin:#den00bvr.us.oracle.com:1521:orcl", "PBPUBLIC", "PBPUBLIC")

def ensureClosed(object):
    try: object.close()
    except: pass

# One time initialisation that cleans out old data.
connection = getConnection()
statement = connection.createStatement()
#try: statement.execute("drop table grinder_test1126")
#except: pass
#statement.execute("create table grinder_test1126(thread number, run number)")
ensureClosed(statement)
ensureClosed(connection)

class TestRunner:
    def __init__(self):
        # tid = grinder.threadNumber
        # if (grinder.threadNumber % 2 == 0):
        #     Even threadNumber: do insertStatement
        # else:
        #     Odd threadNumber: do queryStatement
        # def __call__(self):
        #     self.testRunner()
        endTime = datetime.now() + duration
        notDone = True
        while notDone:
            connection = None
            insertStatement = None
            queryStatement = None
            notDone = datetime.now() < endTime
            try:
                connection = getConnection()
                # insertStatement = connection.createStatement()
                queryStatement = connection.createStatement()
                # test1.record(insertStatement)
                # insertStatement.execute("insert into grinder_test1126 values(%d, %d)" %
                #                         (grinder.threadNumber, grinder.runNumber))
                test2.record(queryStatement)
                queryStatement.execute("select * from employee")
            finally:
                # ensureClosed(insertStatement)
                ensureClosed(queryStatement)
                ensureClosed(connection)
According to the documentation,
The TestRunner instance must be callable.
A Python object is callable if it defines a __call__ method. Each worker thread performs a number of runs of the test script, as configured by the property grinder.runs. For each run, the worker thread calls its TestRunner; thus the __call__ method can be thought of as the definition of a run.
Your TestRunner class needs a __call__ method in order to be classified as callable.
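A minimal sketch of how the loop body from the script above could move into __call__ (getConnection, ensureClosed and test2 are the helpers already defined in the script; the continuous-duration logic is left out for brevity):

class TestRunner:
    def __init__(self):
        # per-thread setup only; no test work here
        pass

    def __call__(self):
        # One run: open a connection, execute the recorded query, clean up.
        connection = None
        queryStatement = None
        try:
            connection = getConnection()
            queryStatement = connection.createStatement()
            test2.record(queryStatement)
            queryStatement.execute("select * from employee")
        finally:
            ensureClosed(queryStatement)
            ensureClosed(connection)

The Grinder then calls the instance once per run (grinder.runs) instead of the single long loop that currently sits inside __init__.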

Redis still fills up when results_ttl=0, Why?

Question: Why is redis filling up if the results of jobs are discarded immediately?
I'm using redis as a queue to create PDFs asynchronously and then save the result to my database. Since it's saved, I don't need to access the object at a later date, so I don't need to keep the result in Redis after it has been processed.
To keep the result from staying in redis I've set the TTL to 0:
parameter_dict = {
    "order": serializer.object,
    "photo": base64_image,
    "result_ttl": 0
}
django_rq.enqueue(procces_template, **parameter_dict)
The problem is that although the redis worker says the result is discarded immediately:
15:33:35 Job OK, result = John Doe's nail order to 568 Broadway
15:33:35 Result discarded immediately.
15:33:35
15:33:35 *** Listening on high, default, low...
Redis still fills up and throws:
ResponseError: command not allowed when used memory > 'maxmemory'
Is there another parameter that I need to set in redis / django-rq to keep redis from filling up when the job result is already not being stored?
Update:
Following this post I expect the memory might be filling up because of the failed jobs in redis.
Using this code snippet:
def print_redis_failed_queue():
    q = django_rq.get_failed_queue()
    while True:
        job = q.dequeue()
        if not job:
            break
        print job
here is a pastebin of a dump of the keys in redis:
http://pastebin.com/Bc4bRyRR
It's too long to be practical to post here, and its size seems to support my theory. But using:
def delete_redis_failed_queue():
    q = django_rq.get_failed_queue()
    count = 0
    while True:
        job = q.dequeue()
        if not job:
            print "{} Jobs deleted.".format(count)
            break
        job.delete()
        count += 1
doesn't clear redis like I expect. How can I get a more accurate dump of the keys in redis? Am I clearing the jobs correctly?
It turns out Redis was filling up because of orphaned jobs, i.e. jobs that were not assigned to a particular queue.
Although the cause of the orphaned jobs is unknown, the problem is solved with this snippet:
import redis
from rq.queue import Queue, get_failed_queue
from rq.job import Job

redis_conn = redis.Redis()
for i, key in enumerate(redis_conn.keys('rq:job:*')):
    job_number = key.split("rq:job:")[1]
    job = Job.fetch(job_number, connection=redis_conn)
    job.delete()
In my particular situation, calling this snippet (actually the delete_orphaned_jobs() method below) after the completion of each job ensured that Redis would not fill up, and that orphaned jobs would be taken care of. For more details on the issue, here's a link to the conversation in the opened django-rq issue.
In the process of diagnosing this issue, I also created a utility class for inspecting and deleting jobs / orphaned jobs with ease:
class RedisTools:
    '''
    A set of utility tools for interacting with a redis cache
    '''
    def __init__(self):
        self._queues = ["default", "high", "low", "failed"]
        self.get_redis_connection()

    def get_redis_connection(self):
        redis_url = os.getenv('REDISTOGO_URL', 'redis://localhost:6379')
        self.redis = redis.from_url(redis_url)

    def get_queues(self):
        return self._queues

    def get_queue_count(self, queue):
        return Queue(name=queue, connection=self.redis).count

    def msg_print_log(self, msg):
        print msg
        logger.info(msg)

    def get_key_count(self):
        return len(self.redis.keys('rq:job:*'))

    def get_queue_job_counts(self):
        queues = self.get_queues()
        queue_counts = [self.get_queue_count(queue) for queue in queues]
        return zip(queues, queue_counts)

    def has_orphanes(self):
        job_count = sum([count[1] for count in self.get_queue_job_counts()])
        return job_count < self.get_key_count()

    def print_failed_jobs(self):
        q = django_rq.get_failed_queue()
        while True:
            job = q.dequeue()
            if not job:
                break
            print job

    def print_job_counts(self):
        for queue in self.get_queue_job_counts():
            print "{:.<20}{}".format(queue[0], queue[1])
        print "{:.<20}{}".format('Redis Keys:', self.get_key_count())

    def delete_failed_jobs(self):
        q = django_rq.get_failed_queue()
        count = 0
        while True:
            job = q.dequeue()
            if not job:
                self.msg_print_log("{} Jobs deleted.".format(count))
                break
            job.delete()
            count += 1

    def delete_orphaned_jobs(self):
        if not self.has_orphanes():
            return self.msg_print_log("No orphan jobs to delete.")
        for i, key in enumerate(self.redis.keys('rq:job:*')):
            job_number = key.split("rq:job:")[1]
            job = Job.fetch(job_number, connection=self.redis)
            job.delete()
            self.msg_print_log("[{}] Deleted job {}.".format(i, job_number))
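Typical usage after a batch of jobs looks like this (a sketch):

tools = RedisTools()
tools.print_job_counts()        # queue counts vs. total rq:job:* keys
tools.delete_orphaned_jobs()    # no-op when the queue counts match the key count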
You can use the "Black Hole" exception handler from http://python-rq.org/docs/exceptions/ with job.cancel():
def black_hole(job, *exc_info):
    # Delete the job hash on redis, otherwise it will stay on the queue forever
    job.cancel()
    return False
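To install the handler you pass it to the worker; in reasonably recent versions of rq that looks like this (a sketch):

from redis import Redis
from rq import Queue, Worker

conn = Redis()
queue = Queue('default', connection=conn)
worker = Worker([queue], connection=conn, exception_handlers=[black_hole])
worker.work()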
