Stop the running instances when max_instances is reached - python

I'm using apscheduler-django and I created a task that loops every 10 seconds.
This function will make a request to an API and save the content to my database (PostgreSQL).
This is my task:
scheduler.add_job(
    SaveAPI,
    trigger=CronTrigger(second="*/10"),
    id="SaveAPI",
    max_instances=1,
    replace_existing=True,
)
and my SaveAPI is:
def SaveAPI():
    SPORT = 3
    print('API Queue Started')
    AllMatches = GetAllMatches(SPORT)
    for Match in AllMatches:
        AddToDatabase(Match, SPORT)
    print(f'API Queue Ended')
The GetAllMatches and AddToDatabase are too big and I don't think the implementations are relevant to my question.
My problem is that sometimes I get this error:
Run time of job "SaveAPI (trigger: cron[second='*/10'], next run at: 2022-03-05 23:21:00 +0330)" was missed by 0:00:11.445357
When this happens, the job does not get replaced with a new instance, because my SaveAPI function never ends, and apscheduler keeps missing the new runs.
I did many tests, and the function itself does not have any problem.
How can I make apscheduler stop the last running instance if a new instance is about to be missed?
So if the last instance takes more than 10 seconds, I want to just terminate it and create a new one.

apscheduler and apscheduler-django don't directly support that.
You can implement and use a custom executor that tracks the process running each job and kills that process if a new run of the same job is submitted while the previous one is still running.
Here's a MaxInstancesCancelEarliestProcessPoolExecutor that uses pebble.ProcessPool.
# Imports assume apscheduler 3.x and pebble; adjust paths to your project.
import threading
from collections import defaultdict
from concurrent.futures import CancelledError, TimeoutError
from concurrent.futures.process import BrokenProcessPool

from pebble import ProcessPool
from apscheduler.executors.base import MaxInstancesReachedError, run_job
from apscheduler.executors.pool import BasePoolExecutor


class MaxInstancesCancelEarliestProcessPoolExecutor(BasePoolExecutor):
    def __init__(self):
        pool = ProcessPool()
        pool.submit = lambda function, *args: pool.schedule(function, args=args)
        super().__init__(pool)
        self._futures = defaultdict(list)

    def submit_job(self, job, run_times):
        assert self._lock is not None, 'This executor has not been started yet'
        with self._lock:
            if self._instances[job.id] >= job.max_instances:
                f = self._futures[job.id][0]                      # +
                f.cancel()                                        # +
                try:                                              # +
                    self._pool._pool_manager.update_status()      # +
                except RuntimeError:                              # +
                    pass                                          # +
                if self._instances[job.id] >= job.max_instances:  # +
                    raise MaxInstancesReachedError(job)
            self._do_submit_job(job, run_times)
            self._instances[job.id] += 1

    def _do_submit_job(self, job, run_times):
        def callback(f):
            with self._lock:                                      # +
                self._futures[job.id].remove(f)                   # +
            try:                                                  # +
                exc, tb = (f.exception_info() if hasattr(f, 'exception_info') else
                           (f.exception(), getattr(f.exception(), '__traceback__', None)))
            except CancelledError:                                # +
                exc, tb = TimeoutError(), None                    # +
            if exc:
                self._run_job_error(job.id, exc, tb)
            else:
                self._run_job_success(job.id, f.result())

        try:
            f = self._pool.submit(run_job, job, job._jobstore_alias, run_times, self._logger.name)
        except BrokenProcessPool:
            self._logger.warning('Process pool is broken; replacing pool with a fresh instance')
            self._pool = self._pool.__class__(self._pool._max_workers)
            f = self._pool.submit(run_job, job, job._jobstore_alias, run_times, self._logger.name)

        f.add_done_callback(callback)
        self._futures[job.id].append(f)                           # +

    def shutdown(self, wait=True):
        if wait:
            self._pool.close()
            self._pool.join()
        else:
            self._pool.close()
            threading.Thread(target=self._pool.join).start()
Usage:
scheduler.add_executor(MaxInstancesCancelEarliestProcessPoolExecutor(), alias='max_instances_cancel_earliest')

scheduler.add_job(
    SaveAPI,
    trigger=CronTrigger(second="*/10"),
    id="SaveAPI",
    max_instances=1,
    executor='max_instances_cancel_earliest',  # +
    replace_existing=True,
)

Related

how to write python concurrent future and retry correctly? Tableau API

I am a newbie to concurrent.futures. I am trying to refresh 3 workbooks in Tableau:
workbook_dict = {"workbook1": "workbook_id1", "workbook2": "workbook_id2", "workbook3": "workbook_id3"}

# @retry(tries=3, delay=5, backoff=0.2)
def refresh_workbooks(self, workbook_dict):
    for workbook_name, workbook_id in workbook_dict.items():
        workbook = self.server.workbooks.get_by_id(workbook_id)
        # refresh will fire up the refresh and return a job_id object
        job_id = self.server.workbooks.refresh(workbook)
        # wait_for_job will check the status and raise an exception if it fails or times out
        # https://tableau.github.io/server-client-python/docs/api-ref#jobswait_for_job
        self.server.jobs.wait_for_job(job_id, timeout=1000)
This base code works, but each workbook takes about 15 minutes, so the whole run takes 45 minutes to complete. And if it fails on the second workbook, it starts over from scratch.
I want to use concurrent.futures to speed this up and to check wait_for_job; if any workbook fails, I want to refresh only that workbook, retrying a few times before raising errors.
First question: the code below is my attempt at using concurrent.futures, but the printed data object is None.
I think the code failed to execute wait_for_job.
Why is that?
import concurrent.futures
import urllib.request
import tableauserverclient as TSC

def refresh_workbook(self, workbook_id):
    workbook = self.server.workbooks.get_by_id(workbook_id)
    job_id = self.server.workbooks.refresh(workbook)
    return job_id

def refresh_workbooks(self, workbook_dict):
    job_dict = {}
    try:
        for workbook_name, workbook_id in workbook_dict.items():
            workbook = self.server.workbooks.get_by_id(workbook_id)
            job_id = self.server.workbooks.refresh(workbook)
            job_dict[workbook_name] = job_id.id
        print(job_dict)
    except:
        raise
    return job_dict

def wait_workbook(self, job_id, timeout=None):
    self.server.jobs.wait_for_job(job_id, timeout=timeout)

test = TableauServerConnection()
workbook_dict = {"workbook1": "workbook_id1", "workbook2": "workbook_id2", "workbook3": "workbook_id3"}
jobs = test.refresh_workbooks(workbook_dict)

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    future_to_job = {executor.submit(test.wait_workbook, job_id, 1800): job_id
                     for workbook_name, job_id in jobs.items()}
    for future in concurrent.futures.as_completed(future_to_job):
        job = future_to_job[future]
        try:
            data = future.result()
            # Why did I get None in data?
            print('data', data)
        except Exception as exc:
            # Can I spin up a new future here, and what is the correct syntax?
            print('%s generated an exception: %s' % (job, exc))
        else:
            print('job', job)
Secondly, if I add a retry in the exception handler, can I add a new future object there?
You might want to try something like this.
I'm using a while loop and collecting the latest results with concurrent.futures.wait; then, for each finished task, I can add a new task. In your case, do that only when a task fails.
This also allows you not to submit all the tasks at the beginning. Depending on the number of tasks you want to run, it might be better to add X tasks at the beginning (where X > max_workers) and then add one more each time a task finishes.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    future_to_job = {executor.submit(test.wait_workbook, job_id, 1800): job_id
                     for workbook_name, job_id in jobs.items()}
    while future_to_job:
        done, _ = concurrent.futures.wait(
            future_to_job, timeout=0.25,
            return_when=concurrent.futures.FIRST_COMPLETED)
        for future in done:
            # get the job id and clean up the finished task
            job_id = future_to_job.pop(future)
            try:
                data = future.result()
                print('data', data)
            except Exception as exc:
                print('%s generated an exception: %s' % (job_id, exc))
                # the task failed: add a new one for the same job
                future_to_job[executor.submit(test.wait_workbook, job_id, 1800)] = job_id
            else:
                print('job', job_id)
For the None part, your wait_workbook function should return the result
def wait_workbook(self, job_id, timeout=None):
    return self.server.jobs.wait_for_job(job_id, timeout=timeout)
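If you also want to cap the number of retries per workbook (your second question), one way, sketched here with illustrative names (MAX_RETRIES and retries are not part of the Tableau client), is to keep a counter per job_id and only resubmit while it is below the limit:

MAX_RETRIES = 3
retries = {job_id: 0 for job_id in jobs.values()}

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    future_to_job = {executor.submit(test.wait_workbook, job_id, 1800): job_id
                     for job_id in jobs.values()}
    while future_to_job:
        done, _ = concurrent.futures.wait(
            future_to_job, timeout=0.25,
            return_when=concurrent.futures.FIRST_COMPLETED)
        for future in done:
            job_id = future_to_job.pop(future)
            try:
                future.result()
                print('job finished', job_id)
            except Exception as exc:
                retries[job_id] += 1
                if retries[job_id] <= MAX_RETRIES:
                    # resubmit only the failed wait; you may also want to re-trigger the refresh here
                    future_to_job[executor.submit(test.wait_workbook, job_id, 1800)] = job_id
                else:
                    print('%s failed after %d retries: %s' % (job_id, MAX_RETRIES, exc))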
hope it helps :)

Log process output to separate log file

I have the following parallel_executor.py module which I use to run several processes simultaneously:
import time
from multiprocessing import Process

class ParallelExecutor(object):
    def __init__(self, pool_size=10):
        self._pool_size = pool_size
        self._processes = []
        self._results = []

    def add_task(self, target, args=None, kwargs=None):
        args = [] if not args else args
        kwargs = {} if not kwargs else kwargs
        index = len(self._processes)
        process_args = (index, target, args, kwargs)
        process = Process(target=self._executor, args=process_args)
        self._processes.append(process)
        result = {'result': None, 'end_time': 0, 'completed': False}
        self._results.append(result)
        return index

    def run(self, block=True):
        if not block:
            for process in self._processes:
                process.start()
            return None
        else:
            counter = 0
            processes = []
            for process in self._processes:
                processes.append(process)
                process.start()
                if counter >= self._pool_size:
                    # Wait for completion and reset counters.
                    for i in range(len(processes)):
                        processes[i].join()
                    processes = []
                    counter = 0
                    continue
                counter += 1
            # Wait for the left over processes to complete.
            if len(processes) > 0:
                for i in range(len(processes)):
                    processes[i].join()
            return self._results

    def _executor(self, index, target, args, kwargs):
        try:
            self._results[index]['result'] = target(*args, **kwargs)
            self._results[index]['end_time'] = int(round((time.time())))
            self._results[index]['completed'] = True
        except Exception as exc:
            self._results[index]['exception'] = exc
            self._results[index]['completed'] = True
            raise
And I use it as follows (example.py):
from framework.lib.parallel_executor import ParallelExecutor
import time
import os

def foo(x):
    for i in range(3):
        print x
        time.sleep(0.5)
    return 123

def main():
    runner = ParallelExecutor()
    runner.add_task(foo, ["This"])
    runner.add_task(foo, ["is"])
    runner.add_task(foo, ["a"])
    runner.add_task(foo, ["test"])
    runner.run()
    runner.wait_for_executor_to_finish()
    for i in runner.get_results():
        print i

main()
My question is: how do I print the process ID with every statement of 'foo' that is printed to the output, by making changes only to the parallel_executor.py module and not touching example.py, so that later I can grep the output of a particular process?
You can't do it without modifying the example at all, but you can achieve what you want with a very small modification.
Using the Python logging facilities, you can set a default log format, ensuring every log line respects your standard.
In the parallel_executor.py add the following:
import logging
log_format = "%(process)d: %(message)s"
logging.basicConfig(level=logging.INFO, format=log_format)
In the example replace the line:
print x
with:
logging.info(x)
And you will see your messages appearing as:
34321: message content here
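For reference, here is a minimal, self-contained sketch (separate from the ParallelExecutor class) showing the effect of that format string; each child process prefixes its own PID to every line it logs:

import logging
from multiprocessing import Process

logging.basicConfig(level=logging.INFO, format="%(process)d: %(message)s")

def foo(x):
    for _ in range(3):
        logging.info(x)  # each line comes out prefixed with the PID of the logging process

if __name__ == '__main__':
    procs = [Process(target=foo, args=(word,)) for word in ("This", "is", "a", "test")]
    for p in procs:
        p.start()
    for p in procs:
        p.join()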

Redis still fills up when results_ttl=0, Why?

Question: Why is redis filling up if the results of jobs are discarded immediately?
I'm using Redis as a queue to create PDFs asynchronously and then save the result to my database. Since it's saved, I don't need to access the object at a later date, so I don't need to keep the result in Redis after it has been processed.
To keep the result from staying in Redis, I've set the TTL to 0:
parameter_dict = {
    "order": serializer.object,
    "photo": base64_image,
    "result_ttl": 0
}
django_rq.enqueue(procces_template, **parameter_dict)
The problem is that although the RQ worker says the result is discarded immediately:
15:33:35 Job OK, result = John Doe's nail order to 568 Broadway
15:33:35 Result discarded immediately.
15:33:35
15:33:35 *** Listening on high, default, low...
Redis still fills up and throws:
ResponseError: command not allowed when used memory > 'maxmemory'
Is there another parameter that I need to set in redis / django-rq to keep Redis from filling up if the job result is not being stored?
Update:
Following this post, I suspect the memory might be filling up because of failed jobs in Redis.
Using this code snippet:
def print_redis_failed_queue():
    q = django_rq.get_failed_queue()
    while True:
        job = q.dequeue()
        if not job:
            break
        print job
Here is a pastebin of a dump of the keys in Redis:
http://pastebin.com/Bc4bRyRR
It's too long to be practical to post here. Its size seems to support my theory. But using:
def delete_redis_failed_queue():
    q = django_rq.get_failed_queue()
    count = 0
    while True:
        job = q.dequeue()
        if not job:
            print "{} Jobs deleted.".format(count)
            break
        job.delete()
        count += 1
doesn't clear Redis like I expect. How can I get a more accurate dump of the keys in Redis? Am I clearing the jobs correctly?
It turns out Redis was filling up because of orphaned jobs, i.e. jobs that were not assigned to a particular queue.
Although the cause of the orphaned jobs is unknown, the problem is solved with this snippet:
import redis
from rq.queue import Queue, get_failed_queue
from rq.job import Job

redis_conn = redis.Redis()
for i, key in enumerate(redis_conn.keys('rq:job:*')):
    job_number = key.split("rq:job:")[1]
    job = Job.fetch(job_number, connection=redis_conn)
    job.delete()
In my particular situation, calling this snippet (actually the delete_orphaned_jobs() method below) after the completion of each job ensured that Redis would not fill up and that orphaned jobs would be taken care of. For more details on the issue, here's a link to the conversation in the open django-rq issue.
In the process of diagnosing this issue, I also created a utility class for inspecting and deleting jobs / orphaned jobs with ease:
import os
import logging

import redis
import django_rq
from rq.queue import Queue
from rq.job import Job

logger = logging.getLogger(__name__)


class RedisTools:
    '''
    A set of utility tools for interacting with a redis cache
    '''
    def __init__(self):
        self._queues = ["default", "high", "low", "failed"]
        self.get_redis_connection()

    def get_redis_connection(self):
        redis_url = os.getenv('REDISTOGO_URL', 'redis://localhost:6379')
        self.redis = redis.from_url(redis_url)

    def get_queues(self):
        return self._queues

    def get_queue_count(self, queue):
        return Queue(name=queue, connection=self.redis).count

    def msg_print_log(self, msg):
        print msg
        logger.info(msg)

    def get_key_count(self):
        return len(self.redis.keys('rq:job:*'))

    def get_queue_job_counts(self):
        queues = self.get_queues()
        queue_counts = [self.get_queue_count(queue) for queue in queues]
        return zip(queues, queue_counts)

    def has_orphanes(self):
        job_count = sum([count[1] for count in self.get_queue_job_counts()])
        return job_count < self.get_key_count()

    def print_failed_jobs(self):
        q = django_rq.get_failed_queue()
        while True:
            job = q.dequeue()
            if not job:
                break
            print job

    def print_job_counts(self):
        for queue in self.get_queue_job_counts():
            print "{:.<20}{}".format(queue[0], queue[1])
        print "{:.<20}{}".format('Redis Keys:', self.get_key_count())

    def delete_failed_jobs(self):
        q = django_rq.get_failed_queue()
        count = 0
        while True:
            job = q.dequeue()
            if not job:
                self.msg_print_log("{} Jobs deleted.".format(count))
                break
            job.delete()
            count += 1

    def delete_orphaned_jobs(self):
        if not self.has_orphanes():
            return self.msg_print_log("No orphan jobs to delete.")
        for i, key in enumerate(self.redis.keys('rq:job:*')):
            job_number = key.split("rq:job:")[1]
            job = Job.fetch(job_number, connection=self.redis)
            job.delete()
            self.msg_print_log("[{}] Deleted job {}.".format(i, job_number))
You can use the "Black Hole" exception handler from http://python-rq.org/docs/exceptions/ with job.cancel():
def black_hole(job, *exc_info):
    # Delete the job hash on redis, otherwise it will stay on the queue forever
    job.cancel()
    return False
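The handler then has to be registered on the worker. A minimal sketch based on the RQ docs linked above (depending on your RQ version the keyword may be exception_handlers or exc_handler, and with django-rq you would normally start workers through its own management command):

from redis import Redis
from rq import Queue, Worker

redis_conn = Redis()
queues = [Queue(name, connection=redis_conn) for name in ('high', 'default', 'low')]
worker = Worker(queues, connection=redis_conn, exception_handlers=[black_hole])
worker.work()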

wxPython app (Not Responding) temporarily

A file selection triggers the code below.
This runs a loop pulsing the progress bar while a thread runs a few DELETE requests.
Halfway through the requests the window shows (Not Responding); this goes away once the thread finishes and everything goes back to normal.
What am I doing wrong, and how can I get rid of the (Not Responding) issue?
It's been driving me mental for hours trying to find the cause.
I think it's related to the requests but I cannot figure out how.
def Process(self, event):
    self._thread_end = False
    self._process_success = False
    threading.Thread(target=self.ProcessThread).start()
    while not self._thread_end:
        time.sleep(0.05)
        self._gauge.Pulse()
    self._gauge.SetValue(0)
    self.PageResult(self._process_success)

def ProcessThread(self):
    try:
        csv_file = self._csv_dirname + '/' + self._csv_filename
        if os.path.isfile(csv_file):
            csv_contents = open(csv_file).read().replace('\r\n', '\n')
            csv_list = filter(None, csv_contents.split('\n'))
            for entry in csv_list:
                http = urllib2.Request(self._logged_url + '/api/' + entry)
                http.get_method = lambda: 'DELETE'
                result = urllib2.urlopen(http)
            self._process_success = True
    except:
        self._process_success = False
        exc_type, exc_value, exc_traceback = sys.exc_info()
        ExceptCatch(exc_type.__name__, exc_value, exc_traceback, threading.current_thread().name)
    self._thread_end = True

This Non-Threaded script unexpectedly runs faster than the Threaded version

I have a python script which validates data fetched from some rows in a database and then logs the errors in a different table in the same database.
The script validates each row and marks it as validated and has_error = True/False depending on the validation outcome. This process is repeated for each row. With that, I thought I'd add some steroids by creating threads, so that the validation of each row is done by an independent thread, reducing the time it takes to validate a batch of rows.
To my surprise, I find that the threaded script takes slightly longer than the non-threaded one. On average, to validate 1502 rows of data, the non-threaded script takes 1.5 seconds while the threaded script takes 2.27 seconds. That might not be much, but ideally I'll be running through 2 million records at a go, so the time overhead will be significant. Plus, I would assume that threaded apps would finish faster! :-)
The two scripts clock the same time of about 0.01 seconds up to the point of creating threads. By this point the SQLAlchemy session is created and all the data to be validated and relations, i.e. foreign keys etc., are fetched. From there, though, the non-threaded script finishes faster. Below is my code.
1.0 Non-Threaded Script
# A lot of code goes above this to fetch the data that is passed on to the validator function.
# However, the two scripts are the same up to this point in regards to time taken, so I didn't see a need to post it.
for lf_detail_id in load_file_detail_id:
    params = lf_detail_id, load_file_id, entry_number[lf_detail_counter], \
             data[lf_detail_counter], template_version[lf_counter], \
             load_file_detail, error, dt_file, dt_columns
    data_list.append(params)
    lf_detail_counter += 1
    no_of_records += 1

validator = Validate()
validator.validator(no_of_records, data_list)
record_counter += lf_detail_counter
data_list = None
no_of_records = 0
print("Validated '%s': seconds %s" % (filename[lf_counter], time.time() - file_start_time))  # print time it took to run

# Mark the load file as validated
is_done = load_file.set_validation(load_file_id, True)
if is_done == False:
    raise Exception("Can't update load_file's is_validated parameter: ", lf_detail_id)

# Reset counters
lf_detail_counter = 0
lf_counter += 1

# Commit the entire transaction.
session.commit()
print("NoThread: Finished validating %s file(s) with %s record(s) in %s seconds\n" % (lf_counter, record_counter, time.time() - process_start_time))
1.1. Validation Function for Non-Threaded Script
class Validate():
    has_error = None

    def validator(self, loop_length, job):
        '''Validate data'''
        for row_counter in range(loop_length):
            load_file_detail_id, load_file_id, entry_number, data, \
                template_version, load_file_detail, error, dt_file, dt_columns = job[row_counter]
            error_detail = ErrorLogDetail()
            if data.strip() == "":
                error_detail.errorlog = error
                error_detail.load_file_detail_id = load_file_detail_id
                error_detail.pos_row = entry_number
                error_detail.pos_col = None
                error_detail.value_provided = None
                error_detail.column_name = None
                error_detail.value_provided = None
                error_detail.description = "error message 1"
                session.add(error_detail)
                error_detail = ErrorLogDetail()
                self.has_error = True
                self.set_validation(load_file_detail, load_file_detail_id, True, False)
                continue
            elif len(data) != int(dt_file.data_length):
                error_detail.errorlog = error
                error_detail.load_file_detail_id = load_file_detail_id
                error_detail.pos_row = entry_number
                error_detail.pos_col = None
                error_detail.column_name = None
                error_detail.value_provided = None
                error_detail.description = "error message 2"
                session.add(error_detail)
                error_detail = ErrorLogDetail()
                self.has_error = True
                self.set_validation(load_file_detail, load_file_detail_id, True, False)
                continue
            else:
                # Continue with extra validation
                # If the record passes all validation then mark it as has_error = False
                if self.has_error == False:
                    self.set_validation(load_file_detail, load_file_detail_id, False, True)
                else:
                    self.has_error = False
        jobs.task_done()  # For the script with threading the job is marked as done; this line does not appear in the non-threaded script
2.0 Threaded Script
# A lot of code goes above this to fetch the data that is passed on to the validator function.
# However, the two scripts are the same up to this point in regards to time taken, so I didn't see a need to post it.
for lf_detail_id in load_file_detail_id:
    params = lf_detail_id, load_file_id, entry_number[lf_detail_counter], \
             data[lf_detail_counter], template_version[lf_counter], \
             load_file_detail, error, dt_file, dt_columns
    data_list.append(params)
    lf_detail_counter += 1
    queue_size += 1
    if queue_size == THREAD_LIMIT:
        myqueuing(queue_size, data_list)
        queue_size = 0

# Spawn a pool of threads for any leftover rows, and pass them the queue instance
if queue_size > 0:
    myqueuing(queue_size, data_list)

# Keep record of rows processed
record_counter += lf_detail_counter
print("Validated '%s': seconds- %s " % (filename[lf_counter], time.time() - file_start_time))  # print time it took to run

# Mark the load file as validated
is_done = load_file.set_validation(load_file_id, True)
if is_done == False:
    raise Exception("Can't update load_file's is_validated parameter: ", lf_detail_id)

# Commit the entire transaction.
session.commit()

# Reset counters
lf_detail_counter = 0
lf_counter += 1
data_list = None
queue_size = 0
print("HasThread: Finished loading %s file(s) with %s record(s) in %s seconds\n" % (lf_counter, record_counter, time.time() - process_start_time))  # print time it took to run
2.1. Threaded Validation Function
THREAD_LIMIT = 50                # This is how many threads we want
jobs = queue.Queue()             # This sets up the queue object
singlelock = threading.Lock()    # This is a lock so threads don't print through each other (and other reasons)

def myqueuing(queuesize, data):
    '''Put the fetched data in a queue and instantiate threads to
    process the queue'''
    # Spawn the threads
    is_valid_date("20131212", True)  # Calling this here to avoid a bug in time.strptime() when threading
    for x in range(queuesize):
        # This is the thread class that we instantiate.
        workerbee().start()
    # Put stuff in queue
    for i in range(queuesize):
        # Block if queue is full, and wait 2 seconds. After 2s raise Queue Full error.
        try:
            jobs.put(data[i], block=True, timeout=2)
        except:
            singlelock.acquire()
            print("The queue is full !")
            singlelock.release()
    # Wait for the threads to finish
    singlelock.acquire()   # Acquire the lock so we can print
    print("Waiting for threads to finish.")
    singlelock.release()   # Release the lock
    jobs.join()            # This command waits for all threads to finish.

class workerbee(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        self.lock = threading.Lock()
        self.has_error = False

    def run(self):
        #try:
        job = jobs.get(True, 1)
        load_file_detail_id, load_file_id, entry_number, data, \
            template_version, load_file_detail, error, dt_file, dt_columns = job
        '''Validates the data.'''
        error_detail = ErrorLogDetail()
        # Again please note that this part is identical for both the non-threaded and the threaded script.
        # After each pass on a record, the record is marked as validated and whether has_error = True
        if data.strip() == "":
            error_detail.errorlog = error
            error_detail.load_file_detail_id = load_file_detail_id
            error_detail.pos_row = entry_number
            error_detail.pos_col = None
            error_detail.value_provided = None
            error_detail.column_name = None
            error_detail.value_provided = None
            error_detail.description = "error message 1"
            session.add(error_detail)
            error_detail = ErrorLogDetail()
            self.has_error = True
            self.set_validation(load_file_detail, load_file_detail_id, True, True)
        elif len(data) != int(dt_file.data_length):
            error_detail.errorlog = error
            error_detail.load_file_detail_id = load_file_detail_id
            error_detail.pos_row = entry_number
            error_detail.pos_col = None
            error_detail.column_name = None
            error_detail.value_provided = None
            error_detail.description = "error message 2"
            session.add(error_detail)
            error_detail = ErrorLogDetail()
            self.has_error = True
            self.set_validation(load_file_detail, load_file_detail_id, True, True)
        else:
            # Continue with further validation - about 5 other validation checks
            # If the record passes all validation then mark it as has_error = False
            if self.has_error == False:
                self.set_validation(load_file_detail, load_file_detail_id, False, True)
            else:
                self.has_error = False
        jobs.task_done()  # For the script with threading the job is marked as done; this line does not appear in the non-threaded script
3.0. Common function for setting validation in both threaded and non-threaded
def set_validation(self, load_file_detail, load_file_detail_id, has_error, can_be_loaded):
    '''Mark the record as having been validated and whether has_error = True or False'''
    #print("haserror and canbeloaded ", has_error, can_be_loaded)
    is_done = load_file_detail.set_validation_and_error(load_file_detail_id, True, has_error, can_be_loaded)
    if is_done == False:
        raise Exception("Can't update load_file_detail's is_validated parameter: ", load_file_detail_id)
3.1. Actual SQLAlchemy session for saving the validation status
def set_validation_and_error(self, load_file_detail_id, is_validated, has_error, can_be_loaded):
    result = session.execute('UPDATE load_file_detail SET is_validated=%s, has_error=%s, can_be_loaded=%s WHERE id=%s' \
                             % (is_validated, has_error, can_be_loaded, load_file_detail_id))
So, the fetching of data to be validated is the same, and both scripts take the same amount of time up to that point. The validation process is the same for both scripts, and saving to the DB is the same, i.e. sections 3.0 and 3.1 are shared by both scripts. The only difference is validating with multiple threads. So I'm thinking: maybe there is something about multiple threads and SQLAlchemy that makes the app slower in threaded mode? Have I implemented the threaded function properly? Or is threading simply not suitable in this scenario? Suggestions welcome.
You should create a Queue for the results/logging and add a dedicated "logger" (collector) thread; that way you can remove the locks, and the code should be faster.
Also create a DB connection in each thread, so you can fetch data in parallel.
Threads only parallelize C-library calls because of the GIL.
To parallelize Python code you must use multiprocessing.
I wrote a test for you, describing how to process an iterable:
def produce_data(amount=100000, invalid=1, valid=10):
    # produce_data = sql('request').getall()
    import random
    id = 0
    data = [True]*valid + [False]*invalid
    while id < amount:
        id += 1
        yield (id, random.choice(data))

def validate(row):
    if row[1]:
        time.sleep(0.001)  # valid sql request emulation.
        return True
    else:
        time.sleep(0.001)  # invalid sql request emulation.
        return False

def single():
    for row in produce_data():
        validate(row)

def targeted():
    import threading
    for row in produce_data():
        threading.Thread(target=validate, args=(row,))

Uley = 50

class Bee(object):
    error = False
    running = True

    def __init__(self, queue, *args, **kwargs):
        self.queue = queue  # don't use any global variable!
        # every bee must have a unique db connection and session;
        # initialize it here.
        #self.session = db.connection().session()
        return super(Bee, self).__init__(*args, **kwargs)

    def run(self):
        while self.running:
            data = self.queue.get()
            if data:
                self.error = validate(data)  # refactor it to self.validate(data) to be able to get a cursor from self.session.
                self.queue.task_done()
            else:
                self.queue.task_done()
                break
        #self.session.commit()

def treaded():
    import threading, Queue
    class TreadedBee(Bee, threading.Thread): pass

    q = Queue.Queue()
    for i in range(Uley):  # bees started before data was provided.
        bee = TreadedBee(q)
        bee.daemon = True
        bee.start()
    for row in produce_data():  # you don't need to fetch all data before processing; this could be a cursor over the response.
        q.put(row)
    q.join()
    for i in range(Uley):
        q.put(None)

def forked():
    from multiprocessing import Process, JoinableQueue
    class ForkedBee(Bee, Process): pass

    q = JoinableQueue()
    for i in range(Uley):
        bee = ForkedBee(q)
        bee.start()
    for row in produce_data():
        q.put(row)
    q.join()
    # at this point you need to kill the zomBees -)
    for i in range(Uley):
        q.put(None)
    q.close()

def pool():
    from multiprocessing import Pool
    pool = Pool(processes=Uley)
    pool.map(validate, produce_data())

if __name__ == "__main__":
    import time
    s = time.time()
    single()
    print(time.time() - s)  # 109
    s = time.time()
    single()
    print(time.time() - s)  # 6
    s = time.time()
    treaded()
    print(time.time() - s)  # 12
    s = time.time()
    forked()
    print(time.time() - s)  # 6
    s = time.time()
    pool()
    print(time.time() - s)  # 4
test result:
$ python2 tgreads.py
109.779700994
5.84457302094
12.3814198971
5.97618508339
3.69856286049
targeted will flood CPU and memory, and you can't provide individual connections to the DB; using a shared connection is not safe. If you want to go that way, you need to provide an output queue and implement a collector that communicates with the DB (a sketch follows below). pool is the shortest code and the fastest, but it is not friendly to initializing per-worker connections.
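A rough sketch of that output-queue/collector idea, with illustrative names (results_q, collector) and a placeholder where the real DB write would go:

import threading, Queue

results_q = Queue.Queue()

def collector():
    # The only thread that talks to the DB: it owns the session and drains the results queue.
    # session = db.connection().session()   # placeholder
    while True:
        item = results_q.get()
        if item is None:                     # sentinel: no more results
            results_q.task_done()
            break
        row_id, is_valid = item
        # session.execute(...)               # placeholder for the UPDATE / error-log insert
        results_q.task_done()
    # session.commit()

def worker(row):
    results_q.put((row[0], validate(row)))   # workers never touch the DB directly

collector_thread = threading.Thread(target=collector)
collector_thread.start()

for row in produce_data(amount=1000):
    worker(row)            # or hand rows to the worker threads/processes shown above

results_q.put(None)        # tell the collector to stop
results_q.join()
collector_thread.join()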
