ThreadPool from python's multiprocessing hangs out - python

I have a phantom problem with one of my unit tests.
I use a ThreadPool from the multiprocessing package to wrap the stdout and stderr functions of my class, which uses paramiko. During development I ran some real-life tests using the code below and it worked nicely. But while writing a unit test for that code I ran into a problem: this usage of ThreadPool hangs.
This part hangs about 95 percent of the time and only occasionally executes properly.
while not (self.__stdout_async_r.ready() and self.__stderr_async_r.ready()):
time.sleep(WAIT_FOR_DATA)
I've checked the values while debugging and found that sometimes one of the two conditions is set to finished but the other is not. Both functions have already finished, though, so the loop just keeps polling a state that will never change.
The code for reproduce (with functionality necessary for this issue):
import time
from multiprocessing.pool import ThreadPool
class ExecResult(object):
    """Collect stdout, stderr and exit code of a command.

    The two receive functions are submitted to one-thread pools when the
    object is constructed; ``wait_for_data`` polls until both results are
    ready and then stores them on the instance.

    ``command`` is accepted but not used by the code shown here.
    """

    def __init__(self, command=None, exit_status_func=None,
                 receive_stdout_func=None, receive_stderr_func=None,
                 connection=None):
        self.connection = connection
        self.stdout = None
        self.stderr = None
        self.ecode = None
        self.ts_stop = None
        self._exit_status_f = exit_status_func
        self.result_available = False
        self.__fetch_streams(receive_stdout_func, receive_stderr_func)

    def wait_for_data(self):
        """Poll until both streams are ready, then fill stdout/stderr/ecode.

        Does nothing when the results were already fetched by an earlier call.
        """
        WAIT_FOR_DATA = 0.1
        if not self.result_available:
            # Here it hangs out for 95 percent
            while not (self.__stdout_async_r.ready() and self.__stderr_async_r.ready()):
                time.sleep(WAIT_FOR_DATA)
            self.result_available = True
            self.ts_stop = time.time()
            self.stdout = self.__stdout_async_r.get(timeout=2)
            self.stderr = self.__stderr_async_r.get(timeout=2)
            self.ecode = self._exit_status_f()

    def __fetch_streams(self, stdout_func, stderr_func):
        """Submit both receive functions to their own single-thread pool."""
        # NOTE(review): this is the code under discussion, left as posted.
        # Both pools are referenced only by local names and are closed but
        # not joined before this method returns.
        stdout_t = ThreadPool(processes=1)
        stderr_t = ThreadPool(processes=1)
        self.__stdout_async_r = stdout_t.apply_async(func=stdout_func)
        self.__stderr_async_r = stderr_t.apply_async(func=stderr_func)
        stdout_t.close()
        stderr_t.close()
def stderr():
return "stderr"
def stdout():
return "stdout"
def exit():
return "0"
# actual reproduction
res = ExecResult(None, exit, stdout, stderr, None)
res.wait_for_data() #if are data available get them or wait
print res.stdout
print res.stderr
print res.ecode

As it usually is, I found out an answer for this after some time spent cursing and doing some tea.
Solution is to add this after close methods:
stdout_t.join()
stderr_t.join()
So this is the repaired part as whole:
def __fetch_streams(self, stdout_func, stderr_func):
stdout_t = ThreadPool(processes=1)
stderr_t = ThreadPool(processes=1)
self.__stdout_async_r = stdout_t.apply_async(func=stdout_func)
self.__stderr_async_r = stderr_t.apply_async(func=stderr_func)
stdout_t.close()
stderr_t.close()
stdout_t.join()
stderr_t.join()

Related

How to get every second's GPU usage in Python

I have a model which runs on tensorflow-gpu, and my device is an NVIDIA GPU. I want to record the GPU usage every second so that I can measure the average/max GPU usage. I can do this manually by opening two terminals, one to run the model and another to measure with nvidia-smi -l 1. Of course, this is not a good way. I also tried to use a Thread to do that; here it is.
import subprocess as sp
import os
from threading import Thread
class MyThread(Thread):
    """Thread that runs ``func(*args)`` and keeps its return value."""

    def __init__(self, func, args):
        super(MyThread, self).__init__()
        self.func = func
        self.args = args
        # Defined up front so get_result() does not raise AttributeError
        # when it is called before run() has finished.
        self.result = None

    def run(self):
        """Call the wrapped function and store what it returns."""
        self.result = self.func(*self.args)

    def get_result(self):
        """Return the stored result, or None if run() has not completed."""
        return self.result
def get_gpu_memory():
output_to_list = lambda x: x.decode('ascii').split('\n')[:-1]
ACCEPTABLE_AVAILABLE_MEMORY = 1024
COMMAND = "nvidia-smi -l 1 --query-gpu=memory.used --format=csv"
memory_use_info = output_to_list(sp.check_output(COMMAND.split()))[1:]
memory_use_values = [int(x.split()[0]) for i, x in enumerate(memory_use_info)]
return memory_use_values
def run():
pass
t1 = MyThread(run, args=())
t2 = MyThread(get_gpu_memory, args=())
t1.start()
t2.start()
t1.join()
t2.join()
res1 = t2.get_result()
However, this does not return every second's usage as well. Is there a good solution?
In the command nvidia-smi -l 1 --query-gpu=memory.used --format=csv
the -l stands for:
-l, --loop= Probe until Ctrl+C at specified second interval.
So the command:
COMMAND = 'nvidia-smi -l 1 --query-gpu=memory.used --format=csv'
sp.check_output(COMMAND.split())
will never terminate and return.
It works if you move the loop out of the command (nvidia-smi) and into Python.
Here is the code:
import subprocess as sp
import os
from threading import Thread , Timer
import sched, time
def get_gpu_memory():
    """Return the ``memory.used`` value of every GPU from one nvidia-smi call.

    Returns:
        A list of ints, one per data row printed by nvidia-smi (the first
        whitespace-separated token of each row).

    Raises:
        RuntimeError: if nvidia-smi exits with a non-zero status.
    """
    COMMAND = "nvidia-smi --query-gpu=memory.used --format=csv"
    try:
        raw_output = sp.check_output(COMMAND.split(), stderr=sp.STDOUT)
    except sp.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
    # [:-1] drops the empty string left after the final newline,
    # [1:] drops the CSV header row.
    memory_use_info = raw_output.decode('ascii').split('\n')[:-1][1:]
    return [int(x.split()[0]) for x in memory_use_info]
def print_gpu_memory_every_5secs():
"""
This function calls itself every 5 secs and print the gpu_memory.
"""
Timer(5.0, print_gpu_memory_every_5secs).start()
print(get_gpu_memory())
print_gpu_memory_every_5secs()
"""
Do stuff.
"""
Here is a more rudimentary way of getting this output, however just as effective - and I think easier to understand. I added a small 10-value cache to get a good recent average and upped the check time to every second. It outputs average of the last 10 seconds and the current each second, so operations that cause usage can be identified (what I think the original question was).
import subprocess as sp
import time
memory_total=8192 #found with this command: nvidia-smi --query-gpu=memory.total --format=csv
memory_used_command = "nvidia-smi --query-gpu=memory.used --format=csv"
def isolate_memory_value(x):
    """Return all ASCII digits found in the bytes *x*, joined in order.

    NOTE(review): every digit in the output is concatenated, so this looks
    like it assumes nvidia-smi reports a single GPU; confirm before using
    it on a multi-GPU machine.
    """
    return "".join(y for y in x.decode('ascii') if y in "0123456789")


def main():
    """Print current and recent-average memory usage once per second, forever."""
    from collections import deque

    # Bounded cache: only the 10 most recent percentages are kept.
    percentage_cache = deque(maxlen=10)
    while True:
        memory_used = isolate_memory_value(sp.check_output(memory_used_command.split(), stderr=sp.STDOUT))
        percentage = float(memory_used)/float(memory_total)*100
        percentage_cache.append(percentage)
        print("curr: " + str(percentage) + " %", "\navg: " + str(sum(percentage_cache)/len(percentage_cache))[:4] + " %\n")
        time.sleep(1)
main()

Multiprocessing With r2pipe

I'm having issues with using r2pipe, Radare2's API, with the multiprocessing Pool.map function in python. The problem I am facing is the application hangs on pool.join().
My hope was to use multithreading via the multiprocessing.dummy class in order to evaluate functions quickly through r2pipe. I have tried passing my r2pipe object as a namespace using the Manager class. I have attempted using events as well, but none of these seem to work.
class Test:
def __init__(self, filename=None):
if filename:
self.r2 = r2pipe.open(filename)
else:
self.r2 = r2pipe.open()
self.r2.cmd('aaa')
def t_func(self, args):
f = args[0]
r2_ns = args[1]
print('afbj # {}'.format(f['name']))
try:
bb = r2_ns.cmdj('afbj # {}'.format(f['name']))
if bb:
return bb[0]['addr']
else:
return None
except Exception as e:
print(e)
return None
def thread(self):
funcs = self.r2.cmdj('aflj')
mgr = ThreadMgr()
ns = mgr.Namespace()
ns.r2 = self.r2
pool = ThreadPool(2)
results = pool.map(self.t_func, product(funcs, [ns.r2]))
pool.close()
pool.join()
print(list(results))
This is the class I am using. I make a call to the Test.thread function in my main function.
I expect the application to print out the command it is about to run in r2pipe afbj # entry0, etc. Then to print out the list of results containing the first basic block address [40000, 50000, ...].
The application does print out the command about to run, but then hangs before printing out the results.
ENVIRONMENT
radare2: radare2 4.2.0-git 23712 # linux-x86-64 git.4.1.1-97-g5a48a4017
commit: 5a48a401787c0eab31ecfb48bebf7cdfccb66e9b build: 2020-01-09__21:44:51
r2pipe: 1.4.2
python: Python 3.6.9 (default, Nov 7 2019, 10:44:02)
system: Ubuntu 18.04.3 LTS
SOLUTION
This may be due to passing the same instance of r2pipe.open() to every call of t_func in the pool. One solution is to move the following lines of code into t_func:
r2 = r2pipe.open('filename')
r2.cmd('aaa')
This works, however its terribly slow to reanalyze for each thread/process.
Also, it is often faster to allow radare2 to do as much of the work as possible and limit the number of commands we need to send using r2pipe.
This problem is solved by using the command: afbj ##f
afbj # List basic blocks of given function and show results in json
##f # Execute the command for each function
EXAMPLE
Longer Example
import json

import r2pipe

R2: r2pipe.open_sync = r2pipe.open('/bin/ls')
R2.cmd("aaaa")
# The output is split on newlines and each line is parsed on its own;
# [:-1] drops the empty string after the final newline.
FUNCS: list = R2.cmd('afbj ##f').split("\n")[:-1]
RESULTS: list = []
for func in FUNCS:
    # Parse as JSON rather than eval(): JSON literals such as true/false/null
    # are not Python names, and eval would execute whatever the tool printed.
    basic_block_info: list = json.loads(func)
    first_block: dict = basic_block_info[0]
    address_first_block: int = first_block['addr']
    RESULTS.append(hex(address_first_block))
print(RESULTS)
'''
['0x4a56', '0x1636c', '0x3758', '0x15690', '0x15420', '0x154f0', '0x15420',
'0x154f0', '0x3780', '0x3790', '0x37a0', '0x37b0', '0x37c0', '0x37d0', '0x0',
...,
'0x3e90', '0x6210', '0x62f0', '0x8f60', '0x99e0', '0xa860', '0xc640', '0x3e70',
'0xd200', '0xd220', '0x133a0', '0x14480', '0x144e0', '0x145e0', '0x14840', '0x15cf0']
'''
Shorter Example
import json

import r2pipe

R2 = r2pipe.open('/bin/ls')
R2.cmd("aaaa")
# json.loads instead of eval: the lines are JSON, not Python expressions.
print([hex(json.loads(func)[0]['addr']) for func in R2.cmd('afbj ##f').split("\n")[:-1]])

Python Multiprocessing Process causes Parent to idle

My question is very similar to this question here, except the solution with catching didn't quite work for me.
Problem: I'm using multiprocessing to handle a file in parallel. Around 97%, it works. However, sometimes, the parent process will idle forever and CPU usage shows 0.
Here is a simplified version of my code
from PIL import Image
import imageio
from multiprocessing import Process, Manager
def split_ranges(min_n, max_n, chunks=4):
    """Split the span from ``min_n`` to ``max_n`` into consecutive ranges.

    Each range covers ``(max_n - min_n) // chunks + 1`` values, except the
    last one, which is cut off at ``max_n - 1`` (exclusive).
    """
    # Floor division keeps the chunk size an int on Python 3 as well;
    # range() rejects the float that "/" would produce there.
    chunksize = ((max_n - min_n) // chunks) + 1
    return [range(x, min(max_n-1, x+chunksize)) for x in range(min_n, max_n, chunksize)]
def handle_file(file_list, vid, main_array):
for index in file_list:
try:
#Do Stuff
valid_frame = Image.fromarray(vid.get_data(index))
main_array[index] = 1
except:
main_array[index] = 0
def main(file_path):
mp_manager = Manager()
vid = imageio.get_reader(file_path, 'ffmpeg')
num_frames = vid._meta['nframes'] - 1
list_collector = mp_manager.list(range(num_frames)) #initialize a list as the size of number of frames in the video
total_list = split_ranges(10, min(200, num_frames), 4) #some arbitrary numbers between 0 and num_frames of video
processes = []
file_readers = []
for split_list in total_list:
video = imageio.get_reader(file_path, 'ffmpeg')
proc = Process(target=handle_file, args=(split_list, video, list_collector))
print "Started Process" #Always gets printed
proc.Daemon = False
proc.start()
processes.append(proc)
file_readers.append(video)
for i, proc in enumerate(processes):
proc.join()
print "Join Process " + str(i) #Doesn't get printed
fd = file_readers[i]
fd.close()
return list_collector
The issue is that I can see the processes starting and I can see that all of the items are being handled. However, sometimes, the processes don't rejoin. When I check back, only the parent process is there but it's idling as if it's waiting for something. None of the child processes are there, but I don't think join is called because my print statement doesn't show up.
My hypothesis is that this happens to videos with a lot of broken frames. However, it's a bit hard to reproduce this error because it rarely occurs.
EDIT: Code should be valid now. Trying to find a file that can reproduce this error.

Implementing multiprocessing in a loop scraper and appending the data

I am making a web scraper to build a database. The site I plan to use has index pages each containing 50 links. The amount of pages to be parsed is estimated to be around 60K and up, this is why I want to implement multiprocessing.
Here is some pseudo-code of what I want to do:
def harvester(index):
main=dict()
....
links = foo.findAll ( 'a')
for link in links:
main.append(worker(link))
# or maybe something like: map_async(worker(link))
def worker(url):
''' this function gather the data from the given url'''
return dictionary
Now what I want to do with that is to have a certain number of worker function to gather data in parallel on different pages. This data would then be appended to a big dictionary located in harvester or written directly in a csv file by the worker function.
I'm wondering how I can implement parallelism. I have done a fair
amount of research on gevent, threading and multiprocessing, but
I am not sure how to implement it.
I am also not sure if appending data to a large dictionary or writing
directly in a csv using DictWriter will be stable with that many input at the same time.
Thanks
I propose you to split your work into separate workers which communicate via Queues.
Here you mostly have IO wait time (crawling, csv writing)
So you can do the following (not tested, just see the idea):
import threading
import Queue
class CsvWriter(threading.Thread):
    """Thread that writes rows taken from ``resultq`` to ``results.csv``.

    Rows are expected to be dicts; the value ``-1`` on the queue stops the
    thread.

    Args:
        resultq: queue the rows are read from.
        fieldnames: optional column names for the CSV. When omitted, the
            sorted keys of the first row received are used.
    """

    def __init__(self, resultq, fieldnames=None):
        super(CsvWriter, self).__init__()
        self.resultq = resultq
        self.fieldnames = fieldnames
        # csv.DictWriter cannot be constructed without field names, so the
        # writer is created in run() once they are known.
        self.writer = None

    def run(self):
        import csv  # local import: the snippet's import lines do not include csv

        # The with-block guarantees the file is flushed and closed on exit.
        with open('results.csv', 'wb') as out_file:
            while True:
                row = self.resultq.get()
                if row == -1:
                    break
                if self.writer is None:
                    names = self.fieldnames
                    if names is None:
                        names = sorted(row)
                    self.writer = csv.DictWriter(out_file, names)
                self.writer.writerow(row)
class Crawler(threading.Thread):
    """Thread that turns links from one queue into results on another.

    Links are read from ``inputqueue`` until the value ``-1`` is received;
    the return value of ``extract_data`` for each link is put on ``resultq``.
    """

    def __init__(self, inputqueue, resultq):
        super(Crawler, self).__init__()
        # The parameter is named inputqueue; the original body referred to
        # an undefined name "inputq" here.
        self.iq = inputqueue
        self.oq = resultq

    def run(self):
        while True:
            link = self.iq.get()
            if link == -1:
                break
            self.oq.put(self.extract_data(link))

    def extract_data(self, link):
        # crawl and extract what you need and return a dict
        pass
def main():
    """Queue every URL, run ten crawlers plus one CSV writer, wait for all."""
    linkq = Queue.Queue()
    for url in your_urls:
        linkq.put(url)
    resultq = Queue.Queue()
    writer = CsvWriter(resultq)
    writer.start()
    crawlers = [Crawler(linkq, resultq) for _ in range(10)]
    for crawler in crawlers:
        crawler.start()
    # One stop marker per crawler, queued behind all the URLs.
    for _ in crawlers:
        linkq.put(-1)
    for crawler in crawlers:
        crawler.join()
    # Stop the writer only after every crawler has finished producing rows.
    resultq.put(-1)
    writer.join()
This code should work (after fixing possible typos) and will exit once all the URLs have been processed.

This Non-Threaded script unexpectedly runs faster than the Threaded version

I have a python script which validates data fetched from some rows in a database and then logs the errors in a different table in the same database.
The script validates each row and marks it as validated & has error = True/False depending on the validation outcome. This process is repeated for each row. With that, I thought I'd add some steroids by creating threads such that the validation for each row is done by independent threads thus reducing the time it takes to validate a batch of rows.
To my surprise, I find that the threaded script is taking slightly longer than the non-threaded one. On average to validate 1502 rows of data it takes the Non-Threaded script 1.5 seconds while the threaded script takes 2.27 seconds. That might not be much but ideally I'll be running through 2 million records at a go so that time overhead will be significant. That plus I would assume that threaded apps would finish faster! :-)
The two scripts clock the same time of about 0.01 seconds upto the point of creating threads. By this point the SQLAlchemy session is created and all the data to be validated and relations i.e foreign keys etc are fetched. From there though, the non-threaded script finishes faster. Below is my code.
1.0 None-Threaded Script
#Alot of code goes above this to fetch the data that is passed on to the validator function
#However, the two scripts are the same upto this point in regards to time taken so didn't see need to post them.
for lf_detail_id in load_file_detail_id:
params = lf_detail_id, load_file_id, entry_number[lf_detail_counter], \
data[lf_detail_counter], template_version[lf_counter], \
load_file_detail, error, dt_file, dt_columns
data_list.append(params)
lf_detail_counter += 1
no_of_records += 1
validator = Validate()
validator.validator(no_of_records, data_list)
record_counter += lf_detail_counter
data_list = None
no_of_records = 0
print("Validated '%s': seconds %s" %(filename[lf_counter], time.time()-file_start_time)) #print time it took to run'
#Mark the load file as validated
is_done = load_file.set_validation(load_file_id, True)
if is_done == False:
raise Exception ("Can't update load_file's is_validated parameter: ", lf_detail_id)
#Reset counters
lf_detail_counter = 0
lf_counter += 1
#Commit The Entire Transaction.
session.commit()
print("NoThread:Finished validating %s file(s) with %s record(s) in %s seconds\n" %(lf_counter, record_counter, time.time()- process_start_time))
1.1. Validation Function for Non-Threaded Script
class Validate():
has_error = None
def validator(self, loop_length, job):
'''Validate data'''
for row_counter in range(loop_length):
load_file_detail_id, load_file_id, entry_number, data, \
template_version, load_file_detail, error, dt_file, dt_columns = job[row_counter]
error_detail = ErrorLogDetail()
if data.strip() == "":
error_detail.errorlog = error
error_detail.load_file_detail_id = load_file_detail_id
error_detail.pos_row = entry_number
error_detail.pos_col = None
error_detail.value_provided = None
error_detail.column_name = None
error_detail.value_provided = None
error_detail.description = "error message 1"
session.add(error_detail)
error_detail = ErrorLogDetail()
self.has_error = True
self.set_validation(load_file_detail, load_file_detail_id, True, False)
continue
elif len(data) != int(dt_file.data_length):
error_detail.errorlog = error
error_detail.load_file_detail_id = load_file_detail_id = load_file_detail_id
error_detail.pos_row = entry_number
error_detail.pos_col = None
error_detail.column_name = None
error_detail.value_provided = None
error_detail.description = "error message 2"
session.add(error_detail)
error_detail = ErrorLogDetail()
self.has_error = True
self.set_validation(load_file_detail, load_file_detail_id, True, False)
continue
else:
#Continue with extra validation
#If record passes all validation then mark mark it as haserror = False
if self.has_error == False:
self.set_validation(load_file_detail, load_file_detail_id, False, True)
else:
self.has_error = False
jobs.task_done() #For the script with threading the job is marked as done. Else this does not appear in the non-threaded script
2.0 Threaded Script
#Alot of code goes above this to fetch the data that is passed on to the validator function
#However, the two scripts are the same upto this point in regards to time taken so didn't see need to post them.
for lf_detail_id in load_file_detail_id:
params = lf_detail_id, load_file_id, entry_number[lf_detail_counter], \
data[lf_detail_counter], template_version[lf_counter], \
load_file_detail, error, dt_file, dt_columns
data_list.append(params)
lf_detail_counter += 1
queue_size += 1
if queue_size == THREAD_LIMIT:
myqueuing(queue_size, data_list)
queue_size = 0
#spawn a pool of threads, and pass them queue instance
if queue_size > 0:
myqueuing(queue_size, data_list)
#Keep record of rows processed
record_counter += lf_detail_counter
print("Validated '%s': seconds- %s " %(filename[lf_counter], time.time()-file_start_time)) #print time it took to run'
#Mark the load file as validated
is_done = load_file.set_validation(load_file_id, True)
if is_done == False:
raise Exception ("Can't update load_file's is_validated parameter: ", lf_detail_id)
#Commit The Entire Transaction.
session.commit()
#Reset counters
lf_detail_counter = 0
lf_counter += 1
data_list = None
queue_size = 0
print("HasThread:Finished loading %s file(s) with %s record(s) in %s seconds\n" %(lf_counter, record_counter, time.time()-process_start_time)) #print time it took to run'
2.1. Threaded Validation Function
THREAD_LIMIT = 50 # This is how many threads we want
jobs = queue.Queue() # This sets up the queue object to use 5 slots
singlelock = threading.Lock() # This is a lock so threads don't print trough each other (and other reasons)
def myqueuing(queuesize, data):
    '''Put the fetched data in a queue and instantiate threads to
    process the queue'''
    # Spawn the threads
    is_valid_date("20131212", True) #Calling this here to avoid a bug in time.striptime() when threading
    for _ in range(queuesize):
        # This is the thread class that we instantiate.
        workerbee().start()
    # Put stuff in queue
    for i in range(queuesize):
        # Block for up to 2 seconds if the queue is full, then report it.
        try:
            jobs.put(data[i], block=True, timeout=2)
        except queue.Full:
            # "with" releases the lock even if print fails; the original
            # called singlelock.lock.release(), which does not exist.
            with singlelock:
                print ("The queue is full !")
    # Wait for the threads to finish
    with singlelock:  # Hold the lock so we can print
        print ("Waiting for threads to finish.")
    jobs.join() # Blocks until task_done() was called for every queued item.
class workerbee(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
self.lock = threading.Lock()
self.has_error = False
def run(self):
#try:
job = jobs.get(True,1)
load_file_detail_id, load_file_id, entry_number, data, \
template_version, load_file_detail, error, dt_file, dt_columns = job
'''Validates the data.'''
error_detail = ErrorLogDetail()
#Again please note that this part is identical for both the non-threaded and the threaded script.
#After each pass on a record, the record is marked as validated and if has_error = True
if data.strip() == "":
error_detail.errorlog = error
error_detail.load_file_detail_id = load_file_detail_id
error_detail.pos_row = entry_number
error_detail.pos_col = None
error_detail.value_provided = None
error_detail.column_name = None
error_detail.value_provided = None
error_detail.description = "erro message1"
session.add(error_detail)
error_detail = ErrorLogDetail()
self.has_error = True
self.set_validation(load_file_detail, load_file_detail_id, True, True)
elif len(data) != int(dt_file.data_length):
error_detail.errorlog = error
error_detail.load_file_detail_id = load_file_detail_id = load_file_detail_id
error_detail.pos_row = entry_number
error_detail.pos_col = None
error_detail.column_name = None
error_detail.value_provided = None
error_detail.description = "erro message2")
session.add(error_detail)
error_detail = ErrorLogDetail()
self.has_error = True
self.set_validation(load_file_detail, load_file_detail_id, True, True)
else:
#Continue with further validation - about 5 other validation checks
#If record passes all validation then mark mark it as haserror = False
if self.has_error == False:
self.set_validation(load_file_detail, load_file_detail_id, False, True)
else:
self.has_error = False
jobs.task_done() #For the script with threading the job is marked as done. Else this does not appear in the non-threaded script
3.0. Common function for setting validation in both threaded and non-threaded
def set_validation(self, load_file_detail, load_file_detail_id, has_error, can_be_loaded):
'''Mark the record as having been validated and whether has error = True or False'''
#print("haserror and canbeloaded ", has_error, can_be_loaded)
is_done = load_file_detail.set_validation_and_error(load_file_detail_id, True, has_error, can_be_loaded)
if is_done == False:
raise Exception ("Can't update load_file_detail's is_validated parameter: ", load_file_detail_id)
3.1. Actual SQLAlchemy session for saving the validation status
def set_validation_and_error(self, load_file_detail_id, is_validated, has_error, can_be_loaded):
result = session.execute('UPDATE load_file_detail SET is_validated=%s, has_error=%s, can_be_loaded=%s WHERE id=%s' \
%(is_validated, has_error, can_be_loaded, load_file_detail_id))
So, the fetching of data to be validated is the same and both scripts take same amount of time up to that point. The validation process is the same for both scripts and saving to DB is the same i.e. Section 3.0 and 3.1 are shared by both scripts. The only difference is the validation with multiple threads. So am thinking maybe there is something about the multiple threads and SQLAlchemy that is making the app slower in threaded mode? Have I implemented the threaded function in the proper way? One of those or threading is not suitable in this scenario. Suggestions welcome.
You should create a Queue for logging and add a dedicated "logger" thread. That way you can remove the locks, and the code should be faster.
Also create a DB connection in each thread so that data can be fetched in parallel.
Because of the GIL, threads only parallelize C-library calls.
To parallelize Python code you must use multiprocessing.
I wrote a test for you that shows how to process an iterable:
def produce_data(amount=100000, invalid=1, valid=10):
    """Yield ``(row_id, flag)`` tuples with ids counting up from 1.

    ``flag`` is drawn at random from a pool holding ``valid`` True values
    and ``invalid`` False values.
    """
    # produce_data = sql('request').getall()
    import random
    outcomes = [True]*valid + [False]*invalid
    row_id = 0  # renamed from "id" so the builtin is not shadowed
    while row_id < amount:
        row_id += 1
        yield (row_id, random.choice(outcomes))
def validate(row):
    """Return True when ``row[1]`` is truthy, False otherwise.

    Each branch sleeps for a millisecond to stand in for an SQL request.
    """
    if row[1]:
        time.sleep(0.001) #set valid sql request emulation.
        return True
    else:
        time.sleep(0.001) #set invalid sql request emulation.
        return False
def single():
for row in produce_data():
validate(row)
def targeted():
    """Validate every produced row in a thread of its own and wait for all."""
    import threading
    workers = []
    for row in produce_data():
        worker = threading.Thread(target=validate,args=(row,))
        # The original only constructed the Thread objects, so validate()
        # was never actually run.
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()
Uley = 50
class Bee(object):
    """Queue-consuming worker, meant to be mixed into a Thread or Process.

    ``run`` takes items from the queue and validates them until it receives
    a falsy item, which acts as the stop marker.
    """
    error=False
    running = True

    def __init__(self,queue,*args,**kwargs):
        self.queue=queue #dont use any global variable!
        # every bee must have unique db connection and session.
        #self.session = db.connection().session()
        # initialize it there.
        super(Bee,self).__init__(*args,**kwargs)

    def run(self):
        while self.running:
            data=self.queue.get()
            if not data:
                # The stop marker is acknowledged too, so that join() on
                # the queue is not left waiting for it.
                self.queue.task_done()
                break
            self.error = validate(data) # refactor it to self.validate(data) to be able to get cursor from self.session.
            self.queue.task_done()
        #self.session.commit()
def treaded():
import threading,Queue
class TreadedBee(Bee,threading.Thread): pass
q = Queue.Queue()
for i in range(Uley): #bees started before data was provided.
bee=TreadedBee(q)
bee.daemon = True
bee.start()
for row in produce_data(): #you dont need to get all data to begin processing, at this place must be cursor of response.
q.put(row)
q.join()
for i in range(Uley):
q.put(None)
def forked():
    """Validate all produced rows with a set of worker processes."""
    from multiprocessing import Process,JoinableQueue
    class ForkedBee(Bee,Process): pass
    q = JoinableQueue()
    bees = []
    for i in range(Uley):
        bee=ForkedBee(q)
        bee.start()
        bees.append(bee)
    for row in produce_data():
        q.put(row)
    q.join()
    # One stop marker per worker so every process leaves its run() loop.
    for _ in bees:
        q.put(None)
    q.close()
    # Join the children so none is left behind as a zombie.
    for bee in bees:
        bee.join()
def pool():
    """Validate all produced rows with a multiprocessing Pool."""
    from multiprocessing import Pool
    # Named "workers" so the local does not shadow this function's own name.
    workers = Pool(processes=Uley)
    try:
        workers.map(validate,produce_data())
    finally:
        # Shut the worker processes down instead of leaving them running.
        workers.close()
        workers.join()
if __name__ == "__main__":
    # Times each strategy in turn; the trailing comments are the seconds
    # the author measured.
    import time
    s=time.time()
    single()
    print(time.time()-s) #109
    s=time.time()
    # Was a second single() call. single() sleeps 1 ms for each of the
    # 100000 default rows, so it cannot finish in 6 s; this slot times
    # targeted().
    targeted()
    print(time.time()-s) #6
    s=time.time()
    treaded()
    print(time.time()-s) #12
    s=time.time()
    forked()
    print(time.time()-s) #6
    s=time.time()
    pool()
    print(time.time()-s) #4
test result:
$ python2 tgreads.py
109.779700994
5.84457302094
12.3814198971
5.97618508339
3.69856286049
targeted will flood the CPU and memory, and you can't provide individual DB connections to the threads; using a shared connection is not safe. If you want to go this way, you need to provide an output queue and implement a collector that communicates with the DB. pool is the shortest code and the fastest, but it is not well suited to setting up per-worker connections.

Categories

Resources