How do I send a message to process zero from all other processes?
I'm using mpi4py with Python 2, and was following this example: Parallel programming Research
Why does this line fail, and what fixes it?
searchResult = comm.recv(source=tempRank)
My code below appears to work fine until it reaches the line above. I put print statements above and below that line, so I am pretty sure this is where the problem is.
My expectation was that processor zero would receive a message from each processor, but it does not. The program seems to just hang and do nothing. Here is the program.
import time
from random import randint
from random import shuffle
from mpi4py import MPI
import sys
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = MPI.COMM_WORLD.Get_size()
name = MPI.Get_processor_name()
if rank == 0:
starttime=time.time()
searchResult=False
someNumber = 0
data = list(range(1,8))
chunks = [ [] for _ in range(size) ]
for i,chunk in enumerate(data):
chunks[i % size].append(chunk)
else:
data=None
chunks=None
someNumber=None
# scatter data to all processors
data=comm.scatter(chunks,root=0)
# give another variable to each processor
someNumber = comm.bcast(someNumber,root=0)
print('im rank=',rank,', my data=', data, ' searching for someNumber = ',someNumber)
searchResult=False
for num in data:
if someNumber == num:
print('found the someNumber')
searchResult=True
break
if searchResult == False:
print('someNumber not found')
searchResult=False
# Now, at this point, I want all processors (including processor 0)
# to send processor 0 a message
tempRank=rank
# attempting to send process 0 a message from all other processes
# (does/can processor 0 send itself a message?)
if rank == 0:
print('this line prints one time, and program hangs')
searchResult = comm.recv(source=tempRank)
print('this line never prints, so whats wrong with previous line?')
else:
comm.send(searchResult,dest=0)
if rank == 0:
if searchResult == True:
print('found the someNumber, everyone stop searching .. how to make all processes stop?')
print('elapsedtime = {}'.format(time.time()-starttime))
else:
print('no one found the someNumber')
print('elapsedtime = {}'.format(time.time()-starttime))
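The hang happens because rank 0 calls comm.recv(source=0) but never sends anything to itself. One way to collect every rank's result on rank 0 is the collective comm.gather, which includes rank 0's own contribution automatically. Below is only a minimal sketch, with searchResult reduced to a placeholder value:
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# Placeholder for the real outcome of the search loop.
searchResult = (rank % 2 == 0)

# Every rank (including rank 0) contributes one value; rank 0 receives the list.
allResults = comm.gather(searchResult, root=0)

if rank == 0:
    print('gathered results: {}'.format(allResults))
    print('someNumber found by at least one rank: {}'.format(any(allResults)))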
So I am currently working on a program that was handed down to me from a previous coworker, and I am working through a strange bug. When reading data output byte by byte from two separate serial sources, Python will write twice into the same cell of the .csv file, and the same doubled output appears on the console.
import serial
from datetime import datetime
import os
pressure_passed = False
arduino_passed = False
file_passed = False
BAUD_RATE = 115200
GARBAGE_CYCLES = 3 # how many cycles to ignore before logging data
garbage_cycle = 0
# Save data to log file
def LogData(startTime, pressureData, arduinoData, file):
global garbage_cycle
if garbage_cycle < GARBAGE_CYCLES:
garbage_cycle += 1
else:
delta = datetime.now() - startTime
ms = delta.total_seconds() * 1000
dataString = "{:0.2f}, {}, {}\n".format(ms, pressureData, arduinoData)
file.write(dataString)
file.flush()
print(dataString, end = "")
# Get the COM port for the Mark-10 Series 5
while not pressure_passed:
try:
pressure_com = input("Enter Mark-10 Series 5 COM Port #: ")
pressure_ser = serial.Serial("COM" + str(pressure_com), BAUD_RATE)
pressure_passed = True
except:
print("Invalid COM Port, please enter a valid port.\n-----")
# Get the COM port for the Arduino
while not arduino_passed:
try:
arduino_com = input("Enter Arduino COM Port #: ")
arduino_ser = serial.Serial("COM" + str(arduino_com), BAUD_RATE)
arduino_passed = True
except:
print("Invalid COM Port, please enter a valid port.\n-----")
# Get the name for the log file
while not file_passed:
try:
file_name = input("Enter log file name: ")
# Add extension if not already given
if "." not in file_name:
file_name += ".csv"
log_file = open(file_name, "a")
# Add header row to log file
if os.stat(log_file.name).st_size == 0:
log_file.write("time (ms), pressure, rate (deg/ms)\n")
file_passed = True
except:
print("Invalid file, or could not open the file specified.\n-----")
start = datetime.now()
# Variables to read serial input
pressure_data = ""
last_pressure = ""
arduino_data = ""
last_arduino = ""
# Main program loop
# Serial is read from byte by byte to better sync the two devices
while True:
try:
x_changed = False
y_changed = False
# Read from Mark-10 serial if available
# x is a byte read from the serial line, converted to ascii
if pressure_ser.in_waiting > 0:
x = pressure_ser.read().decode('ascii')
x_changed = True
# Read from Arduino serial if available
# y is a byte read from the serial line, converted to ascii
if arduino_ser.in_waiting > 0:
y = arduino_ser.read().decode('ascii')
y_changed = True
# If new data received, check if we should log it
if x_changed:
if x == '\n': # New line detected, log the accumulated data
if last_pressure != pressure_data:
LogData(start, last_pressure, last_arduino, log_file)
last_pressure = pressure_data
pressure_data = ""
elif x != '\r': # Otherwise, add the read character to the string
pressure_data += x
if y_changed:
if y == '\n': # New line detected, log the accumulated data
if last_arduino != arduino_data:
LogData(start, last_pressure, last_arduino, log_file)
last_arduino = arduino_data
arduino_data = ""
elif y != '\r': # Otherwise, add the read character to the string
arduino_data += y
except Exception as e:
print(e)
if arduino_ser.isOpen():
arduino_ser.close()
if pressure_ser.isOpen():
pressure_ser.close()
log_file.close()
break
Here is what the file is spitting out, i.e. the double printing to a single cell (sample of the data not shown).
Any advice is much appreciated, thank you all!
It looks like when a new pressure is read in but the value has not changed from last time, the string that accumulates the characters is not reset, so the next reading doubles up. Then on the next pass, when the REAL pressure hasn't changed, it compares the doubled string to the non-doubled one and writes again, and vice versa.
Try unindenting the line that resets the string to remove it from the if clause:
# If new data received, check if we should log it
if x_changed:
if x == '\n': # New line detected, log the accumulated data
if last_pressure != pressure_data:
LogData(start, last_pressure, last_arduino, log_file)
last_pressure = pressure_data
pressure_data = ""
elif x != '\r': # Otherwise, add the read character to the string
pressure_data += x
Then the same thing for the arduino value block.
Your logs will probably be much shorter now.
I like your username! My guess is that it is reading from the serial port too quickly and going through the loop twice before the Arduino has time to change the value of in_waiting.
At the top of your code add:
import time
And in the LogData function add:
time.sleep(0.1)
Give that a shot and let me know if it helps. 0.1s may be too long for your application but it is a good test to see if this is the issue. If it is, you can play around with the amount of time it sleeps.
Based on the sample output provided, I think it's not writing twice; rather, the following specific condition is occasionally met, which triggers two identical LogData() calls.
Only when Condition 1 AND Condition 2 are both met is the data written "twice". Note that the LogData() call is the same in both conditions.
Condition 1:
# If new data received, check if we should log it
if x_changed:
if x == '\n': # New line detected, log the accumulated data
if last_pressure != pressure_data:
LogData(start, last_pressure, last_arduino, log_file)
Condition 2:
if y_changed:
if y == '\n': # New line detected, log the accumulated data
if last_arduino != arduino_data:
LogData(start, last_pressure, last_arduino, log_file)
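If it helps to confirm this, one hedged diagnostic tweak (the extra source parameter is purely illustrative) is to tag each LogData() call with the branch that fired, so duplicated rows can be attributed to Condition 1 or Condition 2 in the output:
from datetime import datetime

GARBAGE_CYCLES = 3
garbage_cycle = 0

def LogData(startTime, pressureData, arduinoData, file, source):
    """Same logic as the original LogData, plus a 'source' tag column."""
    global garbage_cycle
    if garbage_cycle < GARBAGE_CYCLES:
        garbage_cycle += 1
    else:
        ms = (datetime.now() - startTime).total_seconds() * 1000
        dataString = "{:0.2f}, {}, {}, {}\n".format(ms, pressureData, arduinoData, source)
        file.write(dataString)
        file.flush()
        print(dataString, end="")

# Condition 1 would then call LogData(start, last_pressure, last_arduino, log_file, "pressure")
# and Condition 2 would call LogData(start, last_pressure, last_arduino, log_file, "arduino").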
My main goal is to open 30 child processes from the parent process and then open an unknown number of new processes from each of those 30 child processes. I am going to call Redis for some location data from those new child processes, and I am not sure how many times I have to call it; it could be 100 or more than 1000 times. When I am calling more than 1000 times, I hit the open file limit. The error is:
OSError: [Errno 24] Too many open files
I don't want to manually increase the file descriptor limit on the production server. I want to put a throttle on the process creation, so that it never has more than 1000 connections open.
Here is my template code:
import multiprocessing
import time
from multiprocessing.dummy import Pool
from random import randint
class MultiProcessing():
def second_calculation(self, index1, index2):
random = randint(1, 10)
time.sleep(random)
print("Slept for: {} seconds".format(random))
print("Call done: index: {} | index2: {}".format(index1, index2))
def calculation(self, index):
child_process = list()
random = randint(1, 5)
time.sleep(random)
print("Slept for : {} seconds".format(random))
counter = 0
for i in range(0, 1500):
counter += 1
new_child_process = multiprocessing.Process(target=self.second_calculation, args=(index, counter))
child_process.append(new_child_process)
new_child_process.start()
for process in child_process:
process.join()
print("Request done: {}".format(index))
if __name__ == '__main__':
index = 0
parent_process = list()
m = MultiProcessing()
for i in range(0, 30):
index += 1
print("Index: {}".format(index))
new_process = multiprocessing.Process(target=m.calculation, args=(index,))
parent_process.append(new_process)
new_process.start()
for process in parent_process:
process.join()
Thank you.
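One hedged way to keep the number of live processes (and therefore open Redis connections and file descriptors) bounded is to replace the unbounded Process spawning with a fixed-size multiprocessing.Pool. This is only a sketch; MAX_WORKERS is an illustrative cap and second_calculation stands in for the real Redis call:
import multiprocessing
import time
from random import randint

MAX_WORKERS = 50  # illustrative cap; keep it well below the open-file limit

def second_calculation(index1, index2):
    # Stand-in for the real Redis lookup.
    time.sleep(randint(1, 3))
    print("Call done: index: {} | index2: {}".format(index1, index2))

if __name__ == '__main__':
    # A fixed-size pool never runs more than MAX_WORKERS worker processes at
    # once, so the two-level fan-out (30 x ~1500 calls) is flattened into one
    # task list that is worked through MAX_WORKERS tasks at a time.
    pool = multiprocessing.Pool(processes=MAX_WORKERS)
    jobs = []
    for index in range(1, 31):
        for counter in range(1, 1501):
            jobs.append(pool.apply_async(second_calculation, (index, counter)))
    for job in jobs:
        job.get()
    pool.close()
    pool.join()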
I may be approaching this all wrong, but this is where I'm at. I have very large log files I'm trying to search, up to 30 GB in some cases. I'm writing a script to pull info and have been playing with multiprocessing to speed it up a bit. Right now I'm testing running two functions at the same time to search from the top and the bottom to get results, which seems to work. I'm wondering if it's possible to stop one function on a result from the other, such that if the top function finds a result they both stop. This way I can build it out as needed.
from file_read_backwards import FileReadBackwards
from multiprocessing import Process
import sys
z = "log.log"
#!/usr/bin/env python
rocket = 0
def top():
target = "test"
with open(z) as src:
found= None
for line in src:
if len(line) == 0: break #happens at end of file, then stop loop
if target in line:
found= line
break
print(found)
def bottom():
target = "text"
with FileReadBackwards(z) as src:
found= None
for line in src:
if len(line) == 0: break #happens at end of file, then stop loop
if target in line:
found= line
break
print(found)
if __name__=='__main__':
p1 = Process(target = top)
p1.start()
p2 = Process(target = bottom)
p2.start()
Here's a proof-of-concept of the approach I mentioned in the comments:
import os
import random
import sys
from multiprocessing import Process, Value
def search(proc_no, file_name, seek_to, max_size, find, flag):
stop_at = seek_to + max_size
with open(file_name) as f:
if seek_to:
f.seek(seek_to - 1)
prev_char = f.read(1)
if prev_char != '\n':
# Landed in the middle of a line. Skip back one (or
# maybe more) lines so this line isn't excluded. Start
# by seeking back 256 bytes, then 512 if necessary, etc.
exponent = 8
pos = seek_to
while pos >= seek_to:
pos = f.seek(max(0, pos - (2 ** exponent)))
f.readline()
pos = f.tell()
exponent += 1
while True:
if flag.value:
break
line = f.readline()
if not line:
break # EOF
data = line.strip()
if data == find:
flag.value = proc_no
print(data)
break
if f.tell() > stop_at:
break
if __name__ == '__main__':
# list.txt contains lines with the numbers 1 to 1000001
file_name = 'list.txt'
info = os.stat(file_name)
file_size = info.st_size
if len(sys.argv) == 1:
# Pick a random value from list.txt
num_lines = 1000001
choices = list(range(1, num_lines + 1))
choices.append('XXX')
find = str(random.choice(choices))
else:
find = sys.argv[1]
num_procs = 4
chunk_size, remainder = divmod(file_size, num_procs)
max_size = chunk_size + remainder
flag = Value('i', 0)
procs = []
print(f'Using {num_procs} processes to look for {find} in {file_name}')
for i in range(num_procs):
seek_to = i * chunk_size
proc = Process(target=search, args=(i + 1, file_name, seek_to, max_size, find, flag))
procs.append(proc)
for proc in procs:
proc.start()
for proc in procs:
proc.join()
if flag.value:
print(find, 'found by proc', flag.value)
else:
print(find, 'not found')
After reading various posts[1] about reading files with multiprocessing and multithreading, it seems that neither is a great approach due to potential disk thrashing and serialized reads. So here's a different, simpler approach that is way faster (at least for the file with a million lines I was trying it out on):
import mmap
import sys
def search_file(file_name, text, encoding='utf-8'):
text = text.encode(encoding)
with open(file_name, 'rb') as f:
with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
index = m.find(text)
if index > -1:
# Found a match; now find beginning of line that
# contains match so we can grab the whole line.
while index > 0:
index -= 1
if m[index] == 10:
index += 1
break
else:
index = 0
m.seek(index)
line = m.readline()
return line.decode(encoding)
if __name__ == '__main__':
file_name, search_string = sys.argv[1:]
line = search_file(file_name, search_string)
sys.stdout.write(line if line is not None else f'Not found in {file_name}: {search_string}\n')
I'm curious how this would perform with a 30GB log file.
[1] Including this one
Simple example using a multiprocessing.Pool and callback function.
Terminates remaining pool processes once a result has returned.
You could add an arbitrary number of processes to search from different offsets in the file using this approach.
import math
import time
from multiprocessing import Pool
from random import random
def search(pid, wait):
"""Sleep for wait seconds, return PID
"""
time.sleep(wait)
return pid
def done(result):
"""Do something with result and stop other processes
"""
print("Process: %d done." % result)
pool.terminate()
print("Terminate Pool")
pool = Pool(2)
pool.apply_async(search, (1, math.ceil(random() * 3)), callback=done)
pool.apply_async(search, (2, math.ceil(random() * 3)), callback=done)
# do other stuff ...
# Wait for result
pool.close()
pool.join() # block our main thread
This is essentially the same as Blurp's answer, but I shortened it a bit and made it more general. As you can see, top should be an infinite loop, but bottom stops top immediately.
from multiprocessing import Process
valNotFound = True
def top():
i=0
while valNotFound:
i += 1
def bottom():
valNotFound = False
p1 = Process(target = top)
p2 = Process(target = bottom)
p1.start()
p2.start()
I'm trying to get code similar to the following example working correctly:
from multiprocessing import Process, Queue, Manager, Pool
import time
from datetime import datetime
def results_producer(the_work, num_procs):
results = Manager().Queue()
ppool = Pool(num_procs)
multiplier = 3
#step = len(the_work)/(num_procs*multiplier)
step = 100
for i in xrange(0,len(the_work), step):
batch = the_work[i:i+step]
ppool.apply_async(do_work1, args=(i,batch,results))#,callback=results.put_nowait)
return (ppool, results)
def results_consumer(results, total_work, num_procs, pool=None):
current = 0
batch_size=10
total = total_work
est_remaining = 0
while current < total_work:
size = results.qsize()
est_remaining = total_work - (current + size)
if current % 1000 == 0:
print 'Attempting to retrieve item from queue that is empty? %s, with size: %d and remaining work: %d' % (results.empty(), size, est_remaining)
item = results.get()
results.task_done()
current += 1
if current % batch_size == 0 or total_work - current < batch_size:
if pool is not None and est_remaining == 0 and size/num_procs > batch_size:
pool.apply_async(do_work2, args=(current, item, True))
else:
do_work2(current,item, False)
if current % 1000 == 0:
print 'Queue size: %d and remaining work: %d' % (size, est_remaining)
def do_work1(i, w, results):
time.sleep(.05)
if i % 1000 == 0:
print 'did work %d: from %d to %d' % (i,w[0], w[-1])
for j in w:
#create an increasing amount of work on the queue
results.put_nowait(range(j*2))
def do_work2(index, item, in_parallel):
time.sleep(1)
if index % 50 == 0:
print 'processed result %d with length %d in parallel %s' % (index, len(item), in_parallel)
if __name__ == "__main__":
num_workers = 2
start = datetime.now()
print 'Start: %s' % start
amount_work = 4000
the_work = [i for i in xrange(amount_work)]
ppool, results = results_producer(the_work, num_workers)
results_consumer(results, len(the_work), num_workers, ppool)
if ppool is not None:
ppool.close()
ppool.join()
print 'Took: %s time' % (datetime.now() - start)
And it deadlocks on the results.put_nowait call from do_work1 even though the queue is empty! Sometimes the code is able to put all the work on the queue but the results.get call from results_consumer blocks since it is apparently empty even though the work has not been consumed yet.
Additionally, I checked the programming guidelines: https://docs.python.org/2/library/multiprocessing.html and believe the above code conforms to them. Lastly, the problem in this post: Python multiprocessing.Queue deadlocks on put and get seems very similar and claims to be solved on Windows (I'm running this on Windows 8.1); however, the above code's blocking is not due to the parent process attempting to join the child process, since the logic follows the suggested answer there. Any suggestions about the cause of the deadlock and how to fix it? Also, in general, what is the best way to enable multiple producers to provide results for a consumer to process in Python?
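On the last, more general question (several producers feeding one consumer), one common pattern is plain Process workers sharing a multiprocessing.Queue, with each producer pushing a sentinel when it is done. The sketch below is only illustrative; the doubling stands in for real work and the names are made up:
from multiprocessing import Process, Queue

SENTINEL = None  # marker a producer sends when it has no more results

def producer(worker_id, items, results):
    for item in items:
        results.put((worker_id, item * 2))  # stand-in for do_work1-style output
    results.put(SENTINEL)

def consumer(results, num_producers):
    finished = 0
    processed = 0
    while finished < num_producers:
        msg = results.get()  # blocks until a result or a sentinel arrives
        if msg is SENTINEL:
            finished += 1
        else:
            processed += 1  # do_work2-style processing would happen here
    print('consumed %d results' % processed)

if __name__ == '__main__':
    num_producers = 2
    results = Queue()
    producers = [Process(target=producer, args=(i, range(2000), results))
                 for i in range(num_producers)]
    consume = Process(target=consumer, args=(results, num_producers))
    consume.start()
    for p in producers:
        p.start()
    for p in producers:
        p.join()
    consume.join()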
I have a python script which validates data fetched from some rows in a database and then logs the errors in a different table in the same database.
The script validates each row and marks it as validated and has_error = True/False depending on the validation outcome. This process is repeated for each row. With that, I thought I'd add some steroids by creating threads, such that the validation of each row is done by an independent thread, thus reducing the time it takes to validate a batch of rows.
To my surprise, I find that the threaded script is taking slightly longer than the non-threaded one. On average, to validate 1502 rows of data the non-threaded script takes 1.5 seconds while the threaded script takes 2.27 seconds. That might not be much, but ideally I'll be running through 2 million records at a go, so that time overhead will be significant. That, plus I would assume that threaded apps would finish faster! :-)
The two scripts clock the same time of about 0.01 seconds up to the point of creating threads. By this point the SQLAlchemy session is created and all the data to be validated and its relations, i.e. foreign keys etc., are fetched. From there though, the non-threaded script finishes faster. Below is my code.
1.0 Non-Threaded Script
#A lot of code goes above this to fetch the data that is passed on to the validator function
#However, the two scripts are the same up to this point in regards to time taken, so I didn't see the need to post them.
for lf_detail_id in load_file_detail_id:
params = lf_detail_id, load_file_id, entry_number[lf_detail_counter], \
data[lf_detail_counter], template_version[lf_counter], \
load_file_detail, error, dt_file, dt_columns
data_list.append(params)
lf_detail_counter += 1
no_of_records += 1
validator = Validate()
validator.validator(no_of_records, data_list)
record_counter += lf_detail_counter
data_list = None
no_of_records = 0
print("Validated '%s': seconds %s" %(filename[lf_counter], time.time()-file_start_time)) #print time it took to run'
#Mark the load file as validated
is_done = load_file.set_validation(load_file_id, True)
if is_done == False:
raise Exception ("Can't update load_file's is_validated parameter: ", lf_detail_id)
#Reset counters
lf_detail_counter = 0
lf_counter += 1
#Commit The Entire Transaction.
session.commit()
print("NoThread:Finished validating %s file(s) with %s record(s) in %s seconds\n" %(lf_counter, record_counter, time.time()- process_start_time))
1.1. Validation Function for Non-Threaded Script
class Validate():
has_error = None
def validator(self, loop_length, job):
'''Validate data'''
for row_counter in range(loop_length):
load_file_detail_id, load_file_id, entry_number, data, \
template_version, load_file_detail, error, dt_file, dt_columns = job[row_counter]
error_detail = ErrorLogDetail()
if data.strip() == "":
error_detail.errorlog = error
error_detail.load_file_detail_id = load_file_detail_id
error_detail.pos_row = entry_number
error_detail.pos_col = None
error_detail.value_provided = None
error_detail.column_name = None
error_detail.value_provided = None
error_detail.description = "error message 1"
session.add(error_detail)
error_detail = ErrorLogDetail()
self.has_error = True
self.set_validation(load_file_detail, load_file_detail_id, True, False)
continue
elif len(data) != int(dt_file.data_length):
error_detail.errorlog = error
error_detail.load_file_detail_id = load_file_detail_id
error_detail.pos_row = entry_number
error_detail.pos_col = None
error_detail.column_name = None
error_detail.value_provided = None
error_detail.description = "error message 2"
session.add(error_detail)
error_detail = ErrorLogDetail()
self.has_error = True
self.set_validation(load_file_detail, load_file_detail_id, True, False)
continue
else:
#Continue with extra validation
#If record passes all validation then mark it as haserror = False
if self.has_error == False:
self.set_validation(load_file_detail, load_file_detail_id, False, True)
else:
self.has_error = False
2.0 Threaded Script
#A lot of code goes above this to fetch the data that is passed on to the validator function
#However, the two scripts are the same up to this point in regards to time taken, so I didn't see the need to post them.
for lf_detail_id in load_file_detail_id:
params = lf_detail_id, load_file_id, entry_number[lf_detail_counter], \
data[lf_detail_counter], template_version[lf_counter], \
load_file_detail, error, dt_file, dt_columns
data_list.append(params)
lf_detail_counter += 1
queue_size += 1
if queue_size == THREAD_LIMIT:
myqueuing(queue_size, data_list)
queue_size = 0
#spawn a pool of threads, and pass them queue instance
if queue_size > 0:
myqueuing(queue_size, data_list)
#Keep record of rows processed
record_counter += lf_detail_counter
print("Validated '%s': seconds- %s " %(filename[lf_counter], time.time()-file_start_time)) #print time it took to run'
#Mark the load file as validated
is_done = load_file.set_validation(load_file_id, True)
if is_done == False:
raise Exception ("Can't update load_file's is_validated parameter: ", lf_detail_id)
#Commit The Entire Transaction.
session.commit()
#Reset counters
lf_detail_counter = 0
lf_counter += 1
data_list = None
queue_size = 0
print("HasThread:Finished loading %s file(s) with %s record(s) in %s seconds\n" %(lf_counter, record_counter, time.time()-process_start_time)) #print time it took to run'
2.1. Threaded Validation Function
THREAD_LIMIT = 50 # This is how many threads we want
jobs = queue.Queue() # This sets up the queue object to use 5 slots
singlelock = threading.Lock() # This is a lock so threads don't print through each other (and other reasons)
def myqueuing(queuesize, data):
'''Put the fetched data in a queue and instantiate threads to
process the queue'''
# Spawn the threads
is_valid_date("20131212", True) #Calling this here to avoid a bug in time.strptime() when threading
for x in range(queuesize):
# This is the thread class that we instantiate.
workerbee().start()
# Put stuff in queue
for i in range(queuesize):
# Block if the queue is full and wait up to 2 seconds, then raise a Queue Full error.
try:
jobs.put(data[i], block=True, timeout=2)
except:
singlelock.acquire()
print ("The queue is full !")
singlelock.release()
# Wait for the threads to finish
singlelock.acquire() # Acquire the lock so we can print
print ("Waiting for threads to finish.")
singlelock.release() # Release the lock
jobs.join() # This command waits for all threads to finish.
class workerbee(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
self.lock = threading.Lock()
self.has_error = False
def run(self):
#try:
job = jobs.get(True,1)
load_file_detail_id, load_file_id, entry_number, data, \
template_version, load_file_detail, error, dt_file, dt_columns = job
'''Validates the data.'''
error_detail = ErrorLogDetail()
#Again please note that this part is identical for both the non-threaded and the threaded script.
#After each pass on a record, the record is marked as validated and if has_error = True
if data.strip() == "":
error_detail.errorlog = error
error_detail.load_file_detail_id = load_file_detail_id
error_detail.pos_row = entry_number
error_detail.pos_col = None
error_detail.value_provided = None
error_detail.column_name = None
error_detail.value_provided = None
error_detail.description = "erro message1"
session.add(error_detail)
error_detail = ErrorLogDetail()
self.has_error = True
self.set_validation(load_file_detail, load_file_detail_id, True, True)
elif len(data) != int(dt_file.data_length):
error_detail.errorlog = error
error_detail.load_file_detail_id = load_file_detail_id
error_detail.pos_row = entry_number
error_detail.pos_col = None
error_detail.column_name = None
error_detail.value_provided = None
error_detail.description = "erro message2")
session.add(error_detail)
error_detail = ErrorLogDetail()
self.has_error = True
self.set_validation(load_file_detail, load_file_detail_id, True, True)
else:
#Continue with further validation - about 5 other validation checks
#If record passes all validation then mark it as haserror = False
if self.has_error == False:
self.set_validation(load_file_detail, load_file_detail_id, False, True)
else:
self.has_error = False
jobs.task_done() #For the script with threading the job is marked as done. Else this does not appear in the non-threaded script
3.0. Common function for setting validation in both threaded and non-threaded
def set_validation(self, load_file_detail, load_file_detail_id, has_error, can_be_loaded):
'''Mark the record as having been validated and whether has error = True or False'''
#print("haserror and canbeloaded ", has_error, can_be_loaded)
is_done = load_file_detail.set_validation_and_error(load_file_detail_id, True, has_error, can_be_loaded)
if is_done == False:
raise Exception ("Can't update load_file_detail's is_validated parameter: ", load_file_detail_id)
3.1. Actual SQLAlchemy session for saving the validation status
def set_validation_and_error(self, load_file_detail_id, is_validated, has_error, can_be_loaded):
result = session.execute('UPDATE load_file_detail SET is_validated=%s, has_error=%s, can_be_loaded=%s WHERE id=%s' \
%(is_validated, has_error, can_be_loaded, load_file_detail_id))
So, the fetching of data to be validated is the same, and both scripts take the same amount of time up to that point. The validation process is the same for both scripts and saving to the DB is the same, i.e. Sections 3.0 and 3.1 are shared by both scripts. The only difference is the validation with multiple threads. So I am thinking maybe there is something about multiple threads and SQLAlchemy that is making the app slower in threaded mode? Have I implemented the threaded function in the proper way? Either that, or threading is not suitable in this scenario. Suggestions welcome.
You should create a Queue for logging and add a "logger" thread; then you can remove the locks, and the code should be faster.
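A minimal sketch of that logger-thread idea (queue name and messages are illustrative):
import threading
try:
    import queue          # Python 3
except ImportError:
    import Queue as queue # Python 2

log_queue = queue.Queue()

def logger():
    # A single thread owns all printing/writing, so no locks are needed elsewhere.
    while True:
        message = log_queue.get()
        if message is None:   # sentinel: stop logging
            log_queue.task_done()
            break
        print(message)
        log_queue.task_done()

log_thread = threading.Thread(target=logger)
log_thread.daemon = True
log_thread.start()

# Worker threads just enqueue instead of locking around print():
log_queue.put("validated row 42")
log_queue.put(None)
log_queue.join()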
Also, create a DB connection in each thread to be able to fetch data in parallel.
Threads only parallelize C-library calls, because of the GIL.
To parallelize Python code you must use multiprocessing.
I wrote a test for you, describing how to process an iterable:
def produce_data(amount=100000, invalid=1, valid=10):
# produce_data = sql('request').getall()
import random
id = 0
data = [True]*valid + [False]*invalid
while id < amount:
id+=1
yield (id,random.choice(data))
def validate(row):
if row[1]:
time.sleep(0.001) #set valid sql request emulation.
return True
else:
time.sleep(0.001) #set invalid sql request emulation.
return False
def single():
for row in produce_data():
validate(row)
def targeted():
import threading
for row in produce_data():
threading.Thread(target=validate,args=(row,)).start()
Uley = 50
class Bee(object):
error=False
running = True
def __init__(self,queue,*args,**kwargs):
self.queue=queue #dont use any global variable!
# every bee must have unique db connection and session.
#self.session = db.connection().session()
# initialize it there.
return super(Bee,self).__init__(*args,**kwargs)
def run(self):
while self.running:
data=self.queue.get()
if data:
self.error = validate(data) # refactor it to self.validate(data) to be able to get cursor from self.session.
self.queue.task_done()
else:
self.queue.task_done()
break
#self.session.commit()
def treaded():
import threading,Queue
class TreadedBee(Bee,threading.Thread): pass
q = Queue.Queue()
for i in range(Uley): #bees started before data was provided.
bee=TreadedBee(q)
bee.daemon = True
bee.start()
for row in produce_data(): #you dont need to get all data to begin processing, at this place must be cursor of response.
q.put(row)
q.join()
for i in range(Uley):
q.put(None)
def forked():
from multiprocessing import Process,JoinableQueue
class ForkedBee(Bee,Process): pass
q = JoinableQueue()
for i in range(Uley):
bee=ForkedBee(q)
bee.start()
for row in produce_data():
q.put(row)
q.join()
#at this you need to kill zomBee -)
for i in range(Uley):
q.put(None)
q.close()
def pool():
from multiprocessing import Pool
pool = Pool(processes=Uley)
pool.map(validate,produce_data())
if __name__ == "__main__":
import time
s=time.time()
single()
print(time.time()-s) #109
s=time.time()
targeted()
print(time.time()-s) #6
s=time.time()
treaded()
print(time.time()-s) #12
s=time.time()
forked()
print(time.time()-s) #6
s=time.time()
pool()
print(time.time()-s) #4
test result:
$ python2 tgreads.py
109.779700994
5.84457302094
12.3814198971
5.97618508339
3.69856286049
targeted will flood the CPU and memory, and you can't provide individual connections to the DB; using a shared connection is not safe. If you want to go this way, you need to provide an output queue and implement a collector that communicates with the DB. pool is the shortest code and the fastest, but it is not friendly to initializing per-worker connections.
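If the pool variant is attractive but each worker needs its own DB connection, one hedged option is Pool's initializer hook, which runs once in every worker process; db_connect below is a placeholder for whatever connection factory the real code uses:
from multiprocessing import Pool

_conn = None  # each worker process gets its own private connection

def db_connect():
    # Placeholder for the real connection factory (e.g. an SQLAlchemy engine).
    return object()

def init_worker():
    global _conn
    _conn = db_connect()

def validate_row(row):
    # _conn is this worker's own connection; nothing is shared across processes.
    return bool(row[1])

def run(rows, workers=50):
    pool = Pool(processes=workers, initializer=init_worker)
    try:
        return pool.map(validate_row, rows)
    finally:
        pool.close()
        pool.join()

if __name__ == '__main__':
    sample = [(i, i % 2 == 0) for i in range(10)]
    print(run(sample))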