Multiprocessing append to pool without map - python

I have a function that hashes a bunch of strings with MD5, and inside it I create a pool.
Main.py
config = ConfigParser()
config.read("config.ini")
possibleCharacters = "abcd"
def mapped_loop_digit(args):
loop_digit(*args, is_pool=True)
def loop_digit(current_str, place, strings, hashes, is_outer=False, is_pool=False):
if place == config.getint("string_creation", "length_for_new_process"):
current_strings = list()
for character in possibleCharacters:
current_str[place] = character
if is_outer and config.getboolean("development", "minor_logging"):
print("Outer character maker at", possibleCharacters.index(character) + 1, "in", len(possibleCharacters))
elif is_pool and config.getboolean("development", "pool_minor_logging"):
print("Outest in pool character maker for process", multiprocessing.current_process()._identity[0],
"at", possibleCharacters.index(character) + 1, "in", len(possibleCharacters), "with character as",
str(character) + ". Current string is", current_str)
if place == 0:
string = "".join(_character for _character in current_str)
hashes.append(hashlib.md5(string.encode()).hexdigest())
strings.append(string)
elif place == config.getint("string_creation", "length_for_new_process"):
current_strings.append(current_str.copy())
else:
loop_digit(current_str, place - 1, strings, hashes)
if place == config.getint("string_creation", "length_for_new_process"):
args = list()
print("Starting a new pool")
for string in current_strings:
args.append([string, place - 1, strings, hashes])
with multiprocessing.Pool(processes=config.getint("string_creation", "processes")) as pool:
pool.map(mapped_loop_digit, args)
pool.close()
pool.join()
manager = multiprocessing.Manager()
all_strings = manager.list("")
all_hashes = manager.list("")
loop_digit(["", "", "", ""], 4 - 1, all_strings, all_hashes, is_outer=True)
config.ini
[development]
minor_logging = 1
pool_minor_logging = 1
[string_creation]
processes = 3
length_for_new_process = 3
At the moment I have a list called current_strings that I append to in the middle of the program. At the end, I loop through it to build a list of arguments, map that list to a separate function, and that function runs the original function again. Is there an easier way to do this, so that I can just append to the pool instead of to the list?

If you create Pool as
pool = multiprocessing.Pool(5)
without calling pool.close() and pool.join(), then you can reuse the pool many times in different places (in different functions).
If you use map_async() instead of map(), you don't have to wait for the tasks to finish, and you can add more tasks with another map_async(); the pool will manage all of them together.
You can also use apply_async() to add a single task to an existing pool.
Because map_async() and apply_async() don't wait for the tasks to finish, you have to wait for them yourself using wait() before the program exits
it1 = pool.map_async(...)
it2 = pool.map_async(...)
it3 = pool.apply_async(...)
# ... code ...
it1.wait()
it2.wait()
it3.wait()
or you have to call both of these at the end
pool.close()
pool.join()
If you do neither, the program may exit before the processes have finished, which terminates them.
Minimal working example
import multiprocessing
import time
def fun(number):
    """Simulate a short task: print `number` three times, sleeping 0.2 s before each print."""
    for x in range(3):
        time.sleep(.2)
        print(number, 'loop:', x)
if __name__ == '__main__':
    # Queue several batches and single tasks on one shared pool of 2 workers.
    pool = multiprocessing.Pool(2)

    print("map [1,2,3]")
    it1 = pool.map_async(fun, [1, 2, 3])

    print("map ['A', 'B', 'C']")
    it2 = pool.map_async(fun, ['A', 'B', 'C'])

    # apply_async takes the positional arguments as a tuple; a bare string
    # is unpacked character by character and only works by accident for
    # one-character values.
    print("single work X")
    it3 = pool.apply_async(fun, ('X',))

    print("single work Y")
    it4 = pool.apply_async(fun, ('Y',))

    # wait for the end of processes
    print('wait for the end of processes')

    # Alternative to close()/join(): wait on each result individually.
    #it1.wait()
    #it2.wait()
    #it3.wait()
    #it4.wait()

    # close() stops accepting new work; join() blocks until the workers exit.
    pool.close()
    pool.join()

    print('exit')

Related

Multiprocessing a for loop - got errors

I have some code in Python and I wanna do it with multiprocessing
import multiprocessing as mp
from multiprocessing.sharedctypes import Value
import time
import math
resault_a = []
resault_b = []
resault_c = []
def make_calculation_one(numbers):
for number in numbers:
resault_a.append(math.sqrt(number**3))
def make_calculation_two(numbers):
for number in numbers:
resault_a.append(math.sqrt(number**4))
def make_calculation_three(numbers):
for number in numbers:
resault_c.append(math.sqrt(number**5))
number_list = list(range(1000000))
if __name__ == "__main__":
mp.set_start_method("fork")
p1 = mp.Process(target=make_calculation_one, args=(number_list))
p2 = mp.Process(target=make_calculation_two, args=(number_list))
p3 = mp.Process(target=make_calculation_three, args=(number_list))
start = time.time()
p1.start()
p2.start()
p3.start()
end = time.time()
print(end - start)
I got an empty array, where is the problem?
I got some errors:
"Process Process-1:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()"
How can I fix it?
TNX
There are several issues with your code:
The major problem is that the args argument to the Process initializer requires a tuple or list. You are specifying args=(number_list). The parentheses around number_list do not make this a tuple. Without the comma you just have a parenthesized expression, i.e. the list itself. So instead of passing a single argument that is a list, you are passing 1,000,000 arguments (one per list element), while your "worker" functions only take 1 argument. You need: args=(number_list,).
Your worker functions are doing calculations but neither printing nor returning the results of these calculations. Assuming you want to return the results, you need a mechanism for doing so. If you are using multiprocessing.Process then the usual solution is to pass to the worker function a multiprocessing.Queue instance to which the worker function can put the results (see below). You can also use a multiprocessing pool (also see below).
Your timing is not quite right. You have started the child processes and immediately set end without waiting for the tasks to complete. To get the actual time, end should only be set when the child processes have finished creating their results.
Using Process with queues
import multiprocessing as mp
import time
import math
def make_calculation_one(numbers, out_q):
out_q.put([math.sqrt(number**3) for number in numbers])
def make_calculation_two(numbers, out_q):
out_q.put([math.sqrt(number**4) for number in numbers])
def make_calculation_three(numbers, out_q):
out_q.put([math.sqrt(number**5) for number in numbers])
if __name__ == "__main__":
# We only want one copy of `number_list`, i.e. in our main process.
# But there is actually no need to convert to a list:
number_list = range(1000000)
mp.set_start_method("fork")
out_q_1 = mp.Queue()
out_q_2 = mp.Queue()
out_q_3 = mp.Queue()
# Create pool of size 3:
p1 = mp.Process(target=make_calculation_one, args=(number_list, out_q_1))
p2 = mp.Process(target=make_calculation_two, args=(number_list, out_q_2))
p3 = mp.Process(target=make_calculation_three, args=(number_list, out_q_3))
start = time.time()
p1.start()
p2.start()
p3.start()
results = []
# Get return values:
results.append(out_q_1.get())
results.append(out_q_2.get())
results.append(out_q_3.get())
end = time.time()
p1.join()
p2.join()
p3.join()
print(end - start)
Using a shared memory array to pass the number list and to return the results
import multiprocessing as mp
import time
import math
def make_calculation_one(numbers, results):
for idx, number in enumerate(numbers):
results[idx] = math.sqrt(number**3)
def make_calculation_two(numbers, results):
for idx, number in enumerate(numbers):
results[idx] = math.sqrt(number**4)
def make_calculation_three(numbers, results):
for idx, number in enumerate(numbers):
results[idx] = math.sqrt(number**5)
if __name__ == "__main__":
# We only want one copy of `number_list`, i.e. in our main process
number_list = mp.RawArray('d', range(1000000))
mp.set_start_method("fork")
results_1 = mp.RawArray('d', len(number_list))
results_2 = mp.RawArray('d', len(number_list))
results_3 = mp.RawArray('d', len(number_list))
# Create pool of size 3:
p1 = mp.Process(target=make_calculation_one, args=(number_list, results_1))
p2 = mp.Process(target=make_calculation_two, args=(number_list, results_2))
p3 = mp.Process(target=make_calculation_three, args=(number_list, results_3))
start = time.time()
p1.start()
p2.start()
p3.start()
p1.join()
p2.join()
p3.join()
end = time.time()
print(end - start)
Using a multiprocessing pool
import multiprocessing as mp
import time
import math
def make_calculation_one(numbers):
return [math.sqrt(number**3) for number in numbers]
def make_calculation_two(numbers):
return [math.sqrt(number**4) for number in numbers]
def make_calculation_three(numbers):
return [math.sqrt(number**5) for number in numbers]
if __name__ == "__main__":
# We only want one copy of `number_list`, i.e. in our main process
number_list = range(1000000)
mp.set_start_method("fork")
# Create pool of size 3:
pool = mp.Pool(3)
start = time.time()
async_results = []
async_results.append(pool.apply_async(make_calculation_one, args=(number_list,)))
async_results.append(pool.apply_async(make_calculation_two, args=(number_list,)))
async_results.append(pool.apply_async(make_calculation_three, args=(number_list,)))
# Now wait for results:
results = [async_result.get() for async_result in async_results]
end = time.time()
pool.close()
pool.join()
print(end - start)
Conclusion
Since your calculations yield a type readily supported by shared memory, the second code example above should result in the best performance. You could also adapt the multiprocessing pool example to use shared memory.
I'm getting some other error:
Process Process-1:
Traceback (most recent call last):
File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
TypeError: make_calculation_one() takes 1 positional argument but 1000000 were given
but if I change these lines as follows, then it works:
p1 = mp.Process(target=make_calculation_one, args=([number_list]))
p2 = mp.Process(target=make_calculation_two, args=([number_list]))
p3 = mp.Process(target=make_calculation_three, args=([number_list]))
The function that is run in a worker Process cannot access data in the parent process.
If the "fork" start method is used, it would have access to the copy of that data in the forked process.
But modifying that would not alter the value in the parent process.
In this case, the easiest thing to do is to create a multiprocessing.Array and pass that to the process to use.
import math
import multiprocessing as mp
def make_calculation_one(numbers, res):
for idx, number in enumerate(numbers):
res[idx] = math.sqrt(number**3)
number_list = list(range(10000))
if __name__ == "__main__":
result_a = mp.Array("d", len(number_list))
p1 = mp.Process(target=make_calculation_one, args=(number_list, result_a))
p1.start()
p1.join()
print(sum(result_a))
This code prints the value 3999500012.4745193.

how do I efficiently retrieve return value from function executed with multiprocessing?

I have multidimensional array which needs to be calculated with an imported function. (I am using jupyter notebook, so I exported the function to ipynb and imported it again) The function takes argument of 1 dimensional array.
#Function
def calculatespi(datagrid,q):
date_time = datagrid['time'][:]
gridvalue = datagrid.values
if np.isnan(np.sum(gridvalue)) != True:
df_precip = pd.DataFrame({"Date": date_time,"precip":gridvalue})
spi_prc = spi.SPI()
spi3_grid = spi_prc.calculate(df_precip, 'Date', 'precip', freq = 'M', scale = 3, fit_type ="lmom", dist_type="gam")
spi3 = spi3_grid['precip_scale_3_calculated_index'].values
else:
spi3 = np.empty((489))
spi3[:] = np.nan
q.put(spi3)
#Main Notebook
if name == "main":
spipi = []
processes = []
for x in range (3):
for y in range(3):
q = multiprocessing.Queue()
p = multiprocessing.Process(target=calculatespi, args= (prcoba[:,x,y],q))
p.start()
processes.append(p)
spipi.append(q.get())
for process in processes:
process.join()
After hundreds of attempts, I can finally retrieve the results, but it takes several times longer than running it without multiprocessing. What should I do?
Using concurrent.futures.ProcessPoolExecutor makes things much easier.
First, replace in calculatespi the q.put(spi3) by return spi3 and remove the q parameter. Then the "main" code can be written as
#Main Notebook
if __name__ == "__main__":
    from concurrent.futures import ProcessPoolExecutor

    # One slice of prcoba per (x, y) grid cell, in the same order as the
    # original nested loops (x outer, y inner).
    args = [prcoba[:, x, y] for x in range(3) for y in range(3)]

    # executor.map yields results in input order; the with-block shuts the
    # executor down once all results have been collected.
    with ProcessPoolExecutor() as executor:
        spipi = list(executor.map(calculatespi, args))
The executor takes care of everything else.

Why I am getting error at the time of parallel processing

I am passing the key and value of a dictionary for parallel processing
if __name__ == "__main__":
DATASETS = {
"Dataset_1": data_preprocess.dataset_1,
"Dataset_2": data_preprocess.dataset_2,}
pool = mp.Pool(8)
pool.starmap(main, zip(DATASETS.keys(), DATASETS.values()))
pool.close()
# As I am not joining any result and I am directly saving the output
# in CSV file from (main function) I did not used pool.join()
The main function
def main(dataset_name, generate_dataset):
REGRESSORS = {
"LinReg": LinearRegression(),
"Lasso": Lasso(),}
ROOT = Path(__file__).resolve().parent
dataset_name = dataset_name
generate_dataset = generate_dataset
dfs = []
for reg_name, regressor in REGRESSORS.items():
df = function_calling(
generate_dataset=generate_dataset,
regressor=regressor,
reg_name=reg_name,)
print(df)
dfs.append(df)
df = pd.concat(dfs, axis=0, ignore_index=True)
filename = dataset_name + "_result.csv"
outfile = str(PATH) + "/" + filename
df.to_csv(outfile)
I am getting an error AssertionError: daemonic processes are not allowed to have children.
Could you tell me why I am getting the error? How can I resolve this?
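Pool worker processes are daemonic, and a daemonic process is not allowed to create child processes; the error means that something called from main tries to start processes of its own. One way around this is to just create your own (non-daemonic) Process instances: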
import multiprocessing as mp
def main(dataset_name, generate_dataset):
    """Process one dataset; this stub only prints its two arguments."""
    print(dataset_name, generate_dataset, flush=True)
    ...  # etc.
if __name__ == "__main__":
    DATASETS = {
        "Dataset_1": 1,
        "Dataset_2": 2,
    }
    # One process per dataset.
    processes = [mp.Process(target=main, args=(k, v)) for k, v in DATASETS.items()]
    for process in processes:
        process.start()
    # wait for termination (join must be *called*; a bare `process.join`
    # is just an attribute lookup and waits for nothing):
    for process in processes:
        process.join()
Prints:
Dataset_1 1
Dataset_2 2
The catch: suppose you have 8 CPU cores and DATASETS has 100 key/value pairs. You would be creating 100 processes. Assuming these processes are CPU-intensive, you cannot expect more than 8 of them to be doing anything productive at any one time, yet you incur the CPU and memory overhead of creating all of them. But as long as the number of processes you create is not excessively greater than the number of CPU cores you have, and your function main does not need to return a value to your main process, this should be OK.
There is also a way of implementing your own multiprocessing pool with these Process instances and a Queue instance, but that's a bit more complicated:
import multiprocessing as mp
def main(dataset_name, generate_dataset):
    """Process one dataset; this stub only prints its two arguments."""
    print(dataset_name, generate_dataset, flush=True)
    ...  # etc.
def worker(queue):
    """Run main() on each (dataset_name, generate_dataset) pair taken from `queue`.

    Keeps calling queue.get() until it receives None, which is the signal
    to terminate.
    """
    # iter(callable, sentinel) calls queue.get() repeatedly and stops as
    # soon as it returns the None sentinel.
    for dataset_name, generate_dataset in iter(queue.get, None):
        main(dataset_name, generate_dataset)
if __name__ == "__main__":
    DATASETS = {
        "Dataset_1": 1,
        "Dataset_2": 2,
    }
    queue = mp.Queue()
    items = list(DATASETS.items())
    for k, v in items:
        # put the arguments on the queue
        queue.put((k, v))
    # number of processors we will be using:
    n_processors = min(mp.cpu_count(), len(items))
    for _ in range(n_processors):
        # special value to tell a worker there is no more work: one for each worker process
        queue.put(None)
    processes = [mp.Process(target=worker, args=(queue,)) for _ in range(n_processors)]
    for process in processes:
        process.start()
    # wait for termination (join must be *called*; a bare `process.join`
    # is just an attribute lookup and waits for nothing):
    for process in processes:
        process.join()

python multiprocessing to create an excel file with multiple sheets [duplicate]

I am new to Python and I am trying to save the results of five different processes to one excel file (each process write to a different sheet). I have read different posts here, but still can't get it done as I'm very confused about pool.map, queues, and locks, and I'm not sure what is required here to fulfill this task.
This is my code so far:
list_of_days = ["2017.03.20", "2017.03.21", "2017.03.22", "2017.03.23", "2017.03.24"]
results = pd.DataFrame()
if __name__ == '__main__':
global list_of_days
writer = pd.ExcelWriter('myfile.xlsx', engine='xlsxwriter')
nr_of_cores = multiprocessing.cpu_count()
l = multiprocessing.Lock()
pool = multiprocessing.Pool(processes=nr_of_cores, initializer=init, initargs=(l,))
pool.map(f, range(len(list_of_days)))
pool.close()
pool.join()
def init(l):
global lock
lock = l
def f(k):
global results
*** DO SOME STUFF HERE***
results = results[ *** finished pandas dataframe *** ]
lock.acquire()
results.to_excel(writer, sheet_name=list_of_days[k])
writer.save()
lock.release()
The result is that only one sheet gets created in excel (I assume it is the process finishing last). Some questions about this code:
How to avoid defining global variables?
Is it even possible to pass around dataframes?
Should I move the locking to main instead?
Really appreciate some input here, as I consider mastering multiprocessing as instrumental. Thanks
1) Why did you implement time.sleep in several places in your 2nd method?
In __main__, time.sleep(0.1) gives the newly started process a time slice to start up.
In f2(fq, q), it gives the queue a time slice to flush all buffered data to the pipe,
which is needed because fq.get_nowait() is used.
In w(q), it was only there for testing, to simulate a long-running writer.to_excel(...);
I removed this one.
2) What is the difference between pool.map and pool = [mp.Process( . )]?
Using pool.map needs no Queue and no extra parameters, so the code is shorter.
The worker function has to return its result and then finishes.
pool.map keeps handing out tasks to its worker processes until all iterations are done.
The results have to be processed after that.
Using pool = [mp.Process( . )] starts n processes.
Each process terminates on queue.Empty.
Can you think of a situation where you would prefer one method over the other?
Method 1: Quick setup, serialized, when you are only interested in the result in order to continue.
Method 2: If you want to do all of the workload in parallel.
You can't use a global writer in the processes.
The writer instance has to belong to one process.
Usage of mp.Pool, for instance:
def f1(k):
# *** DO SOME STUFF HERE***
results = pd.DataFrame(df_)
return results
if __name__ == '__main__':
pool = mp.Pool()
results = pool.map(f1, range(len(list_of_days)))
writer = pd.ExcelWriter('../test/myfile.xlsx', engine='xlsxwriter')
for k, result in enumerate(results):
result.to_excel(writer, sheet_name=list_of_days[k])
writer.save()
pool.close()
As a result, .to_excel(...) is called sequentially in the __main__ process.
If you want .to_excel(...) to run in parallel with the worker processes, you have to use mp.Queue().
For instance:
The worker process:
# mp.Queue exeptions have to load from
try:
# Python3
import queue
except:
# Python 2
import Queue as queue
def f2(fq, q):
while True:
try:
k = fq.get_nowait()
except queue.Empty:
exit(0)
# *** DO SOME STUFF HERE***
results = pd.DataFrame(df_)
q.put( (list_of_days[k], results) )
time.sleep(0.1)
The writer process:
def w(q):
writer = pd.ExcelWriter('myfile.xlsx', engine='xlsxwriter')
while True:
try:
titel, result = q.get()
except ValueError:
writer.save()
exit(0)
result.to_excel(writer, sheet_name=titel)
The __main__ process:
if __name__ == '__main__':
w_q = mp.Queue()
w_p = mp.Process(target=w, args=(w_q,))
w_p.start()
time.sleep(0.1)
f_q = mp.Queue()
for i in range(len(list_of_days)):
f_q.put(i)
pool = [mp.Process(target=f2, args=(f_q, w_q,)) for p in range(os.cpu_count())]
for p in pool:
p.start()
time.sleep(0.1)
for p in pool:
p.join()
w_q.put('STOP')
w_p.join()
Tested with Python:3.4.2 - pandas:0.19.2 - xlsxwriter:0.9.6

Multiprocessing hangs after several hundred jobs

I am trying to use this question for my file processing:
Python multiprocessing safely writing to a file
This is my modification of the code:
def listener(q):
'''listens for messages on the q, writes to file. '''
while 1:
reads = q.get()
if reads == 'kill':
#f.write('killed')
break
for read in reads:
out_bam.write(read)
out_bam.flush()
out_bam.close()
def fetch_reads(line, q):
parts = line[:-1].split('\t')
print(parts)
start,end = int(parts[1])-1,int(parts[2])-1
in_bam = pysam.AlignmentFile(args.bam, mode='rb')
fetched = in_bam.fetch(parts[0], start, end)
reads = [read for read in fetched if (read.cigarstring and read.pos >= start and read.pos < end and 'S' not in read.cigarstring)]
in_bam.close()
q.put(reads)
return reads
#must use Manager queue here, or will not work
manager = mp.Manager()
q = manager.Queue()
if not args.threads:
threads = 1
else:
threads = int(args.threads)
pool = mp.Pool(threads+1)
#put listener to work first
watcher = pool.apply_async(listener, (q,))
with open(args.bed,'r') as bed:
jobs = []
cnt = 0
for line in bed:
# Fire off the read fetchings
job = pool.apply_async(fetch_reads, (line, q))
jobs.append(job)
cnt += 1
if cnt > 10000:
break
# collect results from the workers through the pool result queue
for job in jobs:
job.get()
print('get')
#now we are done, kill the listener
q.put('kill')
pool.close()
The difference is that I am opening and closing the file inside the function, since otherwise I get unusual errors from bgzip.
At first, print(parts) and print('get') are printed alternately (more or less), then there are fewer and fewer prints of 'get'. Ultimately the code hangs and nothing more is printed (all the parts are printed, but 'get' simply doesn't print anymore). The output file remains zero bytes.
Can anyone lend a hand? Cheers!

Categories

Resources