I have some code in Python and I want to run it with multiprocessing:
import multiprocessing as mp
from multiprocessing.sharedctypes import Value
import time
import math

resault_a = []
resault_b = []
resault_c = []

def make_calculation_one(numbers):
    for number in numbers:
        resault_a.append(math.sqrt(number**3))

def make_calculation_two(numbers):
    for number in numbers:
        resault_a.append(math.sqrt(number**4))

def make_calculation_three(numbers):
    for number in numbers:
        resault_c.append(math.sqrt(number**5))

number_list = list(range(1000000))

if __name__ == "__main__":
    mp.set_start_method("fork")
    p1 = mp.Process(target=make_calculation_one, args=(number_list))
    p2 = mp.Process(target=make_calculation_two, args=(number_list))
    p3 = mp.Process(target=make_calculation_three, args=(number_list))
    start = time.time()
    p1.start()
    p2.start()
    p3.start()
    end = time.time()
    print(end - start)
I got an empty array. Where is the problem?
I also got some errors:
"Process Process-1:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()"
How can I fix it?
Thanks.
There are several issues with your code:
The major problem is that the args argument to the Process initializer requires a tuple or list. You are specifying args=(number_list). The parentheses around number_list do not make this a tuple. Without the comma you just have a parenthesized expression, i.e. a list. So instead of passing a single argument that is a list, you are passing 1,000,000 arguments, while your "worker" functions only take 1 argument. You need: args=(number_list,).
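A quick way to see the difference (using a small stand-in list):

number_list = [1, 2, 3]
print(type((number_list)))    # <class 'list'>  - just a parenthesized expression
print(type((number_list,)))   # <class 'tuple'> - the trailing comma makes a one-element tuple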
Your worker functions are doing calculations but neither printing nor returning the results of these calculations. Assuming you want to return the results, you need a mechanism for doing so. If you are using multiprocessing.Process then the usual solution is to pass to the worker function a multiprocessing.Queue instance to which the worker function can put the results (see below). You can also use a multiprocessing pool (also see below).
Your timing is not quite right. You have started the child processes and immediately set end without waiting for the tasks to complete. To get the actual time, end should only be set when the child processes have finished creating their results.
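As a rough illustration of the timing fix, here is a minimal, self-contained sketch with a trivial stand-in worker (the sleep just stands in for a real calculation):

import time
import multiprocessing as mp

def work():
    time.sleep(1)   # stand-in for a real calculation

if __name__ == "__main__":
    procs = [mp.Process(target=work) for _ in range(3)]
    start = time.time()
    for p in procs:
        p.start()
    for p in procs:
        p.join()            # wait for the workers to finish
    end = time.time()       # only now is the elapsed time meaningful
    print(end - start)      # roughly 1 second, not ~0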
Using Process with queues
import multiprocessing as mp
import time
import math

def make_calculation_one(numbers, out_q):
    out_q.put([math.sqrt(number**3) for number in numbers])

def make_calculation_two(numbers, out_q):
    out_q.put([math.sqrt(number**4) for number in numbers])

def make_calculation_three(numbers, out_q):
    out_q.put([math.sqrt(number**5) for number in numbers])

if __name__ == "__main__":
    # We only want one copy of `number_list`, i.e. in our main process.
    # But there is actually no need to convert to a list:
    number_list = range(1000000)
    mp.set_start_method("fork")
    out_q_1 = mp.Queue()
    out_q_2 = mp.Queue()
    out_q_3 = mp.Queue()
    # Create the 3 processes:
    p1 = mp.Process(target=make_calculation_one, args=(number_list, out_q_1))
    p2 = mp.Process(target=make_calculation_two, args=(number_list, out_q_2))
    p3 = mp.Process(target=make_calculation_three, args=(number_list, out_q_3))
    start = time.time()
    p1.start()
    p2.start()
    p3.start()
    results = []
    # Get the return values; each get() blocks until that worker has put its result:
    results.append(out_q_1.get())
    results.append(out_q_2.get())
    results.append(out_q_3.get())
    end = time.time()
    p1.join()
    p2.join()
    p3.join()
    print(end - start)
Using a shared memory array to pass the number list and to return the results
import multiprocessing as mp
import time
import math

def make_calculation_one(numbers, results):
    for idx, number in enumerate(numbers):
        results[idx] = math.sqrt(number**3)

def make_calculation_two(numbers, results):
    for idx, number in enumerate(numbers):
        results[idx] = math.sqrt(number**4)

def make_calculation_three(numbers, results):
    for idx, number in enumerate(numbers):
        results[idx] = math.sqrt(number**5)

if __name__ == "__main__":
    # We only want one copy of `number_list`, i.e. in our main process
    number_list = mp.RawArray('d', range(1000000))
    mp.set_start_method("fork")
    results_1 = mp.RawArray('d', len(number_list))
    results_2 = mp.RawArray('d', len(number_list))
    results_3 = mp.RawArray('d', len(number_list))
    # Create the 3 processes:
    p1 = mp.Process(target=make_calculation_one, args=(number_list, results_1))
    p2 = mp.Process(target=make_calculation_two, args=(number_list, results_2))
    p3 = mp.Process(target=make_calculation_three, args=(number_list, results_3))
    start = time.time()
    p1.start()
    p2.start()
    p3.start()
    p1.join()
    p2.join()
    p3.join()
    end = time.time()
    print(end - start)
Using a multiprocessing pool
import multiprocessing as mp
import time
import math

def make_calculation_one(numbers):
    return [math.sqrt(number**3) for number in numbers]

def make_calculation_two(numbers):
    return [math.sqrt(number**4) for number in numbers]

def make_calculation_three(numbers):
    return [math.sqrt(number**5) for number in numbers]

if __name__ == "__main__":
    # We only want one copy of `number_list`, i.e. in our main process
    number_list = range(1000000)
    mp.set_start_method("fork")
    # Create pool of size 3:
    pool = mp.Pool(3)
    start = time.time()
    async_results = []
    async_results.append(pool.apply_async(make_calculation_one, args=(number_list,)))
    async_results.append(pool.apply_async(make_calculation_two, args=(number_list,)))
    async_results.append(pool.apply_async(make_calculation_three, args=(number_list,)))
    # Now wait for results:
    results = [async_result.get() for async_result in async_results]
    end = time.time()
    pool.close()
    pool.join()
    print(end - start)
Conclusion
Since your calculations yield a type readily supported by shared memory, the second code example above should result in the best performance. You could also adapt the multiprocessing pool example to use shared memory.
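For instance, here is a rough sketch of such an adaptation. It assumes the "fork" start method (as in the examples above) so the pool workers simply inherit the shared arrays instead of having them pickled; each task tuple carries only an exponent and an index into the results list.

import math
import multiprocessing as mp
import time

def make_calculation(task):
    # `number_list` and `results` are inherited from the parent via fork,
    # so only the small (exponent, index) tuple is sent to the worker.
    exponent, idx = task
    res = results[idx]
    for i, number in enumerate(number_list):
        res[i] = math.sqrt(number ** exponent)

if __name__ == "__main__":
    mp.set_start_method("fork")
    number_list = mp.RawArray('d', range(1000000))
    results = [mp.RawArray('d', len(number_list)) for _ in range(3)]
    start = time.time()
    with mp.Pool(3) as pool:
        pool.map(make_calculation, [(3, 0), (4, 1), (5, 2)])
    end = time.time()
    print(end - start)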
I'm getting a different error:
Process Process-1:
Traceback (most recent call last):
File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
TypeError: make_calculation_one() takes 1 positional argument but 1000000 were given
but if I change these lines as follows, then it works:
p1 = mp.Process(target=make_calculation_one, args=([number_list]))
p2 = mp.Process(target=make_calculation_two, args=([number_list]))
p3 = mp.Process(target=make_calculation_three, args=([number_list]))
The function that is run in a worker Process cannot access data in the parent process.
If the "fork" start method is used, it would have access to the copy of that data in the forked process.
But modifying that would not alter the value in the parent process.
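A tiny demonstration of that point (a sketch, assuming the "fork" start method is available, as on Linux/macOS):

import multiprocessing as mp

data = []

def worker():
    data.append(1)                  # modifies the forked child's copy only
    print("child sees:", data)      # -> child sees: [1]

if __name__ == "__main__":
    mp.set_start_method("fork")
    p = mp.Process(target=worker)
    p.start()
    p.join()
    print("parent sees:", data)     # -> parent sees: []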
In this case, the easiest thing to do is to create a multiprocessing.Array and pass that to the process to use:
import math
import multiprocessing as mp

def make_calculation_one(numbers, res):
    for idx, number in enumerate(numbers):
        res[idx] = math.sqrt(number**3)

number_list = list(range(10000))

if __name__ == "__main__":
    result_a = mp.Array("d", len(number_list))
    p1 = mp.Process(target=make_calculation_one, args=(number_list, result_a))
    p1.start()
    p1.join()
    print(sum(result_a))
This code prints the value 3999500012.4745193.
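As a quick sanity check, the same value can be reproduced with a plain serial computation:

import math
print(sum(math.sqrt(n ** 3) for n in range(10000)))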
Related
I have a multidimensional array which needs to be processed by an imported function. (I am using a Jupyter notebook, so I exported the function to another notebook and imported it again.) The function takes a 1-dimensional array as its argument.
#Function
def calculatespi(datagrid, q):
    date_time = datagrid['time'][:]
    gridvalue = datagrid.values
    if np.isnan(np.sum(gridvalue)) != True:
        df_precip = pd.DataFrame({"Date": date_time, "precip": gridvalue})
        spi_prc = spi.SPI()
        spi3_grid = spi_prc.calculate(df_precip, 'Date', 'precip', freq='M', scale=3, fit_type="lmom", dist_type="gam")
        spi3 = spi3_grid['precip_scale_3_calculated_index'].values
    else:
        spi3 = np.empty((489))
        spi3[:] = np.nan
    q.put(spi3)

#Main Notebook
if __name__ == "__main__":
    spipi = []
    processes = []
    for x in range(3):
        for y in range(3):
            q = multiprocessing.Queue()
            p = multiprocessing.Process(target=calculatespi, args=(prcoba[:, x, y], q))
            p.start()
            processes.append(p)
            spipi.append(q.get())
    for process in processes:
        process.join()
After hundreds of attempts, I can finally retrieve the results, but it takes longer than running it without multiprocessing. What should I do?
Using concurrent.futures.ProcessPoolExecutor makes things much easier.
First, in calculatespi, replace q.put(spi3) with return spi3 and remove the q parameter. Then the "main" code can be written as
#Main Notebook
if __name__ == "__main__":
    from concurrent.futures import ProcessPoolExecutor

    args = []
    for x in range(3):
        for y in range(3):
            args.append(prcoba[:, x, y])

    with ProcessPoolExecutor() as executor:
        spipi = list(executor.map(calculatespi, args))
The executor takes care of everything else.
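If it helps, here is a minimal, self-contained sketch of the same pattern with a stand-in worker: sum_column is hypothetical and only stands in for calculatespi, and a small dummy array replaces prcoba.

from concurrent.futures import ProcessPoolExecutor

import numpy as np

def sum_column(column):
    # stand-in for calculatespi: takes a 1-D array and returns one result
    return float(np.sum(column))

if __name__ == "__main__":
    prcoba = np.arange(27.0).reshape(3, 3, 3)      # dummy data
    args = [prcoba[:, x, y] for x in range(3) for y in range(3)]
    with ProcessPoolExecutor() as executor:
        results = list(executor.map(sum_column, args))
    print(results)   # results come back in the same order as args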
I am passing the keys and values of a dictionary for parallel processing:
if __name__ == "__main__":
    DATASETS = {
        "Dataset_1": data_preprocess.dataset_1,
        "Dataset_2": data_preprocess.dataset_2,
    }
    pool = mp.Pool(8)
    pool.starmap(main, zip(DATASETS.keys(), DATASETS.values()))
    pool.close()
    # As I am not joining any result and I am directly saving the output
    # to a CSV file from the main function, I did not use pool.join()
The main function
def main(dataset_name, generate_dataset):
    REGRESSORS = {
        "LinReg": LinearRegression(),
        "Lasso": Lasso(),
    }
    ROOT = Path(__file__).resolve().parent
    dataset_name = dataset_name
    generate_dataset = generate_dataset
    dfs = []
    for reg_name, regressor in REGRESSORS.items():
        df = function_calling(
            generate_dataset=generate_dataset,
            regressor=regressor,
            reg_name=reg_name,
        )
        print(df)
        dfs.append(df)
    df = pd.concat(dfs, axis=0, ignore_index=True)
    filename = dataset_name + "_result.csv"
    outfile = str(PATH) + "/" + filename
    df.to_csv(outfile)
I am getting an error AssertionError: daemonic processes are not allowed to have children.
Could you tell me why I am getting the error? How can I resolve this?
The simplest solution is to just create your own Process instances:
import multiprocessing as mp

def main(dataset_name, generate_dataset):
    print(dataset_name, generate_dataset, flush=True)
    ...  # etc.

if __name__ == "__main__":
    DATASETS = {
        "Dataset_1": 1,
        "Dataset_2": 2,
    }
    processes = [mp.Process(target=main, args=(k, v)) for k, v in DATASETS.items()]
    for process in processes:
        process.start()
    # wait for termination:
    for process in processes:
        process.join()
Prints:
Dataset_1 1
Dataset_2 2
The issue is this: suppose you had 8 CPU cores and DATASETS had 100 key/value pairs. You would be creating 100 processes. Assuming these processes were CPU-intensive, you could not expect more than 8 of them to really be doing anything productive, yet you would have incurred the CPU and storage overhead of creating all of those processes. But as long as the number of processes you create is not excessively greater than the number of CPU cores you have, and your function main does not need to return a value back to your main process, this should be OK.
There is also a way of implementing your own multiprocessing pool with these Process instances and a Queue instance, but that's a bit more complicated:
import multiprocessing as mp

def main(dataset_name, generate_dataset):
    print(dataset_name, generate_dataset, flush=True)
    ...  # etc.

def worker(queue):
    while True:
        arg = queue.get()
        if arg is None:
            # signal to terminate
            break
        # unpack
        dataset_name, generate_dataset = arg
        main(dataset_name, generate_dataset)

if __name__ == "__main__":
    DATASETS = {
        "Dataset_1": 1,
        "Dataset_2": 2,
    }
    queue = mp.Queue()
    items = list(DATASETS.items())
    for k, v in items:
        # put the arguments on the queue
        queue.put((k, v))
    # number of worker processes we will be using:
    n_processors = min(mp.cpu_count(), len(items))
    for _ in range(n_processors):
        # special sentinel value to tell a worker there is no more work: one per process
        queue.put(None)
    processes = [mp.Process(target=worker, args=(queue,)) for _ in range(n_processors)]
    for process in processes:
        process.start()
    for process in processes:
        process.join()
I have a function that hashes a bunch of strings to MD5, and inside of it I create a pool.
Main.py
# imports used by this snippet
import hashlib
import multiprocessing
from configparser import ConfigParser

config = ConfigParser()
config.read("config.ini")
possibleCharacters = "abcd"

def mapped_loop_digit(args):
    loop_digit(*args, is_pool=True)

def loop_digit(current_str, place, strings, hashes, is_outer=False, is_pool=False):
    if place == config.getint("string_creation", "length_for_new_process"):
        current_strings = list()
    for character in possibleCharacters:
        current_str[place] = character
        if is_outer and config.getboolean("development", "minor_logging"):
            print("Outer character maker at", possibleCharacters.index(character) + 1, "in", len(possibleCharacters))
        elif is_pool and config.getboolean("development", "pool_minor_logging"):
            print("Outest in pool character maker for process", multiprocessing.current_process()._identity[0],
                  "at", possibleCharacters.index(character) + 1, "in", len(possibleCharacters), "with character as",
                  str(character) + ". Current string is", current_str)
        if place == 0:
            string = "".join(_character for _character in current_str)
            hashes.append(hashlib.md5(string.encode()).hexdigest())
            strings.append(string)
        elif place == config.getint("string_creation", "length_for_new_process"):
            current_strings.append(current_str.copy())
        else:
            loop_digit(current_str, place - 1, strings, hashes)
    if place == config.getint("string_creation", "length_for_new_process"):
        args = list()
        print("Starting a new pool")
        for string in current_strings:
            args.append([string, place - 1, strings, hashes])
        with multiprocessing.Pool(processes=config.getint("string_creation", "processes")) as pool:
            pool.map(mapped_loop_digit, args)
            pool.close()
            pool.join()

manager = multiprocessing.Manager()
all_strings = manager.list("")
all_hashes = manager.list("")

loop_digit(["", "", "", ""], 4 - 1, all_strings, all_hashes, is_outer=True)
config.ini
[development]
minor_logging = 1
pool_minor_logging = 1
[string_creation]
processes = 3
length_for_new_process = 3
At the moment I have a list called current_strings that I append to in the middle of the program; then, at the end, I loop through it, build a list of argument lists, map that to a separate function, and run the original function again. Is there an easier way to do this, so that I can just submit work to the pool as I go instead of appending to the list?
If you create the Pool as

pool = multiprocessing.Pool(5)

without calling pool.close() and pool.join(), then you can use the pool many times in different places (in different functions).
If you use map_async() instead of map(), then you don't have to wait for those tasks to finish: you can submit more work with another map_async(), and the pool will manage all of the tasks together.
You can also use apply_async() to submit a single task to the existing pool.
Because map_async() and apply_async() don't wait for the tasks to finish, you have to wait for them yourself using wait() before the program exits:
it1 = pool.map_async(...)
it2 = pool.map_async(...)
it3 = pool.apply_async(...)
# ... code ...
it1.wait()
it2.wait()
it3.wait()
or you have to call both of these at the end:
pool.close()
pool.join()
If you don't do one of these, the program may exit before the tasks have finished, and that will terminate them.
Minimal working example
import multiprocessing
import time

def fun(number):
    for x in range(3):
        time.sleep(.2)
        print(number, 'loop:', x)

if __name__ == '__main__':
    pool = multiprocessing.Pool(2)

    print("map [1,2,3]")
    it1 = pool.map_async(fun, [1, 2, 3])

    print("map ['A', 'B', 'C']")
    it2 = pool.map_async(fun, ['A', 'B', 'C'])

    print("single work X")
    it3 = pool.apply_async(fun, 'X')

    print("single work Y")
    it4 = pool.apply_async(fun, 'Y')

    # wait for the end of the processes
    print('wait for the end of processes')
    #it1.wait()
    #it2.wait()
    #it3.wait()
    #it4.wait()
    pool.close()
    pool.join()

    print('exit')
I may be approaching this all wrong, but this is where I'm at. I have very large log files I'm trying to search, up to 30 GB in some cases. I'm writing a script to pull info and have been playing with multiprocessing to speed it up a bit. Right now I'm testing running two functions at the same time to search from the top and the bottom, which seems to work. I'm wondering if it's possible to stop one function once the other finds a result, so that if, say, the top function finds a result, they both stop. That way I can build it out as needed.
from file_read_backwards import FileReadBackwards
from multiprocessing import Process
import sys

z = "log.log"

#!/usr/bin/env python
rocket = 0

def top():
    target = "test"
    with open(z) as src:
        found = None
        for line in src:
            if len(line) == 0: break  # happens at end of file, then stop loop
            if target in line:
                found = line
                break
    print(found)

def bottom():
    target = "text"
    with FileReadBackwards(z) as src:
        found = None
        for line in src:
            if len(line) == 0: break  # happens at end of file, then stop loop
            if target in line:
                found = line
                break
    print(found)

if __name__ == '__main__':
    p1 = Process(target=top)
    p1.start()
    p2 = Process(target=bottom)
    p2.start()
Here's a proof-of-concept of the approach I mentioned in the comments:
import os
import random
import sys
from multiprocessing import Process, Value

def search(proc_no, file_name, seek_to, max_size, find, flag):
    stop_at = seek_to + max_size

    with open(file_name) as f:
        if seek_to:
            f.seek(seek_to - 1)
            prev_char = f.read(1)
            if prev_char != '\n':
                # Landed in the middle of a line. Skip back one (or
                # maybe more) lines so this line isn't excluded. Start
                # by seeking back 256 bytes, then 512 if necessary, etc.
                exponent = 8
                pos = seek_to
                while pos >= seek_to:
                    pos = f.seek(max(0, pos - (2 ** exponent)))
                    f.readline()
                    pos = f.tell()
                    exponent += 1

        while True:
            if flag.value:
                break
            line = f.readline()
            if not line:
                break  # EOF
            data = line.strip()
            if data == find:
                flag.value = proc_no
                print(data)
                break
            if f.tell() > stop_at:
                break

if __name__ == '__main__':
    # list.txt contains lines with the numbers 1 to 1000001
    file_name = 'list.txt'
    info = os.stat(file_name)
    file_size = info.st_size

    if len(sys.argv) == 1:
        # Pick a random value from list.txt
        num_lines = 1000001
        choices = list(range(1, num_lines + 1))
        choices.append('XXX')
        find = str(random.choice(choices))
    else:
        find = sys.argv[1]

    num_procs = 4
    chunk_size, remainder = divmod(file_size, num_procs)
    max_size = chunk_size + remainder
    flag = Value('i', 0)
    procs = []

    print(f'Using {num_procs} processes to look for {find} in {file_name}')

    for i in range(num_procs):
        seek_to = i * chunk_size
        proc = Process(target=search, args=(i + 1, file_name, seek_to, max_size, find, flag))
        procs.append(proc)

    for proc in procs:
        proc.start()

    for proc in procs:
        proc.join()

    if flag.value:
        print(find, 'found by proc', flag.value)
    else:
        print(find, 'not found')
After reading various posts[1] about reading files with multiprocessing and multithreading, it seems that neither is a great approach due to potential disk thrashing and serialized reads. So here's a different, simpler approach that is way faster (at least for the file with a million lines I was trying it out on):
import mmap
import sys

def search_file(file_name, text, encoding='utf-8'):
    text = text.encode(encoding)
    with open(file_name, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
            index = m.find(text)
            if index > -1:
                # Found a match; now find beginning of line that
                # contains match so we can grab the whole line.
                while index > 0:
                    index -= 1
                    if m[index] == 10:  # ord('\n')
                        index += 1
                        break
                else:
                    index = 0
                m.seek(index)
                line = m.readline()
                return line.decode(encoding)

if __name__ == '__main__':
    file_name, search_string = sys.argv[1:]
    line = search_file(file_name, search_string)
    sys.stdout.write(line if line is not None else f'Not found in {file_name}: {search_string}\n')
I'm curious how this would perform with a 30GB log file.
[1] Including this one
Here is a simple example using a multiprocessing.Pool and a callback function. It terminates the remaining pool processes once a result has been returned. You could add an arbitrary number of processes searching from different offsets in the file using this approach.
import math
import time
from multiprocessing import Pool
from random import random

def search(pid, wait):
    """Sleep for wait seconds, return PID
    """
    time.sleep(wait)
    return pid

def done(result):
    """Do something with result and stop other processes
    """
    print("Process: %d done." % result)
    pool.terminate()
    print("Terminate Pool")

pool = Pool(2)
pool.apply_async(search, (1, math.ceil(random() * 3)), callback=done)
pool.apply_async(search, (2, math.ceil(random() * 3)), callback=done)

# do other stuff ...

# Wait for result
pool.close()
pool.join()  # block our main thread
This is essentially the same as Blurp's answer, but I shortened it and made it a bit more general. As you can see, top is effectively an infinite loop, but bottom stops it immediately.
from multiprocessing import Process, Value

# A plain Python global would not be shared between processes,
# so the flag has to be a shared Value.
val_not_found = Value('b', True)

def top(flag):
    i = 0
    while flag.value:
        i += 1

def bottom(flag):
    flag.value = False

if __name__ == '__main__':
    p1 = Process(target=top, args=(val_not_found,))
    p2 = Process(target=bottom, args=(val_not_found,))
    p1.start()
    p2.start()
I'm looking for a way to use multiprocessing to run scripts.
I have a function which launches 4 processes; each process executes a script through runpy.run_path() and I get the return value back.
Example:
def valorise(product, dico_valo):
    res = runpy.run_path(product + "/PyScript.py", run_name="__main__")
    dico_valo[product] = res["ret"]

def f(mutex, l, dico):
    while len(l) != 0:
        mutex.acquire()
        product = l.pop(0)
        mutex.release()
        p = Process(target=valorise, args=(product, dico))
        p.start()
        p.join()

def run_parallel_computations(valuationDate, list_scripts):
    if len(product_list) > 0:
        print '\n\nPARALLEL COMPUTATIONS BEGIN..........\n\n'
        manager = Manager()
        l = manager.list(list_scripts)
        dico = manager.dict()
        mutex = Lock()
        p1 = Process(target=f, args=(mutex, l, dico), name="script1")
        p2 = Process(target=f, args=(mutex, l, dico), name="script2")
        p3 = Process(target=f, args=(mutex, l, dico), name="script3")
        p4 = Process(target=f, args=(mutex, l, dico), name="script4")
        p1.start()
        p2.start()
        p3.start()
        p4.start()
        p1.join()
        p2.join()
        p3.join()
        p4.join()
        dico_isin = {}
        for i in iter(dico.keys()):
            dico_isin[i] = dico[i]
        return dico
        print '\n\nPARALLEL COMPUTATIONS END..........'
    else:
        print '\n\nNOTHING TO PRICE !'
Every PyScript.py imports a library, so each script has to import it again. However, in this case it doesn't work the way I want and I don't understand why: my library is imported once during the first process and the same "import" is reused in the other processes.
Could you help me?
Thank you!
This may or may not be what is happening with multiprocessing here, but it looks like it is.
When you try to import something more than once (e.g. import re in most of your modules), Python will not "re-import" it: it sees the module already in sys.modules and skips it.
To force reloading you can use importlib.reload(module) (in Python 2, the builtin reload()). Note that it cannot reload a single class or method imported from a module; you can only reload the whole module or nothing.
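A minimal sketch of what that looks like (mymodule is a hypothetical name standing in for your library):

import importlib

import mymodule          # hypothetical module; the first import runs its top-level code
import mymodule          # second import: a no-op, Python reuses sys.modules['mymodule']

mymodule = importlib.reload(mymodule)   # forces the module's top-level code to run again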