Python parallel list processing not working as expected

I am using this library for parallel list processing: https://github.com/npryce/python-parallelize (a Java-style fork/join implementation).
This code works as expected:
func = lambda x: x is not None
for i in parallelize([1, None, "3", "zsh"]):
    if func(i):
        print(i)
# Output: 1, "3", "zsh"
while this doesn't work:
func = lambda x: x is not None
src = []
for i in parallelize([1, None, "3", "zsh"]):
    if func(i):
        src.append(i)
print(src)  # Output: []
The library's code looks like this:
import sys
from itertools import islice
import os
import multiprocessing

def per_cpu(seq):
    cpu_count = multiprocessing.cpu_count()
    return (islice(seq, cpu, None, cpu_count) for cpu in range(cpu_count))

def parallelize(seq, fork=per_cpu):
    pids = []
    for slice in fork(seq):
        pid = os.fork()
        if pid == 0:
            yield from slice
            sys.exit(0)
        else:
            pids.append(pid)
    for pid in pids:
        os.waitpid(pid, 0)
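I suspect the difference is that parallelize is built on os.fork(), so each child appends to its own copy of src and those copies never make it back to the parent. A minimal check (independent of the library, POSIX only) seems to confirm that fork isolates memory:

import os

data = []
pid = os.fork()
if pid == 0:            # child: appends to its own copy-on-write copy of the list
    data.append("from child")
    os._exit(0)
os.waitpid(pid, 0)      # parent: wait for the child, then inspect the list
print(data)             # prints [] -- the child's append is not visible here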
Update: I tried using a shared list. It works, but multiprocessing raises an assertion error:
from multiprocessing import Manager

manager = Manager()
shared_list = manager.list()
func = lambda x: x is not None
for i in parallelize([1, None, "3", "zsh"]):
    if func(i):
        shared_list.append(i)
print(shared_list)
The error is:
File "/usr/lib/python3.8/multiprocessing/process.py", line 147, in join
assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process
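For comparison, a plain multiprocessing.Pool version (no parallelize, no Manager) gives me the filtered list back in the parent without trouble, presumably because Pool pickles the workers' return values back rather than relying on shared state:

from multiprocessing import Pool

def is_not_none(x):     # a named function, since lambdas don't pickle
    return x is not None

if __name__ == "__main__":
    data = [1, None, "3", "zsh"]
    with Pool() as pool:
        flags = pool.map(is_not_none, data)   # predicate runs in worker processes
    src = [x for x, keep in zip(data, flags) if keep]
    print(src)          # [1, '3', 'zsh']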
What am I doing wrong?
Thanks in advance!

Related

Python: store results of ProcessPoolExecutor

I'm very new to parallel processing with concurrent.futures. The code seems to work, but I am not sure how to store the result of each process, so that at the end I can mark the build as failed if any process's return value is non-zero.
I tried creating a list (exit_status) and appending the results to it, but that shows an IndexError. What can I do to get this right?
#!/usr/bin/env python3
import concurrent.futures
import sys
import shutil
import os
import glob
import multiprocessing as mp
import json
from os import path

def slave(path1, path2, target):
    os.makedirs(target)
    shutil.copy(path1, target)
    shutil.copy(path2, target)
    os.system(<Login command>)
    os.system(<Image creation command>)
    os.system(<Copy to Other slaves or NFS>)
    # If any one of the above operations or commands fails for any of the processes,
    # the script should return 1 at the end of the execution, i.e. fail the build at last.

def main():
    processed = {}
    exit_status = []
    with open('example.json', 'r') as f:
        data = json.load(f)
    for value in data.items():
        for line in value[1]:
            if line.endswith('.zip'):
                targz = line
            elif line.endswith('.yaml'):
                yaml = line
        processed[targz] = yaml
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for id, (path2, path1) in enumerate(processed.items(), 1):
            target = path.join("/tmp", "dir" + str(id))
            ret = executor.submit(slave, path1, path2, target)
            exit_status.append(ret.result())
    for i in exit_status:
        print("##########Result status: ", i)

if __name__ == "__main__":
    mp.set_start_method('spawn')
    main()
Output of the exit_status list:
##########Result status: None
##########Result status: None
Re: comments
If you want to get the result of a system call in order to act on the results of it, using subprocess.run is much more flexible and powerful than os.system. Additionally, if you actually want to perform the operations in parallel, you can't wait on result() after each task. Otherwise you're only ever doing one thing at a time. Better to submit all the tasks, and collect the Future objects. Then you can iterate over those and wait on each result() now that you've submitted all the work you want the executor to do.
import subprocess

def target_func(path1, path2, target):
    # ...
    # instead of os.system, use subprocess.run
    # you can inspect the stdout from the process
    complete_process = subprocess.run(<Login command>, text=True, capture_output=True)
    if "success" not in complete_process.stdout:
        return "uh-oh"

    # you can also just check the return code (0 typically means a clean exit)
    if subprocess.run(<Image creation command>).returncode != 0:
        return "uh-oh"

    # or you can tell `run` to raise an error if the return code is non-zero
    try:
        subprocess.run(<Copy to Other slaves or NFS>, check=True)
    except subprocess.CalledProcessError:
        return "uh-oh"

    return "we did it!"

def main():
    # ...
    # ...
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for id, (path2, path1) in enumerate(processed.items(), 1):
            target = path.join("/tmp", "dir" + str(id))
            ret = executor.submit(target_func, path1, path2, target)
            exit_status.append(ret)
    for i in exit_status:
        print("##########Result status: ", i.result())

Python ProcessPoolExecutor does not work when inside a function

Python's ProcessPoolExecutor works when run from the command line, but does not run after being moved into a function.
It works like this:
from concurrent import futures

def multi_process(func, paras, threads):
    with futures.ProcessPoolExecutor(max_workers=threads) as pool:
        res = pool.map(func, paras, chunksize=threads)
    return list(res)

p = multi_process(func, paras, threads)
but it does not work at all like this:
def upper(paras, threads):
    def func(para):
        ...  # some function

    def multi_process(func, paras, threads):
        with futures.ProcessPoolExecutor(max_workers=threads) as pool:
            res = pool.map(func, paras, chunksize=threads)
        return list(res)

    p = multi_process(func, paras, threads)
    return p

p = upper(paras, threads)
There is no warning or error; it just hangs with no response for a long time.
You do get an error. It's:
AttributeError: Can't pickle local object 'upper.<locals>.func'.
The reason is that for multiprocessing to work, the function needs to be defined at the global (module) level.
To achieve what you want, you can do the following:
from concurrent import futures

# Has to be a global (module-level) function
def func(para):
    print(para)

def upper(paras, threads):
    # This cannot be a local function.
    # def func(para):
    #     print(para)

    def multi_process(func, paras, threads):
        with futures.ProcessPoolExecutor(max_workers=threads) as pool:
            res = pool.map(func, paras, chunksize=threads)
        return list(res)

    p = multi_process(func, paras, threads)
    return p

if __name__ == "__main__":  # guard the entry point so spawned workers can re-import the module safely
    paras = [1, 2, 3]
    threads = 3
    p = upper(paras, threads)
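If the reason for nesting func inside upper() was to bind extra parameters per call, one workaround (a sketch, not part of the original answer) is functools.partial over a module-level function; partial objects pickle fine as long as their pieces do:

from concurrent import futures
from functools import partial

def func(offset, para):          # module-level, so it can be pickled
    return para + offset

def multi_process(fn, paras, threads):
    with futures.ProcessPoolExecutor(max_workers=threads) as pool:
        res = pool.map(fn, paras, chunksize=threads)
    return list(res)

if __name__ == "__main__":
    paras = [1, 2, 3]
    threads = 3
    # bind the extra parameter here instead of defining a local function
    print(multi_process(partial(func, 10), paras, threads))   # [11, 12, 13]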

TypeError: 'MapResult' object is not iterable using pathos.multiprocessing

I'm running a spell correction function on a dataset I have. I used from pathos.multiprocessing import ProcessingPool as Pool to do the job. Once the processing is done, I'd like to actually access the results. Here is my code:
import codecs
import nltk
from textblob import TextBlob
from nltk.tokenize import sent_tokenize
from pathos.multiprocessing import ProcessingPool as Pool

class SpellCorrect():

    def load_data(self, path_1):
        with codecs.open(path_1, "r", "utf-8") as file:
            data = file.read()
        return sent_tokenize(data)

    def correct_spelling(self, data):
        data = TextBlob(data)
        return str(data.correct())

    def run_clean(self, path_1):
        pool = Pool()
        data = self.load_data(path_1)
        return pool.amap(self.correct_spelling, data)

if __name__ == "__main__":
    path_1 = "../Data/training_data/training_corpus.txt"
    SpellCorrect = SpellCorrect()
    result = SpellCorrect.run_clean(path_1)
    print(result)
    result = " ".join(temp for temp in result)
    with codecs.open("../Data/training_data/training_data_spell_corrected.txt", "a", "utf-8") as file:
        file.write(result)
If you look at the main block, when I do print(result) I get an object of type <multiprocess.pool.MapResult object at 0x1a25519f28>.
I try to access the results with result = " ".join(temp for temp in result), but then I get the following error: TypeError: 'MapResult' object is not iterable. I've tried casting it to a list with list(result), but I still get the same error. What can I do to fix this?
The multiprocess.pool.MapResult object is not iterable, as it inherits from AsyncResult and has only the following methods:
wait([timeout]) Wait until the result is available or until timeout seconds pass. This method always returns None.
ready() Return whether the call has completed.
successful() Return whether the call completed without raising an exception. Will raise AssertionError if the result is not ready.
get([timeout]) Return the result when it arrives. If timeout is not None and the result does not arrive within timeout seconds, then TimeoutError is raised. If the remote call raised an exception, then that exception will be reraised as a RemoteError by get().
You can check the examples how to use the get() function here:
https://docs.python.org/2/library/multiprocessing.html#using-a-pool-of-workers
from multiprocessing import Pool, TimeoutError
import time
import os

def f(x):
    return x*x

if __name__ == '__main__':
    pool = Pool(processes=4)              # start 4 worker processes

    # print "[0, 1, 4,..., 81]"
    print pool.map(f, range(10))

    # print same numbers in arbitrary order
    for i in pool.imap_unordered(f, range(10)):
        print i

    # evaluate "f(20)" asynchronously
    res = pool.apply_async(f, (20,))      # runs in *only* one process
    print res.get(timeout=1)              # prints "400"

    # evaluate "os.getpid()" asynchronously
    res = pool.apply_async(os.getpid, ()) # runs in *only* one process
    print res.get(timeout=1)              # prints the PID of that process

    # launching multiple evaluations asynchronously *may* use more processes
    multiple_results = [pool.apply_async(os.getpid, ()) for i in range(4)]
    print [res.get(timeout=1) for res in multiple_results]

    # make a single worker sleep for 10 secs
    res = pool.apply_async(time.sleep, (10,))
    try:
        print res.get(timeout=1)
    except TimeoutError:
        print "We lacked patience and got a multiprocessing.TimeoutError"

Log process output to separate log file

I have the following parallel_executor.py module, which I use to run several processes simultaneously:
import time
from multiprocessing import Process

class ParallelExecutor(object):

    def __init__(self, pool_size=10):
        self._pool_size = pool_size
        self._processes = []
        self._results = []

    def add_task(self, target, args=None, kwargs=None):
        args = [] if not args else args
        kwargs = {} if not kwargs else kwargs
        index = len(self._processes)
        process_args = (index, target, args, kwargs)
        process = Process(target=self._executor, args=process_args)
        self._processes.append(process)
        result = {'result': None, 'end_time': 0, 'completed': False}
        self._results.append(result)
        return index

    def run(self, block=True):
        if not block:
            for process in self._processes:
                process.start()
            return None
        else:
            counter = 0
            processes = []
            for process in self._processes:
                processes.append(process)
                process.start()
                if counter >= self._pool_size:
                    # Wait for completion and reset counters.
                    for i in range(len(processes)):
                        processes[i].join()
                    processes = []
                    counter = 0
                    continue
                counter += 1
            # Wait for the left over processes to complete.
            if len(processes) > 0:
                for i in range(len(processes)):
                    processes[i].join()
            return self._results

    def _executor(self, index, target, args, kwargs):
        try:
            self._results[index]['result'] = target(*args, **kwargs)
            self._results[index]['end_time'] = int(round((time.time())))
            self._results[index]['completed'] = True
        except Exception as exc:
            self._results[index]['exception'] = exc
            self._results[index]['completed'] = True
            raise
And I use it as follows (example.py):
from framework.lib.parallel_executor import ParallelExecutor
import time
import os

def foo(x):
    for i in range(3):
        print x
        time.sleep(0.5)
    return 123

def main():
    runner = ParallelExecutor()
    runner.add_task(foo, ["This"])
    runner.add_task(foo, ["is"])
    runner.add_task(foo, ["a"])
    runner.add_task(foo, ["test"])
    runner.run()
    runner.wait_for_executor_to_finish()
    for i in runner.get_results():
        print i

main()
My question is: how do I print the process ID with every statement of foo that is written to the output, by making changes only to the parallel_executor.py module and not touching example.py, so that later I can grep the output of a particular process?
You can't do it without modifying the example at all, but you can achieve what you want with a very small modification.
Using the Python logging facilities, you can set a default log format that ensures every log line includes the process ID.
In the parallel_executor.py add the following:
import logging
log_format = "%(process)d: %(message)s"
logging.basicConfig(level=logging.INFO, format=log_format)
In the example replace the line:
print x
with:
logging.info(x)
And you will see your messages appearing as:
34321: message content here
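If, as the title suggests, each process should end up in its own log file rather than one interleaved stream, a further variation (a sketch, not part of the answer above) is to attach a per-process FileHandler from inside the worker; in the ParallelExecutor design this could live in _executor, so example.py still only needs the print-to-logging switch already described:

import logging
import os
from multiprocessing import Process

def _executor(index, target, args, kwargs):
    # Each worker process writes to its own file, named after its PID,
    # so the output of a single process can be inspected (or grepped) directly.
    handler = logging.FileHandler("worker_%d.log" % os.getpid())
    handler.setFormatter(logging.Formatter("%(process)d: %(message)s"))
    root = logging.getLogger()
    root.addHandler(handler)
    root.setLevel(logging.INFO)
    target(*args, **kwargs)

def foo(x):
    logging.info(x)                     # foo logs instead of printing

if __name__ == "__main__":
    procs = [Process(target=_executor, args=(i, foo, ["message %d" % i], {}))
             for i in range(3)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()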

Running same function for multiple files in parallel in python

I am trying to run a function in parallel for multiple files, and I want all of the calls to finish before a certain point.
For Example:
There is a loop
def main():
    for item in list:
        function_x(item)
    function_y(list)
What I want is for function_x to run in parallel for all items, but all of those calls must finish before function_y is called.
I am planning to use Celery for this, but cannot figure out how to do it.
Here is my final test code. All I needed to do was use the multiprocessing library.
from multiprocessing import Process
from time import sleep

Pros = []

def function_x(i):
    for j in range(0, 5):
        sleep(3)
        print i

def function_y():
    print "done"

def main():
    for i in range(0, 3):
        print "Thread Started"
        p = Process(target=function_x, args=(i,))
        Pros.append(p)
        p.start()

    # block until all the processes finish (i.e. block until all function_x calls finish)
    for t in Pros:
        t.join()
    function_y()
You can use threads for this. Thread.join is the function you need; it blocks until the thread has finished.
You can do this:
import threading

threads = []

def main():
    for item in list:
        t = threading.Thread(target=function_x, args=(item,))
        threads.append(t)
        t.start()

    # block until all the threads finish (i.e. until all function_x calls finish)
    for t in threads:
        t.join()
    function_y(list)
You can do this elegantly with Ray, which is a library for writing parallel and distributed Python.
Simply declare function_x with the @ray.remote decorator; it can then be executed in parallel by invoking function_x.remote, and the results can be retrieved with ray.get.
import ray
import time

ray.init()

@ray.remote
def function_x(item):
    time.sleep(1)
    return item

def function_y(list):
    pass

list = [1, 2, 3, 4]

# Process the items in parallel.
results = ray.get([function_x.remote(item) for item in list])

function_y(list)
View the Ray documentation.
Here is the documentation for celery groups, which is what I think you want. Use AsyncResult.get() instead of AsyncResult.ready() to block.
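For reference, a Celery group for this shape of problem might look roughly like the following (a sketch only; it assumes a configured Celery app, a running broker such as the Redis URL shown, and a started worker, none of which appear in the question):

from celery import Celery, group

app = Celery("tasks", broker="redis://localhost:6379/0")   # assumed broker URL

@app.task
def function_x(item):
    return item * item

def function_y(items):
    print("done with", items)

def main(items):
    job = group(function_x.s(item) for item in items)
    # get() blocks until every function_x task has finished,
    # which is what guarantees function_y only runs afterwards
    results = job.apply_async().get()
    function_y(items)
    return results

Without a broker, the same submit-then-wait pattern can be written with concurrent.futures, as in the example below.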
#!/bin/env python
import concurrent.futures

def function_x(item):
    return item * item

def function_y(lst):
    return [x * x for x in lst]

a_list = range(10)

if __name__ == '__main__':
    with concurrent.futures.ThreadPoolExecutor(10) as tp:
        future_to_function_x = {
            tp.submit(function_x, item): item
            for item in a_list
        }

        results = {}
        for future in concurrent.futures.as_completed(future_to_function_x):
            item = future_to_function_x[future]
            try:
                res = future.result()
            except Exception as e:
                print('Exception when processing item "%s": %s' % (item, e))
            else:
                results[item] = res

    print('results:', results)
    after = function_y(results.values())
    print('after:', after)
Output:
results: {0: 0, 1: 1, 2: 4, 3: 9, 4: 16, 5: 25, 6: 36, 7: 49, 8: 64, 9: 81}
after: [0, 1, 16, 81, 256, 625, 1296, 2401, 4096, 6561]
