Python ProcessPoolExecutor does not work when called inside a function - python

Python's ProcessPoolExecutor works when I run it from the command line, but it stops working after I move it into a function.
It works when written like this:
from concurrent import futures
def multi_process(func, paras, threads):
    """Apply ``func`` to every item of ``paras`` using a pool of worker processes.

    Args:
        func: Callable applied to each item. It is handed to the workers by
            pickling, so it has to be importable by name (defined at module
            top level); lambdas and nested functions will not work.
        paras: Iterable of arguments, one per call of ``func``.
        threads: Number of worker processes. The same number is also passed
            to ``map`` as ``chunksize``.

    Returns:
        A list with one result per item, in the order of ``paras``.
    """
    with futures.ProcessPoolExecutor(max_workers=threads) as pool:
        res = pool.map(func, paras, chunksize=threads)
        # Collect the results while the pool is still open so any exception
        # raised in a worker surfaces here.
        return list(res)
p = multi_process(func,paras,threads)
But it does not work at all when written as below:
def upper(paras,threads):
def func:
some func
def multi_process(func, paras, threads):
with futures.ProcessPoolExecutor(max_workers=threads) as pool:
res = pool.map(func, paras, chunksize=threads)
return list(res)
p = multi_process(func,paras,threads)
return p
p = upper(paras,threads)
There is no warning or error, but the call does not respond for a long time.

You do get an error. It is:
AttributeError: Can't pickle local object 'upper.<locals>.func'.
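The reason is that multiprocessing sends the function to the worker processes by pickling it, and pickle can only handle functions defined at the top (global) level of a module, not functions nested inside another function.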
To achieve what you want you can do the following:
from concurrent import futures
# Has to be a global function
def func(para):
    """Print ``para``; defined at module level so worker processes can unpickle it."""
    print(para)
def upper(paras, threads):
    """Run the module-level ``func`` over ``paras`` in ``threads`` worker processes.

    Args:
        paras: Iterable of arguments, one per call of ``func``.
        threads: Number of worker processes (also used as ``chunksize``).

    Returns:
        A list with the result of ``func`` for each item, in input order.
    """
    # ``func`` must NOT be defined inside this function: a local function
    # (e.g. ``def func(para): print(para)`` placed here) cannot be pickled
    # and therefore cannot be sent to the worker processes.

    def multi_process(func, paras, threads):
        # The helper itself may stay local; only the callable handed to
        # ``pool.map`` has to be picklable.
        with futures.ProcessPoolExecutor(max_workers=threads) as pool:
            res = pool.map(func, paras, chunksize=threads)
            return list(res)

    p = multi_process(func, paras, threads)
    return p
if __name__ == "__main__":
    # Guard the entry point: with the "spawn"/"forkserver" start methods the
    # worker processes re-import this module, and unguarded code would try to
    # start a new pool from every worker.
    paras = [1, 2, 3]
    threads = 3
    p = upper(paras, threads)

Related

Python parallel list processing not working like expected

I am using this library for parallel list processing: https://github.com/npryce/python-parallelize (Java fork-join like implementation)
This code works as expected:
func = lambda x: x is not None
for i in parallelize([1, None, "3", "zsh"]):
if func(i):
print(i)
# Output: 1, "3", "zsh"
while this doesn't work:
func = lambda x: x is not None
src = []
for i in parallelize([1, None, "3", "zsh"]):
if func(i):
src.append(i)
print(src) # Output: []
The used library code looks like that:
import sys
from itertools import islice
import os
import multiprocessing
def per_cpu(seq):
    """Split ``seq`` into one interleaved, lazily evaluated slice per CPU.

    With ``n = multiprocessing.cpu_count()``, slice ``k`` yields the items at
    positions ``k``, ``k + n``, ``k + 2n``, ... of ``seq``.

    Args:
        seq: The items to distribute. Every slice iterates over ``seq`` on
            its own, so pass a re-iterable container (list, tuple, ...)
            rather than a one-shot iterator.

    Returns:
        A generator producing ``n`` ``itertools.islice`` objects.
    """
    cpu_count = multiprocessing.cpu_count()
    return (islice(seq, cpu, None, cpu_count) for cpu in range(cpu_count))
def parallelize(seq, fork=per_cpu):
    """Yield the items of ``seq`` from forked child processes.

    For every slice produced by ``fork(seq)`` the current process is forked.
    The child yields the items of its slice to the caller's loop body and
    then exits; the parent records the child's pid and, once all children
    have been forked, waits for each of them.

    Because the loop body that consumes this generator runs inside the
    forked children, changes it makes to ordinary Python objects (for example
    appending to a list) happen in the child's memory and are not visible in
    the parent process.

    Args:
        seq: The items to process.
        fork: Callable that splits ``seq`` into an iterable of slices, one
            child process per slice.
    """
    pids = []
    for chunk in fork(seq):  # ``chunk`` rather than ``slice``: don't shadow the builtin
        pid = os.fork()
        if pid == 0:
            # Child process: hand out this slice, then terminate so the
            # child never continues past the caller's loop.
            yield from chunk
            sys.exit(0)
        pids.append(pid)
    # Parent process only: wait for every child to exit.
    for pid in pids:
        os.waitpid(pid, 0)
Update: I tried to use a shared list. It works but multiprocessing shows an assertion error
manager = Manager()
shared_list = manager.list()
func = lambda x: x is not None
for i in parallelize([1, None, "3", "zsh"]):
if func(i):
shared_list.append(i)
print(shared_list)
The error is:
File "/usr/lib/python3.8/multiprocessing/process.py", line 147, in join
assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process
What am I doing wrong?
Thanks in advance!

how to pass a dictionary to concurrent futures executor

I'm new to using concurrent futures and I cannot find any examples on how to do this. I have the global dictionary, data, that I want the function called by the concurrent futures executor to add results to. The function works but there is no output in data.
Thanks for any help,
T.
def estimate_shannon_entropy(dna_sequence):
    """Return the base-2 Shannon entropy of the symbols in ``dna_sequence``.

    Args:
        dna_sequence: Iterable of hashable symbols, e.g. a string of bases.

    Returns:
        The value of ``entropy(dist, base=2)`` for the relative frequency of
        each distinct symbol.
    """
    bases = collections.Counter(dna_sequence)
    # Compute the total once instead of once per distinct symbol.
    total = sum(bases.values())
    dist = [count / total for count in bases.values()]
    return entropy(dist, base=2)
def shan(i):
    """Average the Shannon entropy of the first ``samples`` records of a file.

    The file format is taken from the extension (the one before ``.gz`` for
    gzip-compressed files): a name ending in "a" is parsed as FASTA, one
    ending in "q" as FASTQ.

    Args:
        i: Path of the sequence file.

    Returns:
        The summed entropy of the sampled records divided by the global
        ``samples``. The value is also stored in the global ``data`` under
        the file's base name.

    Raises:
        ValueError: If the extension matches neither FASTA nor FASTQ.
    """
    name1 = i.split("/")[-1]
    parts = name1.split(".")
    ext1 = parts[-1]
    print(name1)
    if ext1 == "gz":
        opener, mode = gzip.open, 'rt'
        k = parts[-2]
    else:
        opener, mode = open, 'r'
        k = ext1  # the original read the undefined name ``ext`` here
    if k.endswith("a"):
        fmt = "fasta"
    elif k.endswith("q"):
        fmt = "fastq"
    else:
        raise ValueError("cannot tell FASTA from FASTQ for %r" % name1)
    shannon_total = 0
    # ``with`` closes the handle even if parsing fails.
    with opener(i, mode) as f:
        for c, x in enumerate(SeqIO.parse(f, fmt), start=1):
            if c > samples:
                # Enough records sampled; no need to parse the rest.
                break
            shannon_total += estimate_shannon_entropy(str(x.seq))
    ans = float(shannon_total / samples)
    # NOTE: inside a ProcessPoolExecutor worker this assignment only changes
    # the worker's own ``data``; use the return value (``future.result()``)
    # or a Manager dict to get the number back to the parent process.
    data[name1] = ans
    return ans
folder=sys.argv[1]
filelist=glob.glob(folder)
filelist.sort(key=tokenize)
#print(filelist)
samples=int(sys.argv[2])
threads=int(sys.argv[3])
global data
data={}
executor = concurrent.futures.ProcessPoolExecutor(threads)
futures = [executor.submit(shan, i) for i in filelist]
concurrent.futures.wait(futures)
print(data)
Ok, I found the answer, will leave here in case there are better methods (sure there are).
Used Manager:
from multiprocessing import Manager
manager=Manager()
data=manager.dict()
executor = concurrent.futures.ProcessPoolExecutor(threads)
futures = [executor.submit(shan, i,data) for i in filelist]
concurrent.futures.wait(futures)

TypeError: 'MapResult' object is not iterable using pathos.multiprocessing

I'm running a spell correction function on a dataset I have. I used from pathos.multiprocessing import ProcessingPool as Pool to do the job. Once the processing is done, I'd like to actually access the results. Here is my code:
import codecs
import nltk
from textblob import TextBlob
from nltk.tokenize import sent_tokenize
from pathos.multiprocessing import ProcessingPool as Pool
class SpellCorrect():
def load_data(self, path_1):
with codecs.open(path_1, "r", "utf-8") as file:
data = file.read()
return sent_tokenize(data)
def correct_spelling(self, data):
data = TextBlob(data)
return str(data.correct())
def run_clean(self, path_1):
pool = Pool()
data = self.load_data(path_1)
return pool.amap(self.correct_spelling, data)
if __name__ == "__main__":
path_1 = "../Data/training_data/training_corpus.txt"
SpellCorrect = SpellCorrect()
result = SpellCorrect.run_clean(path_1)
print(result)
result = " ".join(temp for temp in result)
with codecs.open("../Data/training_data/training_data_spell_corrected.txt", "a", "utf-8") as file:
file.write(result)
If you look at the main block, when I do print(result) I get an object of type <multiprocess.pool.MapResult object at 0x1a25519f28>.
I try to access the results with result = " ".join(temp for temp in result), but then I get the following error TypeError: 'MapResult' object is not iterable. I've tried typecasting it to a list list(result), but still the same error. What can I do to fix this?
The multiprocess.pool.MapResult object is not iterable as it is inherited from AsyncResult and has only the following methods:
wait([timeout])
Wait until the result is available or until timeout seconds pass. This method always returns None.
ready() Return whether the call has completed.
successful() Return whether the call completed without raising an
exception. Will raise AssertionError if the result is not ready.
get([timeout]) Return the result when it arrives. If timeout is not
None and the result does not arrive within timeout seconds then
TimeoutError is raised. If the remote call raised an exception then
that exception will be reraised as a RemoteError by get().
You can check the examples of how to use the get() function here (in your code, join the list returned by result.get(), i.e. result = " ".join(result.get())):
https://docs.python.org/2/library/multiprocessing.html#using-a-pool-of-workers
from multiprocessing import Pool, TimeoutError
import time
import os
def f(x):
return x*x
if __name__ == '__main__':
pool = Pool(processes=4) # start 4 worker processes
# print "[0, 1, 4,..., 81]"
print pool.map(f, range(10))
# print same numbers in arbitrary order
for i in pool.imap_unordered(f, range(10)):
print i
# evaluate "f(20)" asynchronously
res = pool.apply_async(f, (20,)) # runs in *only* one process
print res.get(timeout=1) # prints "400"
# evaluate "os.getpid()" asynchronously
res = pool.apply_async(os.getpid, ()) # runs in *only* one process
print res.get(timeout=1) # prints the PID of that process
# launching multiple evaluations asynchronously *may* use more processes
multiple_results = [pool.apply_async(os.getpid, ()) for i in range(4)]
print [res.get(timeout=1) for res in multiple_results]
# make a single worker sleep for 10 secs
res = pool.apply_async(time.sleep, (10,))
try:
print res.get(timeout=1)
except TimeoutError:
print "We lacked patience and got a multiprocessing.TimeoutError"

Python 2.7: How to compensate for missing pool.starmap?

I have defined this function
def writeonfiles(a, seed):
    """Write ten seeded random integers to the file ``a``, one per line.

    Args:
        a: Path of the file to (over)write.
        seed: Seed for the ``random`` module, so the output is reproducible
            for a given seed.
    """
    random.seed(seed)
    # ``with`` closes the file even if a write fails.
    with open(a, "w+") as f:
        for _ in range(10):
            j = random.randint(0, 10)
            # ``write`` needs a string; passing the int itself raises TypeError.
            f.write("%d\n" % j)
Where a is a string containing the path of the file and seed is an integer seed.
I want to parallelize a simple program in such a way that each core takes one of the paths that I pass in, seeds its random generator and writes some random numbers to that file; so, for example, if I pass the
vector
vector = [Test/file1.txt, Test/file2.txt]
and the seeds
seeds = (123412, 989898),
it gives to the first available core the function
writeonfiles(Test/file1.txt, 123412)
and to the second one the same function with different arguments:
writeonfiles(Test/file2.txt, 989898)
I have looked through a lot of similar questions here on Stackoverflow, but I cannot make any solution work.
What I tried is:
def writeonfiles_unpack(args):
return writeonfiles(*args)
if __name__ == "__main__":
folder = ["Test/%d.csv" %i for i in range(0,4)]
seed = [234124, 663123, 12345 ,123833]
p = multiprocessing.Pool()
p.map(writeonfiles, (folder,seed))
and gives me TypeError: writeonfiles() takes exactly 2 arguments (1 given).
I tried also
if __name__ == "__main__":
folder = ["Test/%d.csv" %i for i in range(0,4)]
seed = [234124, 663123, 12345 ,123833]
p = multiprocessing.Process(target=writeonfiles, args= [folder,seed])
p.start()
But it gives me
File "/usr/lib/python2.7/random.py", line 120, in seed
super(Random, self).seed(a)
TypeError: unhashable type: 'list'
Finally, I tried the contextmanager
@contextmanager
def poolcontext(*args, **kwargs):
    """Context manager that yields a ``multiprocessing.Pool`` and terminates it on exit.

    All arguments are forwarded unchanged to ``multiprocessing.Pool``.
    """
    pool = multiprocessing.Pool(*args, **kwargs)
    try:
        yield pool
    finally:
        # Terminate the workers even when the ``with`` body raises.
        pool.terminate()
if __name__ == "__main__":
folder = ["Test/%d" %i for i in range(0,4)]
seed = [234124, 663123, 12345 ,123833]
a = zip(folder, seed)
with poolcontext(processes = 3) as pool:
results = pool.map(writeonfiles_unpack,a )
and it results in
File "/usr/lib/python2.7/multiprocessing/pool.py", line 572, in get
raise self._value
TypeError: 'module' object is not callable
Python 2.7 lacks the starmap pool-method from Python 3.3+ . You can overcome this by decorating your target function with a wrapper, which unpacks the argument-tuple and calls the target function:
import os
from multiprocessing import Pool
import random
from functools import wraps
def unpack(func):
    """Decorator: make ``func`` accept one tuple and call it with the tuple unpacked.

    This stands in for ``Pool.starmap``, which Python 2.7 lacks:
    ``Pool.map`` passes a single argument per call.
    """
    # ``wraps`` copies ``func``'s name onto the wrapper. Pickle looks functions
    # up by module and name, so without it the wrapper could not be sent to
    # the pool's worker processes.
    @wraps(func)
    def wrapper(arg_tuple):
        return func(*arg_tuple)
    return wrapper


@unpack
def write_on_files(a, seed):
    """Seed the RNG and simulate writing ten random integers to file ``a``.

    Because of ``@unpack`` the function is called with one ``(a, seed)`` tuple.
    """
    random.seed(seed)
    print("%d opening file %s" % (os.getpid(), a))  # simulate
    for _ in range(10):
        j = random.randint(0, 10)
        print("%d writing %d to file %s" % (os.getpid(), j, a))  # simulate
if __name__ == '__main__':
folder = ["Test/%d.csv" % i for i in range(0, 4)]
seed = [234124, 663123, 12345, 123833]
arguments = zip(folder, seed)
pool = Pool(4)
pool.map(write_on_files, iterable=arguments)
pool.close()
pool.join()

Log process output to separate log file

I have the following parallel_executor.py module which I use to run several processes simultaneously,
import time
from multiprocessing import Process
class ParallelExecutor(object):
    """Run callables in separate processes, at most ``pool_size`` at a time.

    Tasks are registered with ``add_task`` and started with ``run``.

    NOTE(review): ``_executor`` is the process target, so it executes in the
    child process and its writes to ``self._results`` are made there. The
    parent's ``_results`` presumably keeps the placeholder values created by
    ``add_task`` -- confirm, and consider passing results back through a
    ``multiprocessing.Queue`` if the parent needs them.
    """

    def __init__(self, pool_size=10):
        """Create an executor that runs up to ``pool_size`` processes at once."""
        self._pool_size = pool_size
        self._processes = []
        self._results = []

    def add_task(self, target, args=None, kwargs=None):
        """Register ``target(*args, **kwargs)`` to be run in its own process.

        Returns:
            The index of the task; its result slot sits at the same index of
            the list returned by ``run`` / ``get_results``.
        """
        args = args if args else []
        kwargs = kwargs if kwargs else {}
        index = len(self._processes)
        process_args = (index, target, args, kwargs)
        process = Process(target=self._executor, args=process_args)
        self._processes.append(process)
        self._results.append({'result': None, 'end_time': 0, 'completed': False})
        return index

    def run(self, block=True):
        """Start all registered tasks.

        Args:
            block: When false, start every process at once and return
                ``None`` immediately. When true, start the processes in
                batches of ``pool_size`` and wait for each batch to finish.

        Returns:
            The list of result slots when ``block`` is true, else ``None``.
        """
        if not block:
            for process in self._processes:
                process.start()
            return None

        batch = []
        for process in self._processes:
            process.start()
            batch.append(process)
            # Compare the batch length itself: the previous separate counter
            # was checked before being incremented, so every batch held
            # ``pool_size + 1`` processes.
            if len(batch) >= self._pool_size:
                for started in batch:
                    started.join()
                batch = []
        # Wait for the left over processes to complete.
        for started in batch:
            started.join()
        return self._results

    def wait_for_executor_to_finish(self):
        """Block until every process that has been started has exited."""
        for process in self._processes:
            # ``pid`` stays ``None`` until ``start()`` has been called;
            # joining a process that was never started raises.
            if process.pid is not None:
                process.join()

    def get_results(self):
        """Return the list of result slots, one dict per registered task."""
        return self._results

    def _executor(self, index, target, args, kwargs):
        """Process target: call ``target`` and record the outcome in slot ``index``."""
        try:
            self._results[index]['result'] = target(*args, **kwargs)
            self._results[index]['end_time'] = int(round((time.time())))
            self._results[index]['completed'] = True
        except Exception as exc:
            self._results[index]['exception'] = exc
            self._results[index]['completed'] = True
            raise
And I use it as follows(example.py):
from framework.lib.parallel_executor import ParallelExecutor
import time
import os
def foo(x):
for i in range(3):
print x
time.sleep(0.5)
return 123
def main():
runner = ParallelExecutor()
runner.add_task(foo, ["This"])
runner.add_task(foo, ["is"])
runner.add_task(foo, ["a"])
runner.add_task(foo, ["test"])
runner.run()
runner.wait_for_executor_to_finish()
for i in runner.get_results():
print i
main()
My question is how do I print the process ID with every statement of 'foo' that is printed to the output by making changes only to parallel_executor.py module and not touching the example.py file, so that later I could perform a 'grep' on outputs of a particular process.
You can't do it without modifying the example at all, but you can achieve what you want with a very small modification.
Using the Python logging facilities, you can set the default log message ensuring every logline will respect your standard.
In the parallel_executor.py add the following:
import logging
log_format = "%(process)d: %(message)s"
logging.basicConfig(level=logging.INFO, format=log_format)
In the example replace the line:
print x
with:
logging.info(x)
And you will see your messages appearing as:
34321: message content here

Categories

Resources