I'm starting to learn Python and have to write a script which takes advantage of multiprocessing. It looks as follows:
import multiprocessing as mp
import os
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)
import PoseOptimization2 as opt
import telemMS_Vicon as extr
import Segmentation as seg
result_list = []
def log_result(result):
    # This is called whenever opt.optimize returns a result.
# result_list is modified only by the main process, not the pool workers.
result_list.append(result)
print('Frame:',result[1])
def multiproc(seg):
pool = mp.Pool(mp.cpu_count())
for i in range(0,len(seg)):
pool.apply_async(opt.optimize, args=(seg[0],seg[i],False,False,i),callback=log_result)
pool.close()
pool.join()
if __name__ == "__main__":
#extract point cloud data (list of arrays)
OutputObj = extr.extraction('MyoSuit Custom Feasibility')
segments = seg.segmentation(OutputObj)
multiproc(segments[0])
list1 = result_list
result_list = []
In principle, the variable "segments" contains several lists. The entries of each list are then to be modified using the function "optimize" from the "PoseOptimization2" script. To make this faster, I want to use multiprocessing.
But when I try to run the function "multiproc", I get the error "ModuleNotFoundError: No module named PoseOptimization2" and the program has to be restarted in order to work again.
How can I properly pass a function from another script to the multiprocessing function?
Thank you very much for your help!
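For reference, a minimal sketch of a workaround I am considering (not yet verified): putting the script's own folder on sys.path before the imports, since workers started with the spawn method (the default on Windows) re-import this module and may otherwise fail to resolve the sibling scripts.
import os
import sys
import multiprocessing as mp

# Make the script's folder importable before pulling in the sibling modules,
# so that spawned worker processes can also find them when they re-import this file.
script_dir = os.path.dirname(os.path.abspath(__file__))
if script_dir not in sys.path:
    sys.path.insert(0, script_dir)
os.chdir(script_dir)

import PoseOptimization2 as opt
import telemMS_Vicon as extr
import Segmentation as seg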
I'm new to Python and I have a concurrency problem when using functions from imported libraries. The problem is that my code calculates different kinds of variables, and in the last step they are saved into different files. But I have the same problem when reading and writing.
This is an example of the code, which works because it is sequential:
import xarray as xr
def read_concurrent_files(self):
files_var_type1 = get_files('type1','20200101','20200127')
files_var_type2 = get_files('type2','20200101','20200127')
files_var_type3 = get_files('type3','20200101','20200127')
def get_files(self, varType, dateini, datefin):
    # This method returns an array of file paths
files = self.get_file_list(varType, dateini, datefin)
files_raw = xr.open_mfdataset(files , engine='cfgrib', \
combine='nested', concat_dim ='time', decode_coords = False, parallel = True)
return files_raw
But when I change the code to run concurrently, it fails:
import xarray as xr
from multiprocessing.pool import ThreadPool
def read_concurrent_files(self):
pool = ThreadPool(processes=3)
async_result1 = pool.apply_async(self.get_files, ('type1','20200101','20200127',))
async_result2 = pool.apply_async(self.get_files, ('type2','20200101','20200127',))
async_result3 = pool.apply_async(self.get_files, ('type3','20200101','20200127',))
files_var_type1 = async_result1.get()
files_var_type2 = async_result2.get()
files_var_type3 = async_result3.get()
def get_files(self, varType, dateini, datefin):
    # This method returns an array of file paths
files = self.get_file_list(varType, dateini, datefin)
files_raw = xr.open_mfdataset(files , engine='cfgrib', \
combine='nested', concat_dim ='time', decode_coords = False, parallel = True)
return files_raw
The problem is in the xr.open_mfdataset call, which is not thread-safe (or so I think).
Is there a way to confine the library import to the method scope only?
I come from other languages where this was easy: create the instance inside the method, or use thread-safe objects.
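To illustrate what I mean by confining the import to the method, a minimal sketch (hypothetical, not my actual code); note that Python caches imported modules, so a function-level import does not create a fresh, independent copy of the library:
def get_files(self, varType, dateini, datefin):
    # Importing inside the method only binds the name locally; the module object
    # itself is shared via sys.modules, so this alone does not make the call thread-safe.
    import xarray as xr

    files = self.get_file_list(varType, dateini, datefin)
    return xr.open_mfdataset(files, engine='cfgrib', combine='nested',
                             concat_dim='time', decode_coords=False, parallel=True)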
Thanks a lot in advance!!
As I'm new to Python, I was unaware of the different kinds of pools we can create. In my example above I was using a ThreadPool, which is constrained by the GIL (Global Interpreter Lock). To avoid that, we can use a pool of processes instead; here is an example:
import os
import concurrent.futures
def get_xarray(self):
tasks = []
cpu_count = os.cpu_count()
with concurrent.futures.ProcessPoolExecutor(max_workers = cpu_count) as executor:
for i in range(0, len(self.files)):
tasks.append(executor.submit(self.get_xarray_by_file, self.files[i]))
results = []
for result in tasks:
results.append(result.result())
era_raw = xr.merge(results, compat='override')
return era_raw.persist().load()
def get_xarray_by_file(self, files):
era_raw = xr.open_mfdataset(files , engine='cfgrib', \
combine='nested', concat_dim ='time', decode_coords = False, parallel = True)
return era_raw.persist().load()
In that case, we use the ProcessPoolExecutor:
The ProcessPoolExecutor class is an Executor subclass that uses a pool of processes to execute calls asynchronously. ProcessPoolExecutor uses the multiprocessing module, which allows it to side-step the Global Interpreter Lock but also means that only picklable objects can be executed and returned.
Now we can read grib2 files in parallel, or create nc or csv files from a dataframe, with true parallelism.
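As a small illustration of the "picklable" restriction (a standalone sketch, not part of the project above): module-level functions can be submitted to a ProcessPoolExecutor, while lambdas and locally defined closures cannot be pickled and will fail.
import concurrent.futures

def square(x):
    # A module-level function is picklable, so it can be sent to a worker process.
    return x * x

if __name__ == '__main__':
    with concurrent.futures.ProcessPoolExecutor() as executor:
        print(list(executor.map(square, range(5))))    # [0, 1, 4, 9, 16]
        # executor.map(lambda x: x * x, range(5))      # would fail: a lambda cannot be pickled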
I'm trying to improve the interactive output of a small CLI program that walks a directory to process files, using a Rich progress bar to display the progress of the tasks.
At the moment, I'm doing this in 2 steps:
pool.submit() all the tasks
for future in as_completed(xxxx), wait for the next available future.
The problem is that the first step (pool.submit) might take some time, since I'm walking the directory, and the UI isn't updated even though some futures have already completed.
So, I tried to come up with a Thread that would submit to my pool, while the main thread waits for the next Future and updates the UI:
"""
Usage: walker.py [options] <file/directory>...
Options:
-r --recursive Walk directories recursively
-w WORKERS --workers=WORKERS Specify the number of process pool workers [default: 4]
-d --debug Enable debug output
-h --help Display this message
"""
import os
import threading
import time
from concurrent.futures._base import as_completed
from concurrent.futures.process import ProcessPoolExecutor
from pathlib import Path
from random import randint
from typing import List
from docopt import docopt
from rich.console import Console
from rich.progress import BarColumn, Progress, TextColumn
def walk_filepath_list(filepath_list: List[Path], recursive: bool = False):
for path in filepath_list:
if path.is_dir() and not path.is_symlink():
if recursive:
for f in os.scandir(path):
yield from walk_filepath_list([Path(f)], recursive)
else:
yield from (Path(f) for f in os.scandir(path))
elif path.is_file():
yield path
def process_task(filepath):
rand = randint(0, 1)
time.sleep(rand)
def thread_submit(pool, filepath_list, recursive, future_to_filepath):
for filepath in walk_filepath_list(filepath_list, recursive):
future = pool.submit(process_task, filepath)
# update shared dict
future_to_filepath[future] = filepath
def main(args):
filepath_list = [Path(entry) for entry in args["<file/directory>"]]
debug = args["--debug"]
workers = int(args["--workers"])
recursive = args["--recursive"]
console = Console()
process_bar = Progress(
TextColumn("[bold blue]Processing...", justify="left"),
BarColumn(bar_width=None),
"{task.completed}/{task.total}",
"•",
"[progress.percentage]{task.percentage:>3.1f}%",
console=console,
)
process_bar.start()
# we need to consume the iterator once to get the total
# for the progress bar
count = sum(1 for i in walk_filepath_list(filepath_list, recursive))
task_process_bar = process_bar.add_task("Main task", total=count)
with ProcessPoolExecutor(max_workers=workers) as pool:
# shared dict between threads
# [Future] => [filepath]
future_to_filepath = {}
submit_thread = threading.Thread(
target=thread_submit, args=(pool, filepath_list, recursive, future_to_filepath)
)
submit_thread.start()
while len(future_to_filepath.keys()) != count:
for future in as_completed(future_to_filepath):
filepath = future_to_filepath[future]
# print(f"processing future: {filepath}")
try:
data = future.result()
finally:
# update progress bar
process_bar.update(task_process_bar, advance=1)
process_bar.stop()
def entrypoint():
args = docopt(__doc__)
main(args)
if __name__ == "__main__":
entrypoint()
However, the progress bar isn't updated as expected.
Worse, there are cases where the processing doesn't seem to end.
Is it a race condition when I update my dict future_to_filepath?
How would you go about having a submit thread and a process_results thread with concurrent.futures?
Thank you SO!
See my comments to your question and then:
Change:
submit_thread = threading.Thread(
target=thread_submit, args=(pool, filepath_list, recursive, future_to_filepath)
)
submit_thread.start()
To:
thread_submit(pool, filepath_list, recursive, future_to_filepath)
(a change to this function name, since it is no longer running as a separate thread, would be a good thing -- how about create_futures?)
And remove the outer loop:
while len(future_to_filepath.keys()) != count:
Finally, it is not clear what your real process_task will do with the file, but it certainly seems possible that it will be I/O bound. In that case, you might benefit from using the ThreadPoolExecutor class instead, which is easily substitutable for the ProcessPoolExecutor class; you should then consider specifying a much larger number of workers, possibly equal to count. Since your current process_task does little more than sleep, it would probably profit from threading with the larger number of workers.
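A rough sketch of that substitution (assuming process_task really is I/O bound, and reusing count, walk_filepath_list and the progress bar objects from the question; the worker cap of 64 is just an illustrative choice):
from concurrent.futures import ThreadPoolExecutor, as_completed

# Build all futures up front, then drain them with as_completed and update the bar.
with ThreadPoolExecutor(max_workers=max(min(count, 64), 1)) as pool:
    future_to_filepath = {
        pool.submit(process_task, filepath): filepath
        for filepath in walk_filepath_list(filepath_list, recursive)
    }
    for future in as_completed(future_to_filepath):
        try:
            future.result()
        finally:
            process_bar.update(task_process_bar, advance=1)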
Update
One thing you can do to reduce the time it takes to run walk_filepath_list is to modify the function so that it is passed a single path to walk rather than a list, and to process each path from the original list concurrently in separate threads. In the code below I am using the ThreadPoolExecutor map function for convenience, which requires reversing the arguments of the (newly renamed) walk_filepath function so that I can use functools.partial to "hardcode" the first argument, recursive, for all the calls:
from concurrent.futures import ThreadPoolExecutor
from functools import partial
def walk_filepath(recursive: bool = False, path: Path = None):
if path.is_dir() and not path.is_symlink():
if recursive:
for f in os.scandir(path):
yield from walk_filepath(recursive, Path(f))
else:
yield from (Path(f) for f in os.scandir(path))
elif path.is_file():
yield path
def walker(recursive, path):
return list(walk_filepath(recursive, path))
def thread_submit(pool, filepath_list, recursive, future_to_filepath):
n_workers = len(filepath_list)
with ThreadPoolExecutor(max_workers=n_workers) as executor:
filepath_lists = executor.map(partial(walker, recursive), filepath_list)
for filepath_list in filepath_lists:
for filepath in filepath_list:
future = pool.submit(process_task, filepath)
# update shared dict
future_to_filepath[future] = filepath
Update 2
A benchmark of the above code reveals that it does not save time (perhaps if the directories were on different physical drives?).
In one of my research projects, I am trying to count certain words in 170,000 text files. I have a working for loop that does the job, but it is painful to watch only 20% of the CPU being used:
import pandas as pd
import re
def normalize_text(text):
    # some process to normalize the text
return text
# I created a filelist dataframe prior to execute this function
def countwords(filelist):
global wc
header_list=['file','wda', 'wdb', 'wdc', 'wdd', 'wde','wdf']
wc=pd.DataFrame()
wc = wc.reindex(columns = header_list)
for i in range(filelist.shape[0]):
words = ['wda', 'wdb', 'wdc', 'wdd', 'wde','wdf']
count={}
for elem in words:
count[elem] = 0
file=open(filelist.iloc[i].at['location'], encoding='latin-1')
full=file.read()
text=normalize_text(full)
for word in words:
count[word] = len(re.findall(word,text))
wc = wc.append(count, sort=False,ignore_index=True)
wc=wc
I am trying to modify my code to use the multiprocessing package, to see if I can process multiple files at a time.
I am new to the package; here is a modified version:
import re
def countwords(filedest):
words = ['wda', 'wdb', 'wdc', 'wdd', 'wde','wdf']
count={}
for elem in words:
count[elem] = 0
count.update({'file' : filedest})
file=open(filedest, encoding='latin-1')
full=file.read()
text=normalize_text(full)
for word in words:
count[word] = len(re.findall(word,text))
return count
mydir = os.path.join('C:\\',"filedest\*.txt")
from multiprocessing.pool import ThreadPool
import glob2
if __name__ == '__main__':
tasks = glob2.glob(str(mydir))
pool = ThreadPool()
results=pool.map_async(countwords,tasks)
pool.close()
pool.join()
#results are handled after pool.join
I noticed the code runs for an abnormal amount of time; after a keyboard interrupt, the code continues to run for a while and then finishes the job (however, if I do not interrupt it, it keeps hanging indefinitely and blows up my memory).
I have tried switching between a process pool and a thread pool, and wrapping the result with another function, but it does not seem to help.
I am using anaconda distribution with Spyder IDE.
I appreciate your time and help
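For reference, this is the direction I am experimenting with next (a sketch, untested): using a process pool, since the regex counting is CPU bound, and calling get() on the AsyncResult so the main process actually blocks until all results are back.
import os
import glob2
from multiprocessing import Pool

if __name__ == '__main__':
    mydir = os.path.join('C:\\', 'filedest', '*.txt')   # illustrative path
    tasks = glob2.glob(mydir)
    with Pool() as pool:
        async_result = pool.map_async(countwords, tasks)
        counts = async_result.get()   # block here until every file has been counted
    # counts is a list of dicts (one per file), ready to be turned into a DataFrame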
I am trying to download images from a list of URLs using Python. To make the process faster, I used the multiprocessing library.
The problem I am facing is that the script often hangs/freezes on its own, and I don't know why.
Here is the code that I am using
...
import multiprocessing as mp
def getImages(val):
    #Download images
try:
url= # preprocess the url from the input val
local= #Filename Generation From Global Varables And Rand Stuffs...
urllib.request.urlretrieve(url,local)
print("DONE - " + url)
return 1
except Exception as e:
print("CAN'T DOWNLOAD - " + url )
return 0
if __name__ == '__main__':
files = "urls.txt"
lst = list(open(files))
lst = [l.replace("\n", "") for l in lst]
pool = mp.Pool(processes=4)
res = pool.map(getImages, lst)
print ("tempw")
It often gets stuck halfway through the list (it prints DONE or CAN'T DOWNLOAD for about half of the list it has processed, but I don't know what is happening with the rest). Has anyone faced this problem? I have searched for similar problems (e.g. this link) but found no answer.
Thanks in advance
OK, I have found an answer.
A possible culprit was that the script was getting stuck connecting to or downloading from a URL, so I added a socket timeout to limit the time to connect and download the image.
And now the issue no longer bothers me.
Here is my complete code
...
import multiprocessing as mp
import socket
# Set the default timeout in seconds
timeout = 20
socket.setdefaulttimeout(timeout)
def getImages(val):
    #Download images
try:
url= # preprocess the url from the input val
local= #Filename Generation From Global Varables And Rand Stuffs...
urllib.request.urlretrieve(url,local)
print("DONE - " + url)
return 1
except Exception as e:
print("CAN'T DOWNLOAD - " + url )
return 0
if __name__ == '__main__':
files = "urls.txt"
lst = list(open(files))
lst = [l.replace("\n", "") for l in lst]
pool = mp.Pool(processes=4)
res = pool.map(getImages, lst)
print ("tempw")
Hope this solution helps others who are facing the same issue
It looks like you're facing a GIL issue: the Python Global Interpreter Lock basically forbids Python from doing more than one task at the same time.
The multiprocessing module really launches separate instances of Python to get the work done in parallel.
But in your case, urllib is called in all these instances: each of them tries to lock the IO process; the one that succeeds (i.e. comes first) gets you the result, while the others (trying to lock an already-locked process) fail.
This is a very simplified explanation, but here are some additional resources:
You can find another way to parallelize requests here : Multiprocessing useless with urllib2?
And more info about the GIL here : What is a global interpreter lock (GIL)?
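For example, a minimal sketch of a thread-based variant (reusing getImages from the question and the socket timeout from the other answer); threads tend to fit this case because the downloads are I/O bound, though I cannot guarantee it avoids the hang:
import socket
from multiprocessing.pool import ThreadPool

socket.setdefaulttimeout(20)   # keep the timeout safeguard from the other answer

if __name__ == '__main__':
    with open("urls.txt") as f:
        urls = [line.strip() for line in f]
    with ThreadPool(processes=8) as pool:   # threads instead of processes
        results = pool.map(getImages, urls)
    print(sum(results), "images downloaded")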
I am trying to get familiar with the multiprocessing module. I am currently having some issues with Pipe(). I devised a small example to illustrate my problem.
I wrote two functions:
One that creates files in a specific folder (spawner)
And another that detects these files and copies them to another folder (cleaner)
They both work fine. I also managed to create a Process for each, so that the creation and copying of the files happen simultaneously.
For the next step, I want the spawner to communicate to the cleaner that it has finished creating files so that the latter can terminate.
Here is the code:
import os
from time import sleep
import multiprocessing as mp
from shutil import copy2
def spawner(f_folder, pipeEnd):
template = 'my_file{}.txt'
for i in range(10):
new_file = os.path.join(f_folder, template.format(str(i)))
with open(new_file, 'w'):
pass
sleep(1)
pipeEnd.send(True)
return
def cleaner(f_folder, t_folder, pipeEnd):
state = set()
while not pipeEnd.recv():
new_files = set(os.listdir(f_folder)).difference(state)
state = set(os.listdir(f_folder))
for file in new_files:
copy2(os.path.join(f_folder, file), t_folder)
sleep(3)
return
if __name__ == '__main__':
receiver, sender = mp.Pipe()
from_folder = r'C:\Users\evkouni\Desktop\TEMP\PythonTests\subProcess\from'
to_folder = r'C:\Users\evkouni\Desktop\TEMP\PythonTests\subProcess\to'
p = mp.Process(target=spawner, args=(from_folder, sender))
q = mp.Process(target=cleaner, args=(from_folder, to_folder, receiver))
p.start()
q.start()
I just cannot seem to get it to work. Any help would be appreciated.
A Pipe is the wrong solution to your problem. You could use a pipe if you wanted to pass the file names from the spawner to the cleaner, but what you are trying to do is raise a flag. For that purpose, I would recommend the use of an Event: https://docs.python.org/2/library/multiprocessing.html#multiprocessing.Event
This can be considered a thread-safe (and multiprocess-safe) boolean. You would use it like this:
finished = mp.Event()
...
finished.set() # pipeEnd.send(True)
...
while not finished.is_set(): # while not receiver.recv():
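Putting it together, a sketch of the two processes using an Event instead of a Pipe (folder paths taken from the question; the spawning and copy logic is unchanged):
import os
import multiprocessing as mp
from shutil import copy2
from time import sleep

def spawner(f_folder, finished):
    template = 'my_file{}.txt'
    for i in range(10):
        with open(os.path.join(f_folder, template.format(i)), 'w'):
            pass
        sleep(1)
    finished.set()                      # raise the flag instead of sending True through a pipe

def cleaner(f_folder, t_folder, finished):
    state = set()
    while not finished.is_set():        # keep cleaning until the spawner raises the flag
        new_files = set(os.listdir(f_folder)).difference(state)
        state = set(os.listdir(f_folder))
        for file in new_files:
            copy2(os.path.join(f_folder, file), t_folder)
        sleep(3)
    # one final sweep so files created just before the flag was set are not missed
    for file in set(os.listdir(f_folder)).difference(state):
        copy2(os.path.join(f_folder, file), t_folder)

if __name__ == '__main__':
    finished = mp.Event()
    from_folder = r'C:\Users\evkouni\Desktop\TEMP\PythonTests\subProcess\from'
    to_folder = r'C:\Users\evkouni\Desktop\TEMP\PythonTests\subProcess\to'
    p = mp.Process(target=spawner, args=(from_folder, finished))
    q = mp.Process(target=cleaner, args=(from_folder, to_folder, finished))
    p.start()
    q.start()
    p.join()
    q.join()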