Catching Errors in MultiProcessing Pool Map - python

I have a python code which uses multiprocessing Pool map. I am spawning multiple children from map, each of them reads a separate file, and I collect them in the end. My goal is to have a pandas dataframe in the end that is a concatenation of all the output from the children, with duplicates dropped. I use this dataframe to do more processing (the rest of the code seems unrelated to the question I ask here, so I am omitting that part for brevity). This code runs periodically at the end of the week with new input files to read every time. Sometimes there are errors in the files children read, like null values in integer columns, or missing files, etc.. If any of these errors occur, I want the main script to die, ideally as soon as possible. I do not know how to make this happen in the most efficient way.
I have tried, in turn:
1-Making the child die by raising SystemExit(1) if it encounters an error. I couldn't make the parent die.
2-Making child return an empty value or pandas dataframe in case of an error by try except blocks. I couldn't detect it properly in the parent.
3-Using map_async with callback functions instead of map.
The last one seems to work. However, I am not sure if this is the correct and most efficient way of doing this, as I do not use any output from the error callback function. Any comments and suggestions are appreciated.
Edit:
Sample input file: a.txt:
shipmentId,processing_time_epoch
4001,1455408024132
4231,1455408024373
b.txt:
shipmentId,processing_time_epoch
5001,1455408024132
4231,1455408024373
Desired final processing_time pandas dataframe:
shipmentId,processing_time_epoch
4001,1455408024132
4231,1455408024373
5001,1455408024132
My code:
import pandas as pd
import csv,glob,datetime,sys,pdb,subprocess,multiprocessing,io,os,shlex
from itertools import repeat
def myerrorcallback(x):
    """Error hook passed to ``map_async``: announce that a worker failed.

    The argument ``x`` is accepted but never inspected; the function only
    prints a fixed message and returns ``None``.
    """
    print('There seems to be an error in the child. Parent: Please die.')
def mycallback(x):
    """Success hook passed to ``map_async``: announce that the map finished.

    The argument ``x`` is accepted but not used; the function prints a fixed
    message and returns ``None``.
    """
    print('Returned successfully.')
def PrintException():
    """Print where the exception currently being handled was raised.

    Reads ``sys.exc_info()``, so it is meant to be called from inside an
    ``except`` block. The printed line contains the source file name, the
    line number, the exception value and the exception type.
    """
    err_type, err_value, trace = sys.exc_info()
    source_file = trace.tb_frame.f_code.co_filename
    print('EXCEPTION IN ({}, LINE {} ): {} ({})'.format(
        source_file, trace.tb_lineno, err_value, err_type))
# ===================================================================
def Read_Processing_Times_v1(full_path_name):
    """Read one processing-times CSV and exit the worker on any failure.

    Args:
        full_path_name: path of a CSV file with ``shipmentId`` and
            ``processing_time_epoch`` columns.

    Returns:
        A DataFrame holding those two columns as int64, duplicates dropped.

    Raises:
        SystemExit: with status 1 when the file cannot be read or parsed.
            As noted for "version 1" in the main script, this ends only the
            child process; the parent keeps waiting.
    """
    try:
        # Plain 'int64' strings instead of pd.np.int64: the ``pd.np`` alias
        # was removed from pandas, so the old spelling fails on current
        # releases before the file is even opened.
        df = pd.read_csv(
            full_path_name,
            dtype={'shipmentId': 'int64', 'processing_time_epoch': 'int64'},
            usecols=['shipmentId', 'processing_time_epoch'],
        )
        return df.drop_duplicates()
    except Exception:
        print("exception in file "+full_path_name)
        PrintException()
        raise SystemExit(1)
# ===================================================================
def Read_Processing_Times_v2(full_path_name):
    """Read one processing-times CSV, returning an empty frame on failure.

    Args:
        full_path_name: path of a CSV file with ``shipmentId`` and
            ``processing_time_epoch`` columns.

    Returns:
        A DataFrame holding those two columns as int64 with duplicates
        dropped, or an empty ``pd.DataFrame()`` when reading or parsing
        failed. Callers can detect the failure case with ``df.empty``.
    """
    try:
        # Plain 'int64' strings instead of pd.np.int64: the ``pd.np`` alias
        # was removed from pandas, so the old spelling fails on current
        # releases and would make every read look like a bad file.
        df = pd.read_csv(
            full_path_name,
            dtype={'shipmentId': 'int64', 'processing_time_epoch': 'int64'},
            usecols=['shipmentId', 'processing_time_epoch'],
        )
        return df.drop_duplicates()
    except Exception:
        print("exception in file "+full_path_name)
        PrintException()
        return pd.DataFrame()
# ===================================================================
def Read_Processing_Times_v3(full_path_name):
    """Read one processing-times CSV and let any error propagate.

    Args:
        full_path_name: path of a CSV file with ``shipmentId`` and
            ``processing_time_epoch`` columns.

    Returns:
        A DataFrame holding those two columns as int64, duplicates dropped.

    Nothing is caught here: whatever ``pd.read_csv`` raises (missing file,
    values that do not fit int64, ...) reaches the caller unchanged.
    """
    # Plain 'int64' strings instead of pd.np.int64: the ``pd.np`` alias was
    # removed from pandas, so the old spelling fails on current releases.
    df = pd.read_csv(
        full_path_name,
        dtype={'shipmentId': 'int64', 'processing_time_epoch': 'int64'},
        usecols=['shipmentId', 'processing_time_epoch'],
    )
    return df.drop_duplicates()
# ===========================================================================================================================
# Top-level
if __name__ == '__main__':
    mycols = ['shipmentId', 'processing_time_epoch']
    # Plain dtype strings: the ``pd.np`` alias was removed from pandas, so
    # ``pd.np.int64`` would raise AttributeError here before any work starts.
    mydtypes = {'shipmentId': 'int64', 'processing_time_epoch': 'int64'}
    # The following two files should not give an error:
    # files_to_read=["a.txt","b.txt"]
    # The following two files should give an error, as a2.txt does not exist:
    files_to_read=["a2.txt","b.txt"]
    # version 1: Works with the correct files. Does not work if one of the children has an error: the child dies, the parent does not and waits forever.
    # print("version 1")
    # pool = multiprocessing.Pool(15)
    # processing_times = pool.map(Read_Processing_Times_v1, files_to_read)
    # pool.close()
    # pool.join()
    # processing_times = pd.concat(processing_times,ignore_index=True).drop_duplicates()
    # print(processing_times)
    # version 2: Does not work. Don't know how to fix it. The idea is make child return something, and catch the error in the parent.
    # print("version 2")
    # pool = multiprocessing.Pool(15)
    # processing_times = pool.map(Read_Processing_Times_v2, files_to_read)
    # pool.close()
    # pool.join()
    # if(processing_times.count(pd.DataFrame()) > 0):
    # print("SLAM times are not read properly.")
    # raise SystemExit(1)
    # version 3:
    print("version 3")
    pool = multiprocessing.Pool(15)
    processing_times = pool.map_async(Read_Processing_Times_v3, files_to_read,callback=mycallback,error_callback=myerrorcallback)
    pool.close()
    pool.join()
    # get() re-raises an exception raised in a worker, which is what stops
    # the parent script when any input file is bad.
    processing_times = processing_times.get()
    processing_times = pd.concat(processing_times,ignore_index=True).drop_duplicates()
    print("success!")
    # Do more processing with processing_times after this line...

I think you could accomplish what you want by using the concurrent.futures module (https://docs.python.org/3/library/concurrent.futures.html). Below is an example from the doc page that I modified to be closer to your problem. In the example if work_func returns False that is considered an error and the program will terminate.
import sys
import concurrent.futures
import random
import time
def work_func(input_val):
    """Simulate a unit of work that occasionally fails.

    Sleeps for half a second, then returns a randomly chosen flag: ``True``
    in four cases out of five, ``False`` (meaning "error") in the fifth.
    ``input_val`` is not used by the simulation.
    """
    time.sleep(0.5)
    outcomes = [True] * 4 + [False]
    return random.choice(outcomes)
if __name__ == "__main__":
    # We can use a with statement to ensure processes are cleaned up promptly
    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        # Start the load operations and mark each future with its input value
        future_to_result = {executor.submit(work_func, val): val for val in range(30)}
        # iterate over the futures as they become available
        for future in concurrent.futures.as_completed(future_to_result):
            # get the input value from the dict
            input_val = future_to_result[future]
            # now retrieve the result from the future
            try:
                data = future.result()
            except Exception as exc:
                # ``data`` is not assigned when result() raises, so report
                # the exception itself instead of a missing or stale value.
                print(input_val, exc)
                print('Something exceptional happend')
            else:
                print(input_val, data)
                if not data:
                    print('Error - exiting')
                    # Leaving the ``with`` block waits for outstanding work;
                    # cancel whatever has not started yet so the exit is not
                    # delayed by tasks still sitting in the queue.
                    for pending in future_to_result:
                        pending.cancel()
                    sys.exit(1)
Sample output:
0 True
1 True
2 True
3 False
Error - exiting

Related

Python: store results of ProcessPoolExecutor

I'm very new to parallel processing with "concurrent.futures". The code seems to work, but I am not sure how to store the result of each process, so that the build can be marked as failed at the end if any process's return value is not zero.
Tried to create a list (exit_status) and append the results to that, but that shows IndexError. Wondering what can I do right?
#!/usr/bin/env python3
import concurrent.futures
import sys
import shutil
import os
import glob
import multiprocessing as mp
import json
from os import path
def slave(path1, path2, target):
os.makedirs(target)
shutil.copy(path1, target)
shutil.copy(path2, target)
os.system(<Login command>)
os.system(<Image creation command>)
os.system(<Copy to Other slaves or NFS>)
#If any one of the above operation or command fails for any of the process, the script should return 1 at the end of the execution or fail the build at last.
def main():
    """Pair each .zip with its .yaml from example.json and run ``slave`` on every pair.

    Each task is submitted and its ``result()`` is awaited straight away, so
    the collected statuses are the return values of ``slave``.
    """
    processed = {}
    exit_status = []
    with open('example.json', 'r') as f:
        data = json.load(f)
    for _key, lines in data.items():
        for line in lines:
            if line.endswith('.zip'):
                targz = line
            elif line.endswith('.yaml'):
                yaml = line
        processed[targz] = yaml
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for idx, (path2, path1) in enumerate(processed.items(), 1):
            target = path.join("/tmp", "dir" + str(idx))
            future = executor.submit(slave, path1, path2, target)
            exit_status.append(future.result())
    for status in exit_status:
        print("##########Result status: ", status)
if __name__ == "__main__":
mp.set_start_method('spawn')
main()
exit_status list's output:
##########Result status: None
##########Result status: None
re; comments
If you want to get the result of a system call in order to act on the results of it, using subprocess.run is much more flexible and powerful than os.system. Additionally, if you actually want to perform the operations in parallel, you can't wait on result() after each task. Otherwise you're only ever doing one thing at a time. Better to submit all the tasks, and collect the Future objects. Then you can iterate over those and wait on each result() now that you've submitted all the work you want the executor to do.
def target_func(path1, path2, target):
#...
#instead of os.system, use subprocess.run
#you can inspect the stdout from the process
complete_process = subprocess.run(<Login command>, text=True, capture_output=True)
if "success" not in complete_process.stdout:
return "uh-oh"
#you can also just check the return value (0 typically means clean exit)
if subprocess.run(<Image creation command>).returncode == 0:
return "uh-oh"
#or you can tell `run` to generate an error if the returncode is non-zero
try:
subprocess.run(<Copy to Other slaves or NFS>, check=True)
except subprocess.CalledProcessError:
return "uh-oh"
return "we did it!"
def main():
    """Submit every task first, then wait for the results.

    Collecting the Future objects and calling ``result()`` only after all
    submissions lets the executor run the tasks in parallel.
    """
    #...
    #...
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for idx, (path2, path1) in enumerate(processed.items(), 1):
            target = path.join("/tmp", "dir" + str(idx))
            # Submit the function defined in this answer (``target_func``),
            # which is the one that returns a status string.
            ret = executor.submit(target_func, path1, path2, target)
            exit_status.append(ret)
    for i in exit_status:
        print("##########Result status: ", i.result())

TypeError: 'MapResult' object is not iterable using pathos.multiprocessing

I'm running a spell correction function on a dataset I have. I used from pathos.multiprocessing import ProcessingPool as Pool to do the job. Once the processing is done, I'd like to actually access the results. Here is my code:
import codecs
import nltk
from textblob import TextBlob
from nltk.tokenize import sent_tokenize
from pathos.multiprocessing import ProcessingPool as Pool
class SpellCorrect():
    """Load a text corpus and spell-correct its sentences with a process pool."""

    def load_data(self, path_1):
        """Read the UTF-8 file at ``path_1`` and return ``sent_tokenize`` of its text."""
        with codecs.open(path_1, "r", "utf-8") as handle:
            raw_text = handle.read()
        return sent_tokenize(raw_text)

    def correct_spelling(self, data):
        """Return the TextBlob-corrected form of ``data`` as a string."""
        blob = TextBlob(data)
        return str(blob.correct())

    def run_clean(self, path_1):
        """Start correcting every sentence and return what ``pool.amap`` returns."""
        pool = Pool()
        sentences = self.load_data(path_1)
        return pool.amap(self.correct_spelling, sentences)
if __name__ == "__main__":
    # Run the correction over the training corpus and append the joined
    # result to the output file.
    path_1 = "../Data/training_data/training_corpus.txt"
    SpellCorrect = SpellCorrect()
    result = SpellCorrect.run_clean(path_1)
    print(result)
    result = " ".join(sentence for sentence in result)
    output_path = "../Data/training_data/training_data_spell_corrected.txt"
    with codecs.open(output_path, "a", "utf-8") as out_file:
        out_file.write(result)
If you look at the main block, when I do print(result) I get an object of type <multiprocess.pool.MapResult object at 0x1a25519f28>.
I try to access the results with result = " ".join(temp for temp in result), but then I get the following error TypeError: 'MapResult' object is not iterable. I've tried typecasting it to a list list(result), but still the same error. What can I do to fix this?
The multiprocess.pool.MapResult object is not iterable as it is inherited from AsyncResult and has only the following methods:
wait([timeout])
Wait until the result is available or until timeout seconds pass. This method always returns None.
ready() Return whether the call has completed.
successful() Return whether the call completed without raising an
exception. Will raise AssertionError if the result is not ready.
get([timeout]) Return the result when it arrives. If timeout is not
None and the result does not arrive within timeout seconds then
TimeoutError is raised. If the remote call raised an exception then
that exception will be reraised as a RemoteError by get().
You can check the examples how to use the get() function here:
https://docs.python.org/2/library/multiprocessing.html#using-a-pool-of-workers
from multiprocessing import Pool, TimeoutError
import time
import os
def f(x):
    """Return ``x`` multiplied by itself."""
    squared = x * x
    return squared
if __name__ == '__main__':
    # print() is written in its function form so the example runs on
    # Python 3 as well as Python 2 (each call prints a single value).
    pool = Pool(processes=4) # start 4 worker processes
    # print "[0, 1, 4,..., 81]"
    print(pool.map(f, range(10)))
    # print same numbers in arbitrary order
    for i in pool.imap_unordered(f, range(10)):
        print(i)
    # evaluate "f(20)" asynchronously
    res = pool.apply_async(f, (20,)) # runs in *only* one process
    print(res.get(timeout=1)) # prints "400"
    # evaluate "os.getpid()" asynchronously
    res = pool.apply_async(os.getpid, ()) # runs in *only* one process
    print(res.get(timeout=1)) # prints the PID of that process
    # launching multiple evaluations asynchronously *may* use more processes
    multiple_results = [pool.apply_async(os.getpid, ()) for i in range(4)]
    print([res.get(timeout=1) for res in multiple_results])
    # make a single worker sleep for 10 secs
    res = pool.apply_async(time.sleep, (10,))
    try:
        print(res.get(timeout=1))
    except TimeoutError:
        print("We lacked patience and got a multiprocessing.TimeoutError")

Capture check_output value

I am trying to capture the return value of check_output instead of having it automatically print to the command line. Unfortunately, my solution is not working and I'm not sure why. I've included my code and it's output:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from multiprocessing import Pool
from subprocess import check_output,CalledProcessError
def job(cmd):
    """Run ``cmd`` (split on whitespace) and print the length of its output.

    A ``CalledProcessError`` is re-raised as a plain ``Exception`` whose
    message carries the exit status, the command and the captured output.
    There is no ``return`` statement, so the caller always receives ``None``.
    """
    try:
        output = check_output(cmd.split()) # Split string into list.
    except CalledProcessError as error:
        raise Exception("Exit status of the child process: {0}\
Command used to spawn child process: {1}\
Output of the child process: {2}".format(error.returncode,error.cmd,error.output))
    print("job result length = {0}".format(len(output)), file=sys.stdout)
def main():
    """Run every command in ``cmd_list`` through a process pool and print the results."""
    # Sets up a process pool. Defaults to number of cores.
    # Each input gets passed to job and processed in a separate process.
    workers = Pool()
    outputs = []
    try:
        # cmd_list is just a list of system commands which have been verified to work.
        outputs = list(workers.imap_unordered(job, cmd_list))
        print("main result length = {0}".format(len(outputs)), file=sys.stdout)
        print("{0}".format(outputs), file=sys.stdout)
    except Exception as failure:
        print("Error: {0}. Aborting...".format(failure), file=sys.stderr)
        workers.close()
        workers.terminate()
    else:
        workers.close()
        workers.join()
if __name__ == '__main__':
main()
Output
In addition to the output of each command executed by check_output, my print statements reveal some unexpected results:
job result length = 0
job result length = 0
main result length = 2
[None, None]
I would expect job result length to equal 2 and result to contain the return values of the child processes.
result is a local variable. Either return it:
def job(cmd):
# something goes here
return result
Or make it global:
result = ""
def job(cmd):
global result
# something goes here
result = whatever it shall be.
Or parameterize it:
def job(cmd, result):
result = whatever it shall be.

Creating loop for __main__

I am new to Python, and I want your advice on something.
I have a script that runs one input value at a time, and I want it to be able to run a whole list of such values without me typing the values one at a time. I have a hunch that a "for loop" is needed for the main method listed below. The value is "gene_name", so effectively, i want to feed in a list of "gene_names" that the script can run through nicely.
Hope I phrased the question correctly, thanks! The chunk in question seems to be
def get_probes_from_genes(gene_names)
import json
import urllib2
import os
import pandas as pd
api_url = "http://api.brain-map.org/api/v2/data/query.json"
def get_probes_from_genes(gene_names):
    """Query the API for the probes of one or more gene acronyms.

    Args:
        gene_names: a single gene acronym or a list of them.

    Returns:
        A dict mapping each probe ``id`` to its ``name``, built from the
        ``msg`` entry of the JSON response.

    Raises:
        Exception: when the response contains no probes.
    """
    if not isinstance(gene_names,list):
        gene_names = [gene_names]
    #in case there are white spaces in gene names
    gene_names = ["'%s'"%gene_name for gene_name in gene_names]
    # Build the query piece by piece; every part is appended with ``+=``.
    api_query = "?criteria=model::Probe"
    api_query += ",rma::criteria,[probe_type$eq'DNA']"
    api_query += ",products[abbreviation$eq'HumanMA']"
    api_query += ",gene[acronym$eq%s]"%(','.join(gene_names))
    api_query += ",rma::options[only$eq'probes.id','name']"
    data = json.load(urllib2.urlopen(api_url + api_query))
    d = {probe['id']: probe['name'] for probe in data['msg']}
    if not d:
        # Report every requested gene rather than relying on the loop
        # variable of the comprehension above.
        raise Exception("Could not find any probes for %s gene. Check "
                        "http://help.brain-map.org/download/attachments/2818165/HBA_ISH_GeneList.pdf?version=1&modificationDate=1348783035873 "
                        "for list of available genes." % ', '.join(gene_names))
    return d
def get_expression_values_from_probe_ids(probe_ids):
    """Fetch expression levels and sample metadata for the given probe ids.

    Args:
        probe_ids: a single probe id or a list of them.

    Returns:
        A 4-tuple ``(expression_values, well_ids, well_coordinates,
        donor_names)``. ``expression_values`` holds one list of floats per
        requested probe; the other three lists hold one entry per sample in
        the response.
    """
    if not isinstance(probe_ids,list):
        probe_ids = [probe_ids]
    #quote every probe id before it is placed in the query
    probe_ids = ["'%s'"%probe_id for probe_id in probe_ids]
    api_query = "?criteria=service::human_microarray_expression[probes$in%s]" % (','.join(probe_ids))
    data = json.load(urllib2.urlopen(api_url + api_query))
    expression_values = [[float(expression_value) for expression_value in data["msg"]["probes"][i]["expression_level"]] for i in range(len(probe_ids))]
    samples = data["msg"]["samples"]
    well_ids = [sample["sample"]["well"] for sample in samples]
    donor_names = [sample["donor"]["name"] for sample in samples]
    well_coordinates = [sample["sample"]["mri"] for sample in samples]
    return expression_values, well_ids, well_coordinates, donor_names
def get_mni_coordinates_from_wells(well_ids):
    """Look up the rows for ``well_ids`` in the bundled coordinates CSV.

    The CSV is read from ``data/corrected_mni_coordinates.csv`` next to this
    file, with its first column as the index.

    Returns:
        A list with one tuple of column values per requested well id.
    """
    package_directory = os.path.dirname(os.path.abspath(__file__))
    frame = pd.read_csv(os.path.join(package_directory, "data", "corrected_mni_coordinates.csv"), header=0, index_col=0)
    # ``.ix`` has been removed from pandas; ``.loc`` is its label-based
    # replacement. NOTE(review): assumes well ids are index labels — confirm.
    return list(frame.loc[well_ids].itertuples(index=False))
if __name__ == '__main__':
probes_dict = get_probes_from_genes("SLC6A2")
expression_values, well_ids, well_coordinates, donor_names = get_expression_values_from_probe_ids(probes_dict.keys())
print get_mni_coordinates_from_wells(well_ids)
whoa, first things first. Python ain't Java, so do yourself a favor and use a nice """xxx\nyyy""" string, with triple quotes to multiline.
# Inside a triple-quoted string no closing quote is needed at the end of
# each line; a stray one would become part of the query text.
api_query = """?criteria=model::Probe
,rma::criteria,[probe_type$eq'DNA']
...
"""
or something like that. you will get white spaces as typed, so you may need to adjust.
If, like suggested, you opt to loop on the call to your function through a file, you will need to either try/except your data-not-found exception or you will need to handle missing data without throwing an exception. I would opt for returning an empty result myself and letting the caller worry about what to do with it.
If you do opt for raise-ing an Exception, create your own, rather than using a generic exception. That way your code can catch your expected Exception first.
class MyNoDataFoundException(Exception):
    """Dedicated exception type for the "no data found" case."""
#replace your current raise code with...
if not d:
raise MyNoDataFoundException(your message here)
clarification about catching exceptions, using the accepted answer as a starting point:
if __name__ == '__main__':
with open(r"/tmp/genes.txt","r") as f:
for line in f.readlines():
#keep track of your input data
search_data = line.strip()
try:
probes_dict = get_probes_from_genes(search_data)
except MyNoDataFoundException, e:
#and do whatever you feel you need to do here...
print "bummer about search_data:%s:\nexception:%s" % (search_data, e)
expression_values, well_ids, well_coordinates, donor_names = get_expression_values_from_probe_ids(probes_dict.keys())
print get_mni_coordinates_from_wells(well_ids)
You may want to create a file with Gene names, then read content of the file and call your function in the loop. Here is an example below
if __name__ == '__main__':
with open(r"/tmp/genes.txt","r") as f:
for line in f.readlines():
probes_dict = get_probes_from_genes(line.strip())
expression_values, well_ids, well_coordinates, donor_names = get_expression_values_from_probe_ids(probes_dict.keys())
print get_mni_coordinates_from_wells(well_ids)

Successive multiprocessing

I am filtering huge text files using multiprocessing.py. The code basically opens the text files, works on it, then closes it.
Thing is, I'd like to be able to launch it successively on multiple text files. Hence, I tried to add a loop, but for some reason it doesn't work (while the code works on each file). I believe this is an issue with:
if __name__ == '__main__':
However, I am looking for something else. I tried to create a Launcher and a LauncherCount files like this:
LauncherCount.py:
def setLauncherCount(n):
    """Store ``n`` in this module's global ``LauncherCount``."""
    globals()['LauncherCount'] = n
and,
Launcher.py:
import os
import LauncherCount
LauncherCount.setLauncherCount(0)
os.system("OrientedFilterNoLoop.py")
LauncherCount.setLauncherCount(1)
os.system("OrientedFilterNoLoop.py")
...
I import LauncherCount.py, and use LauncherCount.LauncherCount as my loop index.
Of course, this doesn't work too as it edits the variable LauncherCount.LauncherCount locally, so it won't be edited in the imported version of LauncherCount.
Is there any way to edit globally a variable in an imported file? Or, is there any way to do this in any other way? What I need is running a code multiple times, in changing one value, and without using any loop apparently.
Thanks!
Edit: Here is my main code if necessary. Sorry for the bad style ...
import multiprocessing
import config
import time
import LauncherCount
class Filter:
    """ Filtering methods """

    def __init__(self):
        print("launching methods")

    # Return the list: [Latitude,Longitude] (elements are floating point numbers)
    def LatLong(self,line):
        """Parse the text between the 1st-2nd and 2nd-3rd commas as floats.

        Returns ``[0,0]`` when either piece is not a valid float.
        """
        first = line.find(',')
        second = line.find(',', first + 1)
        third = line.find(',', second + 1)
        try:
            return [float(line[first + 1:second]), float(line[second + 1:third])]
        except ValueError:
            return [0,0]

    # Return a boolean:
    # - True if the Lat/Long is within the Lat/Long rectangle defined by:
    # tupleFilter = (minLat,maxLat,minLong,maxLong)
    # - False if not
    def LatLongFilter(self,LatLongList , tupleFilter) :
        """Tell whether ``LatLongList`` lies inside the ``tupleFilter`` rectangle."""
        # Parenthesised so the two-line condition is one valid expression.
        inside = (tupleFilter[0] <= LatLongList[0] <= tupleFilter[1]
                  and tupleFilter[2] <= LatLongList[1] <= tupleFilter[3])
        return bool(inside)

    def writeLine(self,key,line):
        """Write ``line`` to the output file stored for ``key`` in the global ``filterDico``."""
        filterDico[key][1].write(line)
def filteringProcess(dico):
    """Read lines from the global ``readFile`` and write each to every matching output.

    ``dico`` maps a key to a sequence whose first element is the rectangle
    handed to ``Filter.LatLongFilter``. The loop ends at end of file or when
    ``readline`` raises ``ValueError``.
    """
    myFilter = Filter()
    while True:
        try:
            currentLine = readFile.readline()
        except ValueError:
            break
        if not currentLine: # Breaks at the end of the file
            break
        if len(currentLine) < 35: # Deletes wrong lines (too short)
            continue
        coordinates = myFilter.LatLong(currentLine)
        for key in dico:
            rectangle = dico[key][0]
            if myFilter.LatLongFilter(coordinates, rectangle):
                myFilter.writeLine(key, currentLine)
###########################################################################
# Main
###########################################################################
# Open read files:
readFile = open(config.readFileList[LauncherCount.LauncherCount][1], 'r')
# Generate writing files:
pathDico = {}
filterDico = config.filterDico
# Create outputs
for key in filterDico:
    # Parenthesised so the name can be built across two lines; a bare line
    # break in the middle of the expression is a syntax error.
    output_Name = (config.readFileList[LauncherCount.LauncherCount][0][:-4]
                   + '_' + key +'.log')
    pathDico[output_Name] = config.writingFolder + output_Name
    # Each entry becomes [original filter value, open output file].
    filterDico[key] = [filterDico[key],open(pathDico[output_Name],'w')]
p = []
CPUCount = multiprocessing.cpu_count()
CPURange = range(CPUCount)
startingTime = time.localtime()
if __name__ == '__main__':
    ### Create and start processes:
    for i in CPURange:
        p.append(multiprocessing.Process(target = filteringProcess ,
                                         args = (filterDico,)))
        p[i].start()
    ### Wait for processes:
    # join() blocks until each worker has finished, instead of polling
    # is_alive() in a tight loop that keeps one core busy while waiting.
    for worker in p:
        worker.join()
    readFile.close()
    for key in config.filterDico:
        config.filterDico[key][1].close()
        print(key,"is Done!")
    endTime = time.localtime()
    print("Process started at:",startingTime)
    print("And ended at:",endTime)
To process groups of files in sequence while working on files within a group in parallel:
#!/usr/bin/env python
from multiprocessing import Pool
def work_on(args):
    """Process a single file.

    ``args`` is an ``(index, filename)`` pair; the filename is reported and
    the index is returned.
    """
    index, name = args
    print("working on %s" % (name,))
    return index
def files():
    """Generate input filenames to work on."""
    #NOTE: you could read the file list from a file, get it using glob.glob, etc
    for name in ("inputfile1", "inputfile2"):
        yield name
def process_files(pool, filenames):
    """Process filenames using pool of processes.

    Every filename is numbered, handed to ``work_on`` through
    ``pool.imap_unordered``, and each result is printed as it arrives.
    Wait for results.
    """
    numbered = enumerate(filenames)
    #NOTE: in general the files won't be processed in the original order
    for outcome in pool.imap_unordered(work_on, numbered):
        print(outcome)
def main():
    """Process two groups of files one after the other, sharing a single pool."""
    workers = Pool()
    # to do "successive" multiprocessing
    batches = [files(), ['other', 'bunch', 'of', 'files']]
    for batch in batches:
        process_files(workers, batch)
if __name__=="__main__":
main()
Each process_files() call runs only after the previous one has completed, i.e., the files from different calls to process_files() are not processed in parallel.

Categories

Resources