Python 2.7: How to compensate for missing pool.starmap?

I have defined this function:

def writeonfiles(a, seed):
    random.seed(seed)
    f = open(a, "w+")
    for i in range(0, 10):
        j = random.randint(0, 10)
        #print j
        f.write(j)
    f.close()
Where a is a string containing the path of the file and seed is an integer seed.
I want to parallelize a simple program so that each core takes one of the paths I pass in, seeds its random generator, and writes some random numbers to that file. So, for example, if I pass the vector

vector = ["Test/file1.txt", "Test/file2.txt"]

and the seeds

seeds = (123412, 989898),

the first available core should run

writeonfiles("Test/file1.txt", 123412)

and the second one the same function with different arguments:

writeonfiles("Test/file2.txt", 989898)
I have looked through a lot of similar questions on Stack Overflow, but I cannot make any solution work.
What I tried is:
def writeonfiles_unpack(args):
    return writeonfiles(*args)

if __name__ == "__main__":
    folder = ["Test/%d.csv" % i for i in range(0, 4)]
    seed = [234124, 663123, 12345, 123833]
    p = multiprocessing.Pool()
    p.map(writeonfiles, (folder, seed))
and gives me TypeError: writeonfiles() takes exactly 2 arguments (1 given).
I also tried:
if __name__ == "__main__":
    folder = ["Test/%d.csv" % i for i in range(0, 4)]
    seed = [234124, 663123, 12345, 123833]
    p = multiprocessing.Process(target=writeonfiles, args=[folder, seed])
    p.start()
But it gives me

File "/usr/lib/python2.7/random.py", line 120, in seed
    super(Random, self).seed(a)
TypeError: unhashable type: 'list'
Finally, I tried the contextmanager approach:

@contextmanager
def poolcontext(*args, **kwargs):
    pool = multiprocessing.Pool(*args, **kwargs)
    yield pool
    pool.terminate()
if __name__ == "__main__":
    folder = ["Test/%d" % i for i in range(0, 4)]
    seed = [234124, 663123, 12345, 123833]
    a = zip(folder, seed)
    with poolcontext(processes=3) as pool:
        results = pool.map(writeonfiles_unpack, a)
and it results in

File "/usr/lib/python2.7/multiprocessing/pool.py", line 572, in get
    raise self._value
TypeError: 'module' object is not callable

Python 2.7 lacks the starmap pool method from Python 3.3+. You can work around this by decorating your target function with a wrapper that unpacks the argument tuple and calls the target function:
import os
from multiprocessing import Pool
import random
from functools import wraps

def unpack(func):
    @wraps(func)
    def wrapper(arg_tuple):
        return func(*arg_tuple)
    return wrapper

@unpack
def write_on_files(a, seed):
    random.seed(seed)
    print("%d opening file %s" % (os.getpid(), a))  # simulate
    for _ in range(10):
        j = random.randint(0, 10)
        print("%d writing %d to file %s" % (os.getpid(), j, a))  # simulate

if __name__ == '__main__':
    folder = ["Test/%d.csv" % i for i in range(0, 4)]
    seed = [234124, 663123, 12345, 123833]
    arguments = zip(folder, seed)
    pool = Pool(4)
    pool.map(write_on_files, iterable=arguments)
    pool.close()
    pool.join()
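For comparison, on Python 3.3+ the wrapper is unnecessary, because Pool.starmap unpacks the argument tuples itself. A minimal sketch, assuming write_on_files is left undecorated (i.e. it takes a and seed directly):

# Python 3.3+ only: starmap unpacks each (path, seed) tuple for us
if __name__ == '__main__':
    folder = ["Test/%d.csv" % i for i in range(0, 4)]
    seed = [234124, 663123, 12345, 123833]
    pool = Pool(4)
    pool.starmap(write_on_files, zip(folder, seed))
    pool.close()
    pool.join()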

Related


python ProcessPoolExecutor works in command lines but not running after adding to a function
It works like this:

from concurrent import futures

def multi_process(func, paras, threads):
    with futures.ProcessPoolExecutor(max_workers=threads) as pool:
        res = pool.map(func, paras, chunksize=threads)
        return list(res)

p = multi_process(func, paras, threads)
but it does not work at all when wrapped in an outer function like this:

def upper(paras, threads):
    def func(para):
        # some function body
        ...
    def multi_process(func, paras, threads):
        with futures.ProcessPoolExecutor(max_workers=threads) as pool:
            res = pool.map(func, paras, chunksize=threads)
            return list(res)
    p = multi_process(func, paras, threads)
    return p

p = upper(paras, threads)
There is no warning or error; it just hangs with no response for a long time.
You do get an error. It is:

AttributeError: Can't pickle local object 'upper.<locals>.func'
The reason is that, for multiprocessing to work, the function handed to the workers has to be picklable, which means it must be defined at the global (module) level.
To achieve what you want you can do the following:
from concurrent import futures

# Has to be a global function
def func(para):
    print(para)

def upper(paras, threads):
    # This cannot be a local function.
    # def func(para):
    #     print(para)
    def multi_process(func, paras, threads):
        with futures.ProcessPoolExecutor(max_workers=threads) as pool:
            res = pool.map(func, paras, chunksize=threads)
            return list(res)
    p = multi_process(func, paras, threads)
    return p

paras = [1, 2, 3]
threads = 3
p = upper(paras, threads)
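If the reason for nesting func was to give it access to extra, fixed parameters, one alternative (a sketch, not the only fix) is to keep the worker at module level and bind the extras with functools.partial, which ProcessPoolExecutor can still pickle. Here scale is a hypothetical fixed parameter, not something from the original question:

from concurrent import futures
from functools import partial

# Module-level worker; 'scale' is a hypothetical fixed parameter
def func(scale, para):
    return para * scale

def multi_process(worker, paras, threads):
    with futures.ProcessPoolExecutor(max_workers=threads) as pool:
        res = pool.map(worker, paras, chunksize=threads)
        return list(res)

if __name__ == "__main__":
    paras = [1, 2, 3]
    threads = 3
    # partial(func, 10) fixes scale=10; each worker call receives one para
    p = multi_process(partial(func, 10), paras, threads)
    print(p)  # [10, 20, 30]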

TypeError: 'MapResult' object is not iterable using pathos.multiprocessing

I'm running a spell correction function on a dataset I have. I used from pathos.multiprocessing import ProcessingPool as Pool to do the job. Once the processing is done, I'd like to actually access the results. Here is my code:
import codecs
import nltk
from textblob import TextBlob
from nltk.tokenize import sent_tokenize
from pathos.multiprocessing import ProcessingPool as Pool

class SpellCorrect():

    def load_data(self, path_1):
        with codecs.open(path_1, "r", "utf-8") as file:
            data = file.read()
        return sent_tokenize(data)

    def correct_spelling(self, data):
        data = TextBlob(data)
        return str(data.correct())

    def run_clean(self, path_1):
        pool = Pool()
        data = self.load_data(path_1)
        return pool.amap(self.correct_spelling, data)

if __name__ == "__main__":
    path_1 = "../Data/training_data/training_corpus.txt"
    SpellCorrect = SpellCorrect()
    result = SpellCorrect.run_clean(path_1)
    print(result)
    result = " ".join(temp for temp in result)
    with codecs.open("../Data/training_data/training_data_spell_corrected.txt", "a", "utf-8") as file:
        file.write(result)
If you look at the main block, when I do print(result) I get an object of type <multiprocess.pool.MapResult object at 0x1a25519f28>.
I try to access the results with result = " ".join(temp for temp in result), but then I get the following error: TypeError: 'MapResult' object is not iterable. I've tried casting it to a list with list(result), but I still get the same error. What can I do to fix this?
The multiprocess.pool.MapResult object is not iterable because it inherits from AsyncResult and has only the following methods:

wait([timeout])
    Wait until the result is available or until timeout seconds pass. This method always returns None.

ready()
    Return whether the call has completed.

successful()
    Return whether the call completed without raising an exception. Will raise AssertionError if the result is not ready.

get([timeout])
    Return the result when it arrives. If timeout is not None and the result does not arrive within timeout seconds then TimeoutError is raised. If the remote call raised an exception then that exception will be reraised as a RemoteError by get().
You can see examples of how to use the get() function here:
https://docs.python.org/2/library/multiprocessing.html#using-a-pool-of-workers
from multiprocessing import Pool, TimeoutError
import time
import os

def f(x):
    return x*x

if __name__ == '__main__':
    pool = Pool(processes=4)              # start 4 worker processes

    # print "[0, 1, 4,..., 81]"
    print pool.map(f, range(10))

    # print same numbers in arbitrary order
    for i in pool.imap_unordered(f, range(10)):
        print i

    # evaluate "f(20)" asynchronously
    res = pool.apply_async(f, (20,))      # runs in *only* one process
    print res.get(timeout=1)              # prints "400"

    # evaluate "os.getpid()" asynchronously
    res = pool.apply_async(os.getpid, ()) # runs in *only* one process
    print res.get(timeout=1)              # prints the PID of that process

    # launching multiple evaluations asynchronously *may* use more processes
    multiple_results = [pool.apply_async(os.getpid, ()) for i in range(4)]
    print [res.get(timeout=1) for res in multiple_results]

    # make a single worker sleep for 10 secs
    res = pool.apply_async(time.sleep, (10,))
    try:
        print res.get(timeout=1)
    except TimeoutError:
        print "We lacked patience and got a multiprocessing.TimeoutError"

Copy parameters into list

I am trying to copy parameters passed into a Python script to a file. Here are the parameters:

["0013","1","1","\"john.dow@gmail.com\"","1","P123-ND 10Q","10Q H??C"]

I understand that there is a buffer problem and I am getting bad data into my parameters. However, I do not have control over what is being passed in. I am trying to copy the parameters, starting at the 5th one, into a file.
f = open(in_file_name, 'w')
for x in range(5, len(arg_list)):
    f.write(arg_list[x] + '\n')
f.close()
The resulting file looks like this:
P123-ND 10Q
10Q H??C
Here is what it should be:
P123-ND
10Q
How can I not include the bad data? What is happening to the spaces between the valid information and the bad information?
As requested, here is the full program:
#!/bin/python

class Argument_Indices:
    PRINTER_INDEX = 0
    AREA_INDEX = 1
    LABEL_INDEX = 2
    EMAIL_INDEX = 3
    RUN_TYPE_INDEX = 4

import argparse
import json
import os
from subprocess import call
import sys
from time import strftime

def _handle_args():
    ''' Set up and run argparse '''
    parser = argparse.ArgumentParser(description='Set environment variables for and to call Program')
    parser.add_argument('time_to_run', default='NOW', choices=['NOW', 'EOP'], help='when to run the report')
    parser.add_argument('arguments', nargs='+', help='the remaining command line arguments')
    return parser.parse_args()

def _process_program(arg_list):
    time_stamp = strftime("%d_%b_%Y_%H_%M_%S")
    printer = arg_list[Argument_Indices.PRINTER_INDEX]
    area = arg_list[Argument_Indices.AREA_INDEX]
    label = arg_list[Argument_Indices.LABEL_INDEX]
    in_file_name = "/tmp/program{0}.inp".format(time_stamp)
    os.environ['INPUT_FILE'] = in_file_name
    f = open(in_file_name, 'w')
    for x in range(5, len(arg_list)):
        f.write(arg_list[x])
    f.close()
    call(['./Program.bin', printer, area, label])
    os.remove(in_file_name)

def main():
    ''' Main Function '''
    arg_list = None
    args = _handle_args()
    if len(args.arguments) < 1:
        print('Missing name of input file')
        return -1
    with open(args.arguments[0]) as input_file:
        arg_list = json.load(input_file)
    _process_program(arg_list)
    return 0

if __name__ == '__main__':
    if main() != 0:
        print('Program run failed')
        sys.exit()
For your exact case (where you're getting duplicated parameters received with some spaces in between) this would work:
received_param_list = ["0013","1","1","\"john.dow@gmail.com\"","1","P123-ND 10Q","10Q H??C"]
arg_list = [i.split(" ")[0] for i in received_param_list]
last_param = received_param_list[-1].split()[-1]
if last_param != arg_list[-1]:
    arg_list.append(last_param)

for x in range(5, len(arg_list)):
    print(arg_list[x])
Although there might be another simpler way
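One such simpler variant, if the intent is just to keep the first whitespace-delimited token of each parameter from the 5th one onward (an assumption about what counts as "bad data" here), is to trim inside the original write loop:

f = open(in_file_name, 'w')
for x in range(5, len(arg_list)):
    # split() discards whatever follows the first run of whitespace
    f.write(arg_list[x].split()[0] + '\n')
f.close()

With the parameters from the question this writes P123-ND and 10Q, matching the expected output.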

Python/Multiprocessing: Processes do not seem to start

I have a function which reads a binary file and converts each byte into a corresponding sequence of characters. For example, 0x05 becomes 'AACC', 0x2A becomes 'AGGG', etc. The function that reads the file and converts the bytes is currently linear, and since the files to convert are anywhere between 25 kB and 2 MB, this can take quite a while.
Therefore, I'm trying to use multiprocessing to divide the task and hopefully improve speed. However, I just can't get it to work. Below is the linear function, which works, albeit slowly:
def fileToRNAString(_file):
    if (_file and os.path.isfile(_file)):
        rnaSequences = []
        blockCount = 0
        blockSize = 2048
        printAndLog("!", "Converting %s into RNA string (%d bytes/block)" % (_file, blockSize))
        with open(_file, "rb") as hFile:
            buf = hFile.read(blockSize)
            while buf:
                decSequenceToRNA(blockCount, buf, rnaSequences)
                blockCount = blockCount + 1
                buf = hFile.read(blockSize)
    else:
        printAndLog("-", "Could not find the specified file. Please verify that the file exists:" + _file)
    return rnaSequences
Note: the function decSequenceToRNA takes the buffer read and converts each byte to the required string. Upon execution, the function produces a tuple containing the block number and the string, e.g. (1, 'ACCGTAGATTA...'), and at the end I have an array of these tuples available.
I've tried to convert the function to use Python's multiprocessing:
def fileToRNAString(_file):
    rnaSequences = []
    if (_file and os.path.isfile(_file)):
        blockCount = 0
        blockSize = 2048
        printAndLog("!", "Converting %s into RNA string (%d bytes/block)" % (_file, blockSize))
        workers = []
        with open(_file, "rb") as hFile:
            buf = hFile.read(blockSize)
            while buf:
                p = Process(target=decSequenceToRNA, args=(blockCount, buf, rnaSequences))
                p.start()
                workers.append(p)
                blockCount = blockCount + 1
                buf = hFile.read(blockSize)
        for p in workers:
            p.join()
    else:
        printAndLog("-", "Could not find the specified file. Please verify that the file exists:" + _file)
    return rnaSequences
However, no process seems to even start: when this function is run, an empty array is returned, and none of the messages printed to the console in decSequenceToRNA are displayed:
>>>fileToRNAString(testfile)
[!] Converting /root/src/amino56/M1H2.bin into RNA string (2048 bytes/block).
Unlike this question here, I'm running Linux shiva 3.14-kali1-amd64 #1 SMP Debian 3.14.5-1kali1 (2014-06-07) x86_64 GNU/Linux and using PyCrust to test the functions on Python Version: 2.7.3. I'm using the following packages:
import os
import re
import sys
import urllib2
import requests
import logging
import hashlib
import argparse
import tempfile
import shutil
import feedparser
from multiprocessing import Process
I'd like help figuring out why my code does not work, or whether I'm missing something elsewhere to make the Process work. I'm also open to suggestions for improving the code. Below is decSequenceToRNA for reference:
def decSequenceToRNA(_idxSeq, _byteSequence, _rnaSequences):
    rnaSequence = ''
    printAndLog("!", "Processing block %d (%d bytes)" % (_idxSeq, len(_byteSequence)))
    for b in _byteSequence:
        rnaSequence = rnaSequence + base10ToRNA(ord(b))
    printAndLog("+", "Block %d completed. RNA of %d nucleotides generated." % (_idxSeq, len(rnaSequence)))
    _rnaSequences.append((_idxSeq, rnaSequence))
decSequenceToRNA is running in its own process, which means it gets its own, separate copy of every data structure in the main process. That means that when you append to _rnaSequences in decSequenceToRNA, it has no effect on rnaSequences in the parent process. That would explain why an empty list is being returned.
You have two options to address this. The first is to create a list that can be shared between processes, using multiprocessing.Manager. For example:
import multiprocessing

def f(shared_list):
    shared_list.append(1)

if __name__ == "__main__":
    normal_list = []
    p = multiprocessing.Process(target=f, args=(normal_list,))
    p.start()
    p.join()
    print(normal_list)

    m = multiprocessing.Manager()
    shared_list = m.list()
    p = multiprocessing.Process(target=f, args=(shared_list,))
    p.start()
    p.join()
    print(shared_list)
Output:
[] # Normal list didn't work, the appended '1' didn't make it to the main process
[1] # multiprocessing.Manager() list works fine
Applying this to your code would just require replacing
rnaSequences = []
With
m = multiprocessing.Manager()
rnaSequences = m.list()
Alternatively, you could (and probably should) use a multiprocessing.Pool instead of creating an individual Process for each chunk. I'm not sure how large hFile is or how big the chunks you're reading are, but if there are more than multiprocessing.cpu_count() chunks, you're going to hurt performance by spawning a process for every chunk. Using a Pool, you can keep your process count constant and easily build your rnaSequences list:
def decSequenceToRNA(_idxSeq, _byteSequence):
    rnaSequence = ''
    printAndLog("!", "Processing block %d (%d bytes)" % (_idxSeq, len(_byteSequence)))
    for b in _byteSequence:
        rnaSequence = rnaSequence + base10ToRNA(ord(b))
    printAndLog("+", "Block %d completed. RNA of %d nucleotides generated." % (_idxSeq, len(rnaSequence)))
    return _idxSeq, rnaSequence

def fileToRNAString(_file):
    rnaSequences = []
    if (_file and os.path.isfile(_file)):
        blockCount = 0
        blockSize = 2048
        printAndLog("!", "Converting %s into RNA string (%d bytes/block)" % (_file, blockSize))
        results = []
        pool = multiprocessing.Pool()  # Creates a pool of cpu_count() processes
        with open(_file, "rb") as hFile:
            buf = hFile.read(blockSize)
            while buf:
                result = pool.apply_async(decSequenceToRNA, (blockCount, buf))
                results.append(result)
                blockCount = blockCount + 1
                buf = hFile.read(blockSize)
        rnaSequences = [r.get() for r in results]
        pool.close()
        pool.join()
    else:
        printAndLog("-", "Could not find the specified file. Please verify that the file exists:" + _file)
    return rnaSequences
Note that we no longer pass the rnaSequences list to the child. Instead, we just return the result we would have appended back to the parent (which we can't do with Process), and build the list there.
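As a variation, the same result can be collected with pool.map and a module-level wrapper that unpacks (blockCount, buf) tuples, much like the unpack decorator in the first answer above. A sketch, assuming the blocks fit comfortably in memory and decSequenceToRNA is the returning version shown here:

def _decSequenceToRNA_unpack(args):
    # Module-level so it can be pickled by multiprocessing
    return decSequenceToRNA(*args)

def fileToRNAString(_file):
    blockSize = 2048
    blocks = []
    with open(_file, "rb") as hFile:
        blockCount = 0
        buf = hFile.read(blockSize)
        while buf:
            blocks.append((blockCount, buf))
            blockCount = blockCount + 1
            buf = hFile.read(blockSize)
    pool = multiprocessing.Pool()
    try:
        # One (index, rnaSequence) tuple per block, in block order
        return pool.map(_decSequenceToRNA_unpack, blocks)
    finally:
        pool.close()
        pool.join()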
Try writing this (comma at the end of the parameter list)
p = Process(target=decSequenceToRNA, args=(blockCount, buf, rnaSequences,))

Successive multiprocessing

I am filtering huge text files using multiprocessing.py. The code basically opens a text file, works on it, then closes it.
Thing is, I'd like to be able to launch it successively on multiple text files. Hence, I tried to add a loop, but for some reason it doesn't work (while the code works on each file). I believe this is an issue with:
if __name__ == '__main__':
However, I am looking for something else. I tried to create Launcher and LauncherCount files like this:
LauncherCount.py:
def setLauncherCount(n):
    global LauncherCount
    LauncherCount = n
and,
Launcher.py:
import os
import LauncherCount
LauncherCount.setLauncherCount(0)
os.system("OrientedFilterNoLoop.py")
LauncherCount.setLauncherCount(1)
os.system("OrientedFilterNoLoop.py")
...
I import LauncherCount.py, and use LauncherCount.LauncherCount as my loop index.
Of course, this doesn't work either, as it edits the variable LauncherCount.LauncherCount locally, so it won't be edited in the imported version of LauncherCount.
Is there any way to globally edit a variable in an imported file? Or is there another way to do this? What I need is to run the code multiple times while changing one value, and apparently without using any loop.
Thanks!
Edit: Here is my main code if necessary. Sorry for the bad style ...
import multiprocessing
import config
import time
import LauncherCount

class Filter:
    """ Filtering methods """

    def __init__(self):
        print("launching methods")

    # Return the list: [Latitude,Longitude] (elements are floating point numbers)
    def LatLong(self, line):
        comaCount = []
        comaCount.append(line.find(','))
        comaCount.append(line.find(',', comaCount[0] + 1))
        comaCount.append(line.find(',', comaCount[1] + 1))
        Lat = line[comaCount[0] + 1 : comaCount[1]]
        Long = line[comaCount[1] + 1 : comaCount[2]]
        try:
            return [float(Lat), float(Long)]
        except ValueError:
            return [0, 0]

    # Return a boolean:
    # - True if the Lat/Long is within the Lat/Long rectangle defined by:
    #   tupleFilter = (minLat,maxLat,minLong,maxLong)
    # - False if not
    def LatLongFilter(self, LatLongList, tupleFilter):
        if (tupleFilter[0] <= LatLongList[0] <= tupleFilter[1] and
                tupleFilter[2] <= LatLongList[1] <= tupleFilter[3]):
            return True
        else:
            return False

    def writeLine(self, key, line):
        filterDico[key][1].write(line)

def filteringProcess(dico):
    myFilter = Filter()
    while True:
        try:
            currentLine = readFile.readline()
        except ValueError:
            break
        if len(currentLine) == 0:   # Breaks at the end of the file
            break
        if len(currentLine) < 35:   # Deletes wrong lines (too short)
            continue
        LatLongList = myFilter.LatLong(currentLine)
        for key in dico:
            if myFilter.LatLongFilter(LatLongList, dico[key][0]):
                myFilter.writeLine(key, currentLine)

###########################################################################
# Main
###########################################################################

# Open read files:
readFile = open(config.readFileList[LauncherCount.LauncherCount][1], 'r')

# Generate writing files:
pathDico = {}
filterDico = config.filterDico

# Create outputs
for key in filterDico:
    output_Name = (config.readFileList[LauncherCount.LauncherCount][0][:-4]
                   + '_' + key + '.log')
    pathDico[output_Name] = config.writingFolder + output_Name
    filterDico[key] = [filterDico[key], open(pathDico[output_Name], 'w')]

p = []
CPUCount = multiprocessing.cpu_count()
CPURange = range(CPUCount)
startingTime = time.localtime()

if __name__ == '__main__':
    ### Create and start processes:
    for i in CPURange:
        p.append(multiprocessing.Process(target=filteringProcess,
                                         args=(filterDico,)))
        p[i].start()

    ### Kill processes:
    while True:
        if [p[i].is_alive() for i in CPURange] == [False for i in CPURange]:
            readFile.close()
            for key in config.filterDico:
                config.filterDico[key][1].close()
                print(key, "is Done!")
            endTime = time.localtime()
            break

    print("Process started at:", startingTime)
    print("And ended at:", endTime)
To process groups of files in sequence while working on files within a group in parallel:
#!/usr/bin/env python
from multiprocessing import Pool

def work_on(args):
    """Process a single file."""
    i, filename = args
    print("working on %s" % (filename,))
    return i

def files():
    """Generate input filenames to work on."""
    #NOTE: you could read the file list from a file, get it using glob.glob, etc
    yield "inputfile1"
    yield "inputfile2"

def process_files(pool, filenames):
    """Process filenames using pool of processes.

    Wait for results.
    """
    for result in pool.imap_unordered(work_on, enumerate(filenames)):
        #NOTE: in general the files won't be processed in the original order
        print(result)

def main():
    p = Pool()

    # to do "successive" multiprocessing
    for filenames in [files(), ['other', 'bunch', 'of', 'files']]:
        process_files(p, filenames)

if __name__ == "__main__":
    main()
Each process_files() call runs in sequence after the previous one has completed, i.e., the files from different calls to process_files() are not processed in parallel.
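If each file also needs its own setting (the role LauncherCount plays in the question), that value can travel in the argument tuple instead of a module-level global. A minimal sketch, assuming one settings value per file:

def work_on(args):
    """Process a single file with its own setting."""
    filename, setting = args
    print("working on %s with setting %r" % (filename, setting))
    return filename

def process_files(pool, filenames, settings):
    for result in pool.imap_unordered(work_on, zip(filenames, settings)):
        print(result)

Called as process_files(p, ['file1', 'file2'], [0, 1]), each worker receives its own (filename, setting) pair, so no shared global has to be edited between runs.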
