Cycle an iterator using multiprocessing in Python - python

I have an iterator that will retrive various number of lines from a very large (>20GB) file depend on some features. The iterator works fine, but I can only use 1 thread to process the result. I would like to feed the value from each iteration to multiple threads / processes.
I'm using a text file with 9 lines to mimic my data, here is my code. I've been struggling on how to create the feedback so when one process finished, it will go and retrive the next iteration:
from multiprocessing import Process, Manager
import time
# Iterator
class read_file(object):
def __init__(self, filePath):
self.file = open(filePath, 'r')
def __iter__(self):
return self
def __next__(self):
line = self.file.readline()
if line:
return line
else:
raise StopIteration
# worker for one process
def print_worker(a, n, stat):
print(a)
stat[n] = True # Set the finished status as True
return None
# main
def main():
file_path = 'tst_mp.txt' # the txt file wit 9 lines
n_worker = 2
file_handle = read_file(file_path)
workers = []
# Create shared list for store dereplicated dict and progress counter
manager = Manager()
status = manager.list([False] * 2) # list of dictonary for each thread
# Initiate the workers
for i in range(n_worker):
workers.append(Process(target=print_worker, args=(file_handle.__next__(), i, status,)))
for worker in workers:
worker.start()
block = file_handle.__next__() # The next block (line)
while block: # continue is there is still block left
print(status)
time.sleep(1) # for every second
for i in range(2):
if status[i]: # Worker i finished
workers[i].join()
# workers[i].close()
workers[i] = Process(target=print_worker, args=(block, i, status,))
status[i] = False # Set worker i as busy (False)
workers[i].start() # Start worker i
try: # try to get the next item in the iterator
block = file_handle.__next__()
except StopIteration:
block = False
if __name__ == '__main__':
main()
The code is clumsy, but it did print out the sequence, but also with some error when I ran the code twice:
1
2
3
4
5
6
7
8
9
Process Process-10:
Traceback (most recent call last):
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 802, in _callmethod
conn = self._tls.connection
AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/zewei/share/paf_depth/test_multiprocess.py", line 31, in print_worker
stat[n] = True # Set the finished status as True
File "<string>", line 2, in __setitem__
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 806, in _callmethod
self._connect()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 794, in _connect
dispatch(conn, None, 'accept_connection', (name,))
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 90, in dispatch
kind, result = c.recv()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/connection.py", line 255, in recv
buf = self._recv_bytes()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
buf = self._recv(4)
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/connection.py", line 384, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
Here is where I'm stucked, I was wondering if there is any fix or more elegant way for this?
Thanks!

Here's a better way to do what you are doing, using pool:
from multiprocessing import Pool
import time
.
.
.
.
# worker for one process
def print_worker(a):
print(a)
return None
def main():
file_path = r'' # the txt file wit 9 lines
n_worker = 2
file_handle = read_file(file_path)
results = []
with Pool(n_worker) as pool:
for result in pool.imap(print_worker, file_handle):
results.append(result)
print(results)
if __name__ == '__main__':
main()
Here, the imap function lazily iterates over the iterator, so that the whole file won't be read into memory. Pool handles spreading the tasks across the number of processes you started (using n_worker) automatically so that you don't have to manage it yourself.

Related

Semaphore stuck in python code, whats wrong?

I am trying to copy some files to an OCI bucket (Oracle Cloud Infrastructure).
The fist 5 files are succefully copied, but then the script hangs and the processes on the task manager dies, remaining only the main one.
from array import array
from pathlib import Path
import oci
import datetime
from multiprocessing import Process
import threading
import logging
from oci.object_storage import UploadManager
from oci.object_storage.models import CreateBucketDetails
from oci.object_storage.transfer.constants import MEBIBYTE
logging.basicConfig(filename=r'############',filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)
# Number of max processes allowed at a time
concurrency= 5
sema = threading.BoundedSemaphore(concurrency)
# The root directory path, Replace with your path
p = Path(r"#####")
# The Compartment OCID
compartment_id = "#######"
# The Bucket name where we will upload
bucket_name = "######"
config = oci.config.from_file()
object_storage_client = oci.object_storage.ObjectStorageClient(config)
part_size = 2 * MEBIBYTE
today = datetime.date.today()
today = str(today)
def upload_to_object_storage(path:str,name:str,namespace):
#upload_manager = UploadManager(object_storage_client, allow_parallel_uploads=False)
with open(path, "rb") as in_file:
logging.info("Starting upload {}".format(name))
object_storage_client.put_object(namespace,bucket_name,name,in_file)
#upload_manager.upload_file(namespace, bucket_name, name, in_file.name, part_size=part_size)
logging.info("Finished uploading {}".format(name))
sema.release()
return
def createUploadProcess(object:Path,object_storage_client,namespace,proc_list):
name = object.relative_to(p).as_posix()
sema.acquire()
process = Process(target=upload_to_object_storage, args=(object.as_posix(),name,namespace))
proc_list.append(process)
process.start()
def processDirectoryObjects(object:Path,object_storage_client,namespace,proc_list):
if object.is_file():
createUploadProcess(object,object_storage_client,namespace,proc_list)
def processDirectory(path:Path,object_storage_client,namespace,proc_list):
if path.exists():
logging.info("in directory ---- " + path.relative_to(p).as_posix())
for objects in path.iterdir():
if objects.is_dir():
processDirectory(objects,object_storage_client,namespace,proc_list)
else:
if today in objects.name:
processDirectoryObjects(objects,object_storage_client,namespace,proc_list)
if __name__ == '__main__':
config = config
object_storage_client = object_storage_client
sema = sema
namespace = object_storage_client.get_namespace().data
proc_list: array = []
if p.exists() and p.is_dir():
processDirectory(p,object_storage_client,namespace,proc_list)
for job in proc_list:
job.join()
I have aproximaly 50 files to copy, but it uploads 5 and then hangs. The execution presents the following error for the 5 processes:
Process Process-1:
Traceback (most recent call last):
File "C:\Users\#######\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py", line 258, in _bootstrap
self.run()
File "C:\Users\#######\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "C:\Users\#######\Documents\copia_bkp_oci2.py", line 49, in upload_to_object_storage
sema.release()
File "C:\Users\#######\AppData\Local\Programs\Python\Python36\lib\threading.py", line 482, in release
raise ValueError("Semaphore released too many times")
ValueError: Semaphore released too many times

How do I nest multiprocessing in multiprocessing, with common variables (python)?

I have a function which is running twice in two parallel processes. Lets call it - parentFunction().
Each process ends with a dictionary which is added to a common list which gives a list of two dictionaries. This I solved by using preset list using manager.
Now, inside parentFunction() L would like to run two parallel processes, each gives one variable to the dictionary. I tried to do this with preset dictionary using manager
At the end I`m converting the list of dictionaries to pandas data frame.
def I(D, a):
D["a"] = a
def II(D, b):
D["a"] = b
def task(L, x):
x = 0
a = 1
b = 2
manager = Manager()
D = manager.dict() # <-- can be shared between processes.
pI = Process(target=I, args=(D, 0))
pII = Process(target=II, args=(D, 0))
pI.start()
pII.start()
pI.join()
pII.join()
L.append(D)
if __name__ == "__main__":
with Manager() as manager:
L = manager.list() # <-- can be shared between processes.
p1 = Process(target=task, args=(L, 0)) # Passing the list
p2 = Process(target=task, args=(L, 0)) # Passing the list
p1.start()
p2.start()
p1.join()
p2.join()
print(L)
returns error:
TypeError: task() missing 1 required positional argument: 'L'
Traceback (most recent call last):
File "C:\Users\user\AppData\Roaming\JetBrains\PyCharmCE2021.2\scratches\scratch_8.py", line 88, in <module>
print(list(L))
File "<string>", line 2, in __getitem__
File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\multiprocessing\managers.py", line 810, in _callmethod
kind, result = conn.recv()
File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\multiprocessing\connection.py", line 256, in recv
return _ForkingPickler.loads(buf.getbuffer())
File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\multiprocessing\managers.py", line 934, in RebuildProxy
return func(token, serializer, incref=incref, **kwds)
File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\multiprocessing\managers.py", line 784, in __init__
self._incref()
File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\multiprocessing\managers.py", line 838, in _incref
conn = self._Client(self._token.address, authkey=self._authkey)
File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\multiprocessing\connection.py", line 505, in Client
c = PipeClient(address)
File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\multiprocessing\connection.py", line 707, in PipeClient
_winapi.WaitNamedPipe(address, 1000)
FileNotFoundError: [WinError 2] The system cannot find the file specified
```
The source you posted does not seem to match your stack trace. You would only get a FileNotFoundException when the main process tries to enumerate any objects within list L with a statement such as print(list(L)), which I see in the stack trace but not in your code. It helps when you post the actual code causing the exception. But here is the cause of your problem:
When you create a new manager with the call manager = Manager() a new process is created and any objects that are created via the manager "live" in the same address space and process as that manager. You are creating two manager processes, once in the main process and once in the child process task. It is in the latter that the dictionary, D is created. When that process terminates the manager process terminates too along with any objects created by that manager. So when the main process attempts to print the list L, the proxy object within it, D, no longer points to an existing object. The solution is to have the main process create the dictionary, D, and pass it to the task child process:
from multiprocessing import Process, Manager
def I(D, a):
D["a"] = a
def II(D, b):
D["a"] = b
def task(L, D, x):
x = 0
a = 1
b = 2
pI = Process(target=I, args=(D, 0))
pII = Process(target=II, args=(D, 0))
pI.start()
pII.start()
pI.join()
pII.join()
L.append(D)
if __name__ == "__main__":
with Manager() as manager:
L = manager.list() # <-- can be shared between processes.
D = manager.dict() # <-- can be shared between processes.
p = Process(target=task, args=(L, D, 0)) # Passing the list
p.start()
p.join()
print(L[0])
Prints:
{'a': 0}

Multiprocessing using partial() throws ForkingPickler error

I am trying to crawl abstracts from PubMed and filtering them using regex via python. To speed things up, I wanted to use pythons multiprocessing pool.
My code looks like the following:
import multiprocessing as mp
from functools import partial
from typing import List, Tuple
def beautify_abstract(abstract: str, regex: str):
import re
result: str = ""
last_start = 0
matches = re.finditer(regex, abstract, re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
result += abstract[last_start:match.start()]
result += "<b>"
result += abstract[match.start():match.end()]
result += "</b>"
last_start = match.end()
result += abstract[last_start:]
return result
def get_doi(pim: str, regex: str):
from Bio import Entrez
from Bio.Entrez import efetch
import re
from metapub.convert import pmid2doi
Entrez.email = "Your.Name.Here#example.org"
print(f"Processing {pim}")
abstract_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='all')
abstract = abstract_handle.read()
abstract_handle.close()
if re.search(regex, abstract, re.MULTILINE) is not None:
docsum_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='docsum').read()
docsum = docsum_handle.read()
try:
doi = pmid2doi(pim)
except:
doi = "UNKNOWN"
return f"{doi}"
return ""
def get_pim_with_regex_list(keywords: List[str]) -> List[str]:
from Bio import Entrez
Entrez.email = "Your.Name.Here#example.org"
searchterm = " ".join(keywords)
pims = []
handle = Entrez.esearch(db="pubmed", retstart=0, retmax=0, term=searchterm, idtype="acc")
record = Entrez.read(handle)
handle.close()
count = int(record['Count'])
if count > 100000:
retmax = 100000
else:
retmax = count
retstart = 0
while retstart < count:
handle = Entrez.esearch(db="pubmed", retstart=retstart, retmax=retmax, term=searchterm, idtype="acc")
record = Entrez.read(handle)
handle.close()
for pim in record['IdList']:
pims.append(pim)
retstart += retmax
return pims
if __name__ == '__main__':
keywords = ["keyword1", "keyword2"]
pim_list = get_pim_with_regex_list(keywords)
regex = "keyword1 keyword2"
worker_fn = partial(get_doi, regex=regex)
pool = mp.Pool(mp.cpu_count())
entries = pool.map(worker_fn, pim_list)
pool.close()
pool.join()
When I run the given code, I get the following error:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
task = get()
File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
Process ForkPoolWorker-4:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
task = get()
File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
I did some digging into multiprocessing with python and found out that only python native types are supported as parameters (enforced by the ForkingPickler).
Assuming that str is a native type, the code should work... Currently, I am completely lost and have no idea what may be the problem.
As suggested, I uploaded a minimal (sequential) working example here
Is there any way to fix this problem or at least diagnose the real issue here?

Multiprocess, various process reading the same file

I am trying to simulate some dna-sequencing reads, and,in order to speed-up the code, I am in need to run it on parallel.
Basically, what I am trying to do is the following:I am sampling reads from the human genome, and I think that one the two process from multiprocessing module try to get data from the same file (the human genome) the processes gets corrupted and it is not able to get the desired DNA sequence. I have tried different things, but I am very new to parallel programming and I cannot solve my problem
When I run the script with one core it works fine.
This is the way I am calling the function
if __name__ == '__main__':
jobs = []
# init the processes
for i in range(number_of_cores):
length= 100
lock = mp.Manager().Lock()
p = mp.Process(target=simulations.sim_reads,args=(lock,FastaFile, "/home/inigo/msc_thesis/genome_data/hg38.fa",length,paired,results_dir,spawn_reads[i],temp_file_names[i]))
jobs.append(p)
p.start()
for p in jobs:
p.join()
And this is the function I am using to get the reads, were each process writes the data to a different file.
def sim_single_end(lc,fastafile,chr,chr_pos_start,chr_pos_end,read_length, unique_id):
lc.acquire()
left_split_read = fastafile.fetch(chr, chr_pos_end - (read_length / 2), chr_pos_end)
right_split_read = fastafile.fetch(chr, chr_pos_start, chr_pos_start + (read_length / 2))
reversed_left_split_read = left_split_read[::-1]
total_read = reversed_left_split_read + right_split_read
seq_id = "id:%s-%s|left_pos:%s-%s|right:%s-%s " % (unique_id,chr, int(chr_pos_end - (read_length / 2)), int(chr_pos_end), int(chr_pos_start),int(chr_pos_start + (read_length / 2)))
quality = "I" * read_length
fastq_string = "#%s\n%s\n+\n%s\n" % (seq_id, total_read, quality)
lc.release()
new_record = SeqIO.read(StringIO(fastq_string), "fastq")
return(new_record)
Here is the traceback:
Traceback (most recent call last):
File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/home/inigo/Dropbox/PycharmProjects/circ_dna/simulations.py", line 107, in sim_ecc_reads
new_read = sim_single_end(lc,fastafile, chr, chr_pos_start, chr_pos_end, read_length, read_id)
File "/home/inigo/Dropbox/PycharmProjects/circ_dna/simulations.py", line 132, in sim_single_end
new_record = SeqIO.read(StringIO(fastq_string), "fastq")
File "/usr/local/lib/python3.5/dist-packages/Bio/SeqIO/__init__.py", line 664, in read
first = next(iterator)
File "/usr/local/lib/python3.5/dist-packages/Bio/SeqIO/__init__.py", line 600, in parse
for r in i:
File "/usr/local/lib/python3.5/dist-packages/Bio/SeqIO/QualityIO.py", line 1031, in FastqPhredIterator
for title_line, seq_string, quality_string in FastqGeneralIterator(handle):
File "/usr/local/lib/python3.5/dist-packages/Bio/SeqIO/QualityIO.py", line 951, in FastqGeneralIterator
% (title_line, seq_len, len(quality_string)))
ValueError: Lengths of sequence and quality values differs for id:6-chr1_KI270707v1_random|left_pos:50511537-50511587|right:50511214-50511264 (0 and 100).
I am the OP of this answer that I did almost a year ago. The problem was that the package that I was using for reading the human genome file (pysam) was failing. The issue was a typo when calling multiprocessing.
From the authors respose, this should work:
p = mp.Process(target=get_fasta, args=(genome_fa,))
note the ',' to ensure you pass a tuple
See https://github.com/pysam-developers/pysam/issues/409 for more details

Python multiprocessing "Bad file descriptor" error (not repeatable)

Apologies in advance, but I am unable to post a fully working example (too much overhead in this code to distill to a runnable snippet). I will post as much explanatory detail as I can, and please do let me know if anything critical seems missing.
Running Python 2.7.5 through IDLE
I am writing a program to compare two text files. Since the files can be large (~500MB) and each row comparison is independent, I would like to implement multiprocessing to speed up the comparison. This is working pretty well, but I am getting stuck on a pseudo-random Bad file descriptor error. I am new to multiprocessing, so I guess there is a technical problem with my implementation. Can anyone point me in the right direction?
Here is the code causing the trouble (specifically the pool.map):
# openfiles
csvReaderTest = csv.reader(open(testpath, 'r'))
csvReaderProd = csv.reader(open(prodpath, 'r'))
compwriter = csv.writer(open(outpath, 'wb'))
pool = Pool()
num_chunks = 3
chunksTest = itertools.groupby(csvReaderTest, keyfunc)
chunksProd = itertools.groupby(csvReaderProd, keyfunc)
while True:
# make a list of num_chunks chunks
groupsTest = [list(chunk) for key, chunk in itertools.islice(chunksTest, num_chunks)]
groupsProd = [list(chunk) for key, chunk in itertools.islice(chunksProd, num_chunks)]
# merge the two lists (pair off comparison rows)
groups_combined = zip(groupsTest,groupsProd)
if groups_combined:
# http://stackoverflow.com/questions/5442910/python-multiprocessing-pool-map-for-multiple-arguments
a_args = groups_combined # a list - set of combinations to be tested
second_arg = True
worker_result = pool.map(worker_mini_star, itertools.izip(itertools.repeat(second_arg),a_args))
Here is the full error output. (This error sometimes occurs, and other times the comparison runs to finish without problems):
Traceback (most recent call last):
File "H:/<PATH_SNIP>/python_csv_compare_multiprocessing_rev02_test2.py", line 407, in <module>
main(fileTest, fileProd, fileout, stringFields, checkFileLengths)
File "H:/<PATH_SNIP>/python_csv_compare_multiprocessing_rev02_test2.py", line 306, in main
worker_result = pool.map(worker_mini_star, itertools.izip(itertools.repeat(second_arg),a_args))
File "C:\Python27\lib\multiprocessing\pool.py", line 250, in map
return self.map_async(func, iterable, chunksize).get()
File "C:\Python27\lib\multiprocessing\pool.py", line 554, in get
raise self._value
IOError: [Errno 9] Bad file descriptor
If it helps, here are the functions called by pool.map:
def worker_mini(flag, chunk):
row_comp = []
for entry, entry2 in zip(chunk[0][0], chunk[1][0]):
if entry == entry2:
temp_comp = entry
else:
temp_comp = '%s|%s' % (entry, entry2)
row_comp.append(temp_comp)
return True, row_comp
#takes a single tuple argument and unpacks the tuple to multiple arguments
def worker_mini_star(flag_chunk):
"""Convert `f([1,2])` to `f(1,2)` call."""
return worker_mini(*flag_chunk)
def main():

Categories

Resources