Apologies in advance, but I am unable to post a fully working example (too much overhead in this code to distill to a runnable snippet). I will post as much explanatory detail as I can, and please do let me know if anything critical seems missing.
Running Python 2.7.5 through IDLE
I am writing a program to compare two text files. Since the files can be large (~500MB) and each row comparison is independent, I would like to implement multiprocessing to speed up the comparison. This is working pretty well, but I am getting stuck on a pseudo-random Bad file descriptor error. I am new to multiprocessing, so I guess there is a technical problem with my implementation. Can anyone point me in the right direction?
Here is the code causing the trouble (specifically the pool.map):
import csv
import itertools
from multiprocessing import Pool

# open files
csvReaderTest = csv.reader(open(testpath, 'r'))
csvReaderProd = csv.reader(open(prodpath, 'r'))
compwriter = csv.writer(open(outpath, 'wb'))
pool = Pool()
num_chunks = 3
chunksTest = itertools.groupby(csvReaderTest, keyfunc)
chunksProd = itertools.groupby(csvReaderProd, keyfunc)
while True:
    # make a list of num_chunks chunks
    groupsTest = [list(chunk) for key, chunk in itertools.islice(chunksTest, num_chunks)]
    groupsProd = [list(chunk) for key, chunk in itertools.islice(chunksProd, num_chunks)]
    # merge the two lists (pair off comparison rows)
    groups_combined = zip(groupsTest, groupsProd)
    if groups_combined:
        # http://stackoverflow.com/questions/5442910/python-multiprocessing-pool-map-for-multiple-arguments
        a_args = groups_combined  # a list - set of combinations to be tested
        second_arg = True
        worker_result = pool.map(worker_mini_star, itertools.izip(itertools.repeat(second_arg), a_args))
    else:
        break  # no more chunks to compare
Here is the full error output. (The error occurs only some of the time; other runs finish the comparison without problems):
Traceback (most recent call last):
File "H:/<PATH_SNIP>/python_csv_compare_multiprocessing_rev02_test2.py", line 407, in <module>
main(fileTest, fileProd, fileout, stringFields, checkFileLengths)
File "H:/<PATH_SNIP>/python_csv_compare_multiprocessing_rev02_test2.py", line 306, in main
worker_result = pool.map(worker_mini_star, itertools.izip(itertools.repeat(second_arg),a_args))
File "C:\Python27\lib\multiprocessing\pool.py", line 250, in map
return self.map_async(func, iterable, chunksize).get()
File "C:\Python27\lib\multiprocessing\pool.py", line 554, in get
raise self._value
IOError: [Errno 9] Bad file descriptor
If it helps, here are the functions called by pool.map:
def worker_mini(flag, chunk):
    row_comp = []
    for entry, entry2 in zip(chunk[0][0], chunk[1][0]):
        if entry == entry2:
            temp_comp = entry
        else:
            temp_comp = '%s|%s' % (entry, entry2)
        row_comp.append(temp_comp)
    return True, row_comp

# takes a single tuple argument and unpacks the tuple to multiple arguments
def worker_mini_star(flag_chunk):
    """Convert `f([1,2])` to `f(1,2)` call."""
    return worker_mini(*flag_chunk)
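For reference, here is a stripped-down, self-contained sketch of the same pool.map star-args pattern with a toy worker and toy data (not my real comparison code), in case the structure itself is the issue:

import itertools
from multiprocessing import Pool

def worker_mini(flag, chunk):
    # toy stand-in for the real row comparison
    return flag, sum(chunk)

def worker_mini_star(flag_chunk):
    """Convert f((a, b)) into f(a, b)."""
    return worker_mini(*flag_chunk)

if __name__ == '__main__':
    pool = Pool()
    chunks = [[1, 2], [3, 4], [5, 6]]
    results = pool.map(worker_mini_star,
                       itertools.izip(itertools.repeat(True), chunks))
    print(results)  # [(True, 3), (True, 7), (True, 11)]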
I am trying to simulate some DNA-sequencing reads and, in order to speed up the code, I need to run it in parallel.
Basically, what I am trying to do is the following: I am sampling reads from the human genome, and I think that when two of the processes from the multiprocessing module try to get data from the same file (the human genome) at the same time, the processes get corrupted and are unable to retrieve the desired DNA sequence. I have tried different things, but I am very new to parallel programming and I cannot solve my problem.
When I run the script with one core it works fine.
This is how I am calling the function:
import multiprocessing as mp

if __name__ == '__main__':
    jobs = []
    # init the processes
    for i in range(number_of_cores):
        length = 100
        lock = mp.Manager().Lock()
        p = mp.Process(target=simulations.sim_reads,
                       args=(lock, FastaFile, "/home/inigo/msc_thesis/genome_data/hg38.fa",
                             length, paired, results_dir, spawn_reads[i], temp_file_names[i]))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()
And this is the function I am using to get the reads, where each process writes its data to a different file.
from io import StringIO  # imports used by this excerpt
from Bio import SeqIO

def sim_single_end(lc, fastafile, chr, chr_pos_start, chr_pos_end, read_length, unique_id):
    lc.acquire()
    left_split_read = fastafile.fetch(chr, chr_pos_end - (read_length / 2), chr_pos_end)
    right_split_read = fastafile.fetch(chr, chr_pos_start, chr_pos_start + (read_length / 2))
    reversed_left_split_read = left_split_read[::-1]
    total_read = reversed_left_split_read + right_split_read
    seq_id = "id:%s-%s|left_pos:%s-%s|right:%s-%s " % (unique_id, chr, int(chr_pos_end - (read_length / 2)), int(chr_pos_end), int(chr_pos_start), int(chr_pos_start + (read_length / 2)))
    quality = "I" * read_length
    fastq_string = "@%s\n%s\n+\n%s\n" % (seq_id, total_read, quality)
    lc.release()
    new_record = SeqIO.read(StringIO(fastq_string), "fastq")
    return new_record
Here is the traceback:
Traceback (most recent call last):
File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/home/inigo/Dropbox/PycharmProjects/circ_dna/simulations.py", line 107, in sim_ecc_reads
new_read = sim_single_end(lc,fastafile, chr, chr_pos_start, chr_pos_end, read_length, read_id)
File "/home/inigo/Dropbox/PycharmProjects/circ_dna/simulations.py", line 132, in sim_single_end
new_record = SeqIO.read(StringIO(fastq_string), "fastq")
File "/usr/local/lib/python3.5/dist-packages/Bio/SeqIO/__init__.py", line 664, in read
first = next(iterator)
File "/usr/local/lib/python3.5/dist-packages/Bio/SeqIO/__init__.py", line 600, in parse
for r in i:
File "/usr/local/lib/python3.5/dist-packages/Bio/SeqIO/QualityIO.py", line 1031, in FastqPhredIterator
for title_line, seq_string, quality_string in FastqGeneralIterator(handle):
File "/usr/local/lib/python3.5/dist-packages/Bio/SeqIO/QualityIO.py", line 951, in FastqGeneralIterator
% (title_line, seq_len, len(quality_string)))
ValueError: Lengths of sequence and quality values differs for id:6-chr1_KI270707v1_random|left_pos:50511537-50511587|right:50511214-50511264 (0 and 100).
I am the OP, answering my own question almost a year later. The problem was that the package I was using to read the human genome file (pysam) was failing. The issue was a typo when calling multiprocessing.
From the author's response, this should work:
p = mp.Process(target=get_fasta, args=(genome_fa,))
Note the ',' to ensure you pass a tuple.
See https://github.com/pysam-developers/pysam/issues/409 for more details
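To make the difference concrete, here is a small self-contained sketch (with a hypothetical get_fasta and a toy path) showing why the trailing comma matters:

import multiprocessing as mp

def get_fasta(path):
    print("got: %r" % path)

if __name__ == '__main__':
    genome_fa = "hg38.fa"
    # args=(genome_fa) is NOT a tuple: the parentheses are just grouping,
    # so Process would iterate over the string and pass one character per argument.
    # args=(genome_fa,) is a one-element tuple and passes the whole path.
    p = mp.Process(target=get_fasta, args=(genome_fa,))
    p.start()
    p.join()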
I'm a bit stumped on this one, and I'm sure it has to do with my lack of experience with Python OOP and how classes work, or with how the new string formatting functions. Posting here so that others can reference it if they have a similar issue.
The problem: I use the following class as a generator in a loop to read in a file and parse it line by line. Each line of interest increments the self.reads_total variable by one, and if that line meets certain criteria, it also increments the self.reads_mapping variable by one. At the end (just before the StopIteration() call in the next() method), I output those two values to stdout. When I run this from the command line on a file with 52266374 lines, I get the following back:
Reads processed: 5220000029016 with both mates mapped out of 52263262 total reads
This output is generated just before the termination of the iteration by the following line of code:
print("{0} with both mates mapped out of {1} total reads\n".format(self.reads_mapping, self.reads_total))
self.reads_total is outputting the correct number of iterations, but self.reads_mapping is not.
Class object that is called by a simple for x in SamParser(infile) loop:
import os
import sys

class SamParser:
    """This object takes as input a SAM file path and constructs an iterable that outputs
    sequence information.  Only one line will be held in memory at a time using this method.
    """

    def __init__(self, filepath):
        """
        constructor
        @param filepath: filepath to the input raw SAM file.
        """
        if os.path.exists(filepath):  # if file is a file, read from the file
            self.sam_file = str(filepath)
            self.stdin = False
        elif not sys.stdin.isatty():  # else read from standard in
            self.stdin = True
        else:
            raise ValueError("Parameter filepath must be a SAM file")
        self.current_line = None
        self.reads_mapping = 0
        self.reads_total = 0
        # Allowable bitflags for SAM file -> reads with both mates mapping, regardless of other flags
        self.true_flags = (99, 147, 83, 163, 67, 131, 115, 179, 81, 161, 97, 145, 65, 129, 113, 177)

    def __iter__(self):
        return self

    def _iterate(self):
        # Loop over lines until we produce a value or reach EOF
        while True:
            if self.stdin:
                sam_line = sys.stdin.readline()  # read from stdin
            else:
                sam_line = self.sam_file.readline()  # read from file
            if not sam_line:
                return  # End of file
            if sam_line.startswith("@SQ"):  # these lines contain refseq length information
                temp = sam_line.split()
                return temp[1][3:], temp[2][3:]
            elif sam_line[0] != "@":  # these lines are the actual reads
                self.reads_total += 1
                if self.reads_total % 100000 == 0:  # update the counter on stdout every 100000 reads
                    sys.stdout.write("\rReads processed: {}".format(self.reads_total))
                    sys.stdout.flush()
                temp = sam_line.split()
                if int(temp[1]) in self.true_flags and temp[2] != "*" and int(temp[3]) != 0:
                    self.reads_mapping += 1
                return temp[1], temp[2], temp[3], temp[9]
        self.sam_file.close()  # catch all in case this line is reached
        assert False, "Should not reach this line"

    def next(self):
        if not self.stdin and type(self.sam_file) is str:  # only open file here if sam_file is a str and not file
            self.sam_file = open(self.sam_file, "r")
        value = self._iterate()
        if not value:  # close file on EOF
            if not self.stdin:
                self.sam_file.close()
            print("{0} with both mates mapped out of {1} total reads\n".format(self.reads_mapping, self.reads_total))
            raise StopIteration()
        else:
            return value
The full script can be found here if you need more context: https://github.com/lakinsm/file-parsing/blob/master/amr_skewness.py
Simple string formatting mistake: the script updates the read count in place on stdout ("\r" with no trailing newline), and I didn't print a newline before the final summary, so the summary was appended to the end of the last counter line; that is where the huge number came from. The true output should be:
Reads processed: 52200000
29016 with both mates mapped out of 52263262 total reads
Plots generated: 310
In any case, the script is useful for SAM parsing if anyone stumbles upon this in the future. Don't make the same mistake I did.
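For illustration, here is a minimal sketch of the same pitfall and its fix (toy numbers, not the real script):

import sys
import time

for i in range(1, 6):
    # "\r" returns the cursor to the start of the line, so each count
    # overwrites the previous one, but the line is never terminated.
    sys.stdout.write("\rReads processed: {}".format(i * 100000))
    sys.stdout.flush()
    time.sleep(0.1)

sys.stdout.write("\n")  # the fix: end the counter line before printing the summary
print("{0} with both mates mapped out of {1} total reads".format(29016, 52263262))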
I'm trying to use whisper-merge to merge two .wsp files. They have identical retention strategies; one just has older data than the other.
When I run whisper-merge oldfile.wsp newfile.wsp I get this error:
Traceback (most recent call last):
File "/usr/local/src/whisper-0.9.12/bin/whisper-merge.py", line 32, in <module>
whisper.merge(path_from, path_to)
File "/usr/local/lib/python2.7/dist-packages/whisper.py", line 821, in merge
(timeInfo, values) = fetch(path_from, fromTime, untilTime)
TypeError: 'NoneType' object is not iterable
Any ideas?
Here's the metadata output for the two files:
"from_file" http://sprunge.us/dBHC
"to_file" http://sprunge.us/eIVG
Line 812 in whisper.py is broken for files that contain multiple archives.
https://github.com/graphite-project/whisper/blob/0.9.12/whisper.py#L812
fromTime = int(time.time()) - headerFrom['maxRetention']
To fix, immediately following line 813, assign fromTime based on the archive retention.
https://github.com/graphite-project/whisper/blob/0.9.12/whisper.py#L813
for archive in archives: # this line already exists
fromTime = int(time.time()) - archive['retention'] # add this line
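Applied in context, each per-archive fetch() is then given a fromTime that falls inside that archive's retention window, so fetch() no longer returns None. Roughly (a sketch of the patched loop, not the verbatim whisper source):

for archive in archives:  # this line already exists
    fromTime = int(time.time()) - archive['retention']  # add this line
    (timeInfo, values) = fetch(path_from, fromTime, untilTime)
    # ... remainder of the per-archive merge logic unchanged ...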
Snippet from whisper.py
def fetch(path, fromTime, untilTime=None):
    """fetch(path, fromTime, untilTime=None)

    path is a string
    fromTime is an epoch time
    untilTime is also an epoch time, but defaults to now.

    Returns a tuple of (timeInfo, valueList)
    where timeInfo is itself a tuple of (fromTime, untilTime, step)

    Returns None if no data can be returned
    """
    fh = open(path, 'rb')
    return file_fetch(fh, fromTime, untilTime)
This suggests that whisper.fetch() is returning None, which in turn (along with the final line in the traceback) suggests that there is a problem with your path_from file.
Looking a little deeper, whisper.file_fetch() appears to have two places where it can return None (explicitly, at least):
def file_fetch(fh, fromTime, untilTime):
    header = __readHeader(fh)
    now = int(time.time())
    if untilTime is None:
        untilTime = now
    fromTime = int(fromTime)
    untilTime = int(untilTime)

    # Here we try and be flexible and return as much data as we can.
    # If the range of data is from too far in the past or fully in the future, we
    # return nothing
    if (fromTime > untilTime):
        raise InvalidTimeInterval("Invalid time interval: from time '%s' is after until time '%s'" % (fromTime, untilTime))

    oldestTime = now - header['maxRetention']

    # Range is in the future
    if fromTime > now:
        return None  # <== Here

    # Range is beyond retention
    if untilTime < oldestTime:
        return None  # <== ...and here

    ...
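One quick way to check whether your requested range falls outside retention (a sketch; assumes the whisper module is importable, and adjust the filename to your path_from file):

import time
import whisper

header = whisper.info('oldfile.wsp')  # same data as whisper-info.py
now = int(time.time())
print('maxRetention: %d' % header['maxRetention'])
print('oldest retrievable timestamp: %d' % (now - header['maxRetention']))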
I have a large list of documents to upsert into MongoDB (possibly n > 100000). I don't want to create 100000 deferreds all at once, but I don't want to execute and wait for each query sequentially because I have a connection pool to MongoDB and I want to utilize it fully. So I have a generator function that will yield deferreds to be consumed by a DeferredLazyList.
def generate_update_deferreds(collection, many_docs):
    for doc in many_docs:
        d = collection.update({'_id': doc['_id']}, doc, upsert=True)
        yield d
This is the code linking the generation of the deferred upserts, and the DeferredLazyList.
@defer.inlineCallbacks
def update_docs(collection, many_docs):
    gen_deferreds = generate_update_deferreds(collection, many_docs)
    results = yield DeferredLazyList(gen_deferreds, count=pool_size, consume_errors=True)
The DeferredLazyList is similar to DeferredList, but instead of accepting a list of deferreds to wait for, it accepts an iterator. Deferreds are retrieved from the iterator so that only count of them are active simultaneously. This effectively batches the deferreds, because they are created as they are yielded.
class DeferredLazyList(defer.Deferred):
    """
    The ``DeferredLazyList`` class is used for collecting the results of
    many deferreds. This is similar to ``DeferredList``
    (``twisted.internet.defer.DeferredList``) but works with an iterator
    yielding deferreds. This will only maintain a certain number of
    deferreds simultaneously. Once one of the deferreds finishes, another
    will be obtained from the iterator.
    """

    def __init__(self, deferreds, count=None, consume_errors=None):
        defer.Deferred.__init__(self)
        if count is None:
            count = 1
        self.__consume_errors = bool(consume_errors)
        self.__iter = enumerate(deferreds)
        self.__results = []
        for _i in xrange(count):
            # Start specified number of simultaneous deferreds.
            if not self.called:
                self.__next_save_result(None, None, None)
            else:
                break

    def __next_save_result(self, result, success, index):
        """
        Called when a deferred completes.
        """
        # Make sure we can save result at index.
        if index is not None:
            results_len = len(self.__results)
            if results_len <= index:
                self.__results += [NO_RESULT] * (index - results_len + 1)
            # Save result.
            self.__results[index] = (success, result)
        # Get next deferred.
        try:
            i, d = self.__iter.next()
            d.addCallbacks(self.__next_save_result, self.__next_save_result, callbackArgs=(True, i), errbackArgs=(False, i))
        except StopIteration:
            # Iterator is exhausted, callback self with results.
            self.callback(self.__results)
        # Pass through result.
        return result if success or not self.__consume_errors else None
The problem is that when the deferreds are yielded from generate_update_deferreds(), their .called attribute is already set to True, which causes DeferredLazyList to call itself recursively.
What's happening is:
In DeferredLazyList.__init__(), self.__next_save_result() is called count times (say 5).
Each call to self.__next_save_result() consumes 1 deferred from self.__iter, and itself is added as a callback.
Because the yielded deferred has .called set to True, d.addCallbacks(self.__next_save_result, ...) immediately calls self.__next_save_result() and this loop continues until a RuntimeError is raised because recursion depth has been reached.
I've printed a stacktrace before the recursion limit was reached to confirm that this is the cause of the problem:
File "/home/caleb/it/Development/projects/python/amazon/bin/feeds-daemon/lib/server.py", line 937, in update_many_docs
results = yield DeferredLazyList(gen_deferreds, count=self.mongo_connections, consume_errors=True, return_results=True)
File "/home/caleb/it/Development/projects/python/amazon/bin/feeds-daemon/lib/twisted.py", line 157, in __init__
self.__next_save_result(None, None, None)
File "/home/caleb/it/Development/projects/python/amazon/bin/feeds-daemon/lib/twisted.py", line 222, in __next_save_result
d.addCallbacks(self.__next_save_result, self.__next_save_result, callbackArgs=(True, i), errbackArgs=(False, i))
File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 290, in addCallbacks
self._runCallbacks()
File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 551, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/caleb/it/Development/projects/python/amazon/bin/feeds-daemon/lib/twisted.py", line 222, in __next_save_result
d.addCallbacks(self.__next_save_result, self.__next_save_result, callbackArgs=(True, i), errbackArgs=(False, i))
File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 290, in addCallbacks
self._runCallbacks()
File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 551, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/caleb/it/Development/projects/python/amazon/bin/feeds-daemon/lib/twisted.py", line 222, in __next_save_result
d.addCallbacks(self.__next_save_result, self.__next_save_result, callbackArgs=(True, i), errbackArgs=(False, i))
# Repeated until the RuntimeError
exceptions.RuntimeError: maximum recursion depth exceeded
Any help would be greatly appreciated. By the way, I am running Python 2.7.3 with Twisted 12.1.0, and the MongoDB stuff is really only relevant for understanding the context.
I wanted the result from each deferred, but cooperate() doesn't return those, so I added a callback to each deferred before yielding them to the CooperativeTasks:
from twisted.internet.defer import DeferredList, inlineCallbacks
from twisted.internet.task import cooperate

NO_RESULT = object()

def generate_update_deferreds(collection, many_docs, save_results):
    for i, doc in enumerate(many_docs):
        d = collection.update({'_id': doc['_id']}, doc, upsert=True)
        d.addBoth(save_result, i, save_results)  # Save result
        yield d

def save_result(result, i, save_results):
    save_results[i] = result

@inlineCallbacks
def update_docs(collection, many_docs):
    save_results = [NO_RESULT] * len(many_docs)
    gen_deferreds = generate_update_deferreds(collection, many_docs, save_results)
    workers = [cooperate(gen_deferreds).whenDone() for _i in xrange(count)]
    yield DeferredList(workers)
    # Handle save_results...
There are some tools in Twisted that will help you do this more easily. For example, cooperate:
from twisted.internet.defer import DeferredList
from twisted.internet.task import cooperate

def generate_update_deferreds(collection, many_docs):
    for doc in many_docs:
        d = collection.update({'_id': doc['_id']}, doc, upsert=True)
        yield d

work = generate_update_deferreds(...)
worker_tasks = []
for i in range(count):
    task = cooperate(work)
    worker_tasks.append(task)

all_done_deferred = DeferredList([task.whenDone() for task in worker_tasks])
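To make the sharing behaviour concrete, here is a self-contained toy version (timers stand in for the MongoDB upserts; the numbers are arbitrary). Each call to cooperate() creates a worker that pulls from the same generator, and a worker pauses whenever the deferred it pulled has not fired yet:

from twisted.internet import defer, reactor, task

def generate_work(n):
    for i in xrange(n):
        d = defer.Deferred()
        reactor.callLater(0.01, d.callback, i)  # stand-in for a Mongo upsert
        yield d

work = generate_work(10)
# Three workers pull from the same generator, so at most three
# "queries" are outstanding at any moment.
workers = [task.cooperate(work).whenDone() for _ in xrange(3)]
done = defer.DeferredList(workers)
done.addCallback(lambda _: reactor.stop())
reactor.run()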
I'm playing with contract.py, Terrence Way's reference implementation of design-by-contract for Python. The implementation throws an exception when a contract (precondition/postcondition/invariant) is violated, but it doesn't provide you a quick way of identifying which specific contract has failed if there are multiple ones associated with a method.
For example, if I take the circbuf.py example, and violate the precondition by passing in a negative argument, like so:
circbuf(-5)
Then I get a traceback that looks like this:
Traceback (most recent call last):
File "circbuf.py", line 115, in <module>
circbuf(-5)
File "<string>", line 3, in __assert_circbuf___init___chk
File "build/bdist.macosx-10.5-i386/egg/contract.py", line 1204, in call_constructor_all
File "build/bdist.macosx-10.5-i386/egg/contract.py", line 1293, in _method_call_all
File "build/bdist.macosx-10.5-i386/egg/contract.py", line 1332, in _call_all
File "build/bdist.macosx-10.5-i386/egg/contract.py", line 1371, in _check_preconditions
contract.PreconditionViolationError: ('__main__.circbuf.__init__', 4)
My hunch is that the second argument in the PreconditionViolationError (4) refers to the line number in the circbuf.__init__ docstring that contains the assertion:
def __init__(self, leng):
    """Construct an empty circular buffer.

    pre::
        leng > 0
    post[self]::
        self.is_empty() and len(self.buf) == leng
    """
However, it's a pain to have to open the file and count the docstring line numbers. Does anybody have a quicker solution for identifying which contract has failed?
(Note that in this example, there's a single precondition, so it's obvious, but multiple preconditions are possible).
This is an old question, but I may as well answer it. I added some output; you'll see it at the comment # jlr001. Replace _define_checker in your contract.py with the version below, and when a contract raises an exception it will show the docstring line number and the statement that triggered it. Nothing more than that, but it will at least stop you from needing to guess which condition was violated.
def _define_checker(name, args, contract, path):
    """Define a function that does contract assertion checking.

    args is a string argument declaration (ex: 'a, b, c = 1, *va, **ka')
    contract is an element of the contracts list returned by parse_docstring
    module is the containing module (not parent class)

    Returns the newly-defined function.

    pre::
        isstring(name)
        isstring(args)
        contract[0] in _CONTRACTS
        len(contract[2]) > 0
    post::
        isinstance(__return__, FunctionType)
        __return__.__name__ == name
    """
    output = StringIO()
    output.write('def %s(%s):\n' % (name, args))
    # ttw001... raise new exception classes
    ex = _EXCEPTIONS.get(contract[0], 'ContractViolationError')
    output.write('\tfrom %s import forall, exists, implies, %s\n' % (MODULE, ex))
    loc = '.'.join([x.__name__ for x in path])
    for c in contract[2]:
        output.write('\tif not (')
        output.write(c[0])
        # jlr001: adding condition statement to output message, easier debugging
        output.write('): raise %s("%s", %u, "%s")\n' % (ex, loc, c[1], c[0]))
    # ...ttw001
    # ttw016: return True for superclasses to use in preconditions
    output.write('\treturn True')
    # ...ttw016
    return _define(name, output.getvalue(), path[0])
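With that change, the failure from circbuf(-5) would end with something like this instead (hypothetical output), so the violated condition is visible without opening the file:

contract.PreconditionViolationError: ('__main__.circbuf.__init__', 4, 'leng > 0')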
Without modifying his code, I don't think you can, but since this is python...
If you look for where he raises the exception to the user, I think it is possible to push the info you're looking for into it... I wouldn't expect you to be able to get the traceback itself to be any better, though, because the contract code is actually contained in a docstring and then processed.
The code is pretty complicated, but this might be a block to look at - maybe if you dump out some of the args you can figure out what's going on...
def _check_preconditions(a, func, va, ka):
    # ttw006: correctly weaken pre-conditions...
    # ab002: Avoid generating AttributeError exceptions...
    if hasattr(func, '__assert_pre'):
        try:
            func.__assert_pre(*va, **ka)
        except PreconditionViolationError, args:
            # if the pre-conditions fail, *all* super-preconditions
            # must fail too, otherwise
            for f in a:
                if f is not func and hasattr(f, '__assert_pre'):
                    f.__assert_pre(*va, **ka)
                    raise InvalidPreconditionError(args)
            # rr001: raise original PreconditionViolationError, not
            # inner AttributeError...
            # raise
            raise args
            # ...rr001
        # ...ab002
    # ...ttw006