Python Custom Iterator: Close a file on StopIteration

I have written an iterator class that opens a file in its __init__.
def __init__(self, path):
    self.file = open(path, "r")
How do I close that file automatically when the iteration is finished?
Complete class:
class Parse(object):
    """A generator that iterates through a CC-CEDICT formatted file, returning
    a tuple of parsed results (Traditional, Simplified, Pinyin, English)"""
    def __init__(self, path):
        self.file = open(path, "r")

    def __iter__(self):
        return self

    def __is_comment(self, line):
        return line.startswith("#")

    def next(self):
        # This block ignores comments.
        line = self.file.readline()
        while line and self.__is_comment(line):
            line = self.file.readline()
        if line:
            working = line.rstrip().split(" ")
            trad, simp = working[0], working[1]
            working = " ".join(working[2:]).split("]")
            pinyin = working[0][1:]
            english = working[1][1:]
            return trad, simp, pinyin, english
        else:
            raise StopIteration()

A better way to write the whole thing would be to keep the opening and the iteration in one place:
class Parse(object):
    """A generator that iterates through a CC-CEDICT formatted file, returning
    a tuple of parsed results (Traditional, Simplified, Pinyin, English)"""
    def __init__(self, path):
        self.path = path

    def __is_comment(self, line):
        return line.startswith("#")

    def __iter__(self):
        with open(self.path) as f:
            for line in f:
                if self.__is_comment(line):
                    continue
                working = line.rstrip().split(" ")
                trad, simp = working[0], working[1]
                working = " ".join(working[2:]).split("]")
                pinyin = working[0][1:]
                english = working[1][1:]
                yield trad, simp, pinyin, english
This will wait to open the file until you really need it, and will automatically close it when done. It's also less code.
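Usage is then a plain for loop: the file is opened when iteration starts and closed when the with block is left. A minimal sketch, assuming a hypothetical dictionary file name:

# "cedict_ts.u8" is a placeholder path, not from the original question.
entries = list(Parse("cedict_ts.u8"))  # file opens here, closed once iteration finishes
print(entries[0])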
If you really want to get into the "generators are awesome!" mindset:
def skip_comments(f):
    for line in f:
        if not line.startswith('#'):
            yield line

...

    def __iter__(self):
        with open(self.path) as f:
            for line in skip_comments(f):
                working = ....

You need to explicitly close it as soon as StopIteration is raised. In this case, simply call .close() when you raise StopIteration yourself.
def next(self):
    # This block ignores comments.
    line = self.file.readline()
    while line and self.__is_comment(line):
        line = self.file.readline()
    if line:
        working = line.rstrip().split(" ")
        trad, simp = working[0], working[1]
        working = " ".join(working[2:]).split("]")
        pinyin = working[0][1:]
        english = working[1][1:]
        return trad, simp, pinyin, english
    else:
        self.file.close()
        raise StopIteration()
Since no other code in your .next() method could trigger a StopIteration, this suffices.
If you did use next() on another iterator inside your own .next() you'd have to catch StopIteration with an except StopIteration: handler and reraise the exception.
This only handles the StopIteration case. If you want to cover other situations (such as not exhausting the iterator), you'll need to handle them separately. Making your class a context manager as well could help with that: users of your iterator would then wrap the object in a with statement before iterating over it, and when the with suite is exited the file is closed regardless of how far the iteration got. You may want to mark your iterator as 'done' as well in that case:
    _closed = False

    def next(self):
        if self._closed:
            raise StopIteration
        line = self.file.readline()
        while line and self.__is_comment(line):
            line = self.file.readline()
        if line:
            working = line.rstrip().split(" ")
            trad, simp = working[0], working[1]
            working = " ".join(working[2:]).split("]")
            pinyin = working[0][1:]
            english = working[1][1:]
            return trad, simp, pinyin, english
        else:
            self.file.close()
            self._closed = True
            raise StopIteration()

    def __enter__(self):
        return self

    def __exit__(self, type_, value, tb):
        self.file.close()  # multiple calls to .close() are fine
        self._closed = True
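Usage might then look like this (a sketch; the path is a placeholder):

# "cedict_ts.u8" is a placeholder path.
with Parse("cedict_ts.u8") as entries:
    for trad, simp, pinyin, english in entries:
        if "love" in english:
            break  # stops early; __exit__ still closes the file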


How to "chain" iterators? [duplicate]

I'm trying to chain iterators together: one iterator reads a master file, and a second iterator takes each line of the master file and processes the file it names.
The working code that I have is as follows:
class MasterReader(object):
    def __init__(self, filename):
        self.f = open(filename, "r")

    def __iter__(self):
        return self

    def __next__(self):
        line = self.f.readline().strip()
        if line == "":
            raise StopIteration
        return line


class SubReader(object):
    def __init__(self, mr):
        self.mr = mr

    def __iter__(self):
        self._next()
        return self

    def _next(self):
        self.current = open(self.mr.__next__(), "r")

    def __next__(self):
        while True:
            line = self.current.readline().strip()
            if line == "":
                self._next()
                continue
            return line


mr = MasterReader("master")
sr = SubReader(mr)
for line in sr:
    print(line)
where master is a file listing the names of other files, one per line:
file1
file2
file1 contains
1.a
1.b
1.c
file2 contains
2.a
2.b
2.c
The output is
1.a
1.b
1.c
2.a
2.b
2.c
Again, what I have works, but it feels wrong: I have a while loop in __next__, I'm manually checking for the end of each sub-file, and I'm explicitly advancing the master file.
Is there a better/more pythonic way of doing this?
EDIT:
This is a simplified version of what I'm trying to accomplish. In the real version SubReader is going to be threaded and I only want one MasterReader. Actually, this won't work for my threading project as-is, but I want to make sure I understand how to generalize iterators before diving deeper into a mess.
You could use itertools.chain.from_iterable with the help of a small function that yields the stripped lines from each file.
from itertools import chain

def fgen(fname):
    with open(fname) as f:
        for line in f:
            yield line.strip()

for a in chain.from_iterable(fgen(line) for line in fgen('master.txt')):
    print(a)
Since the file object is itself an iterator, you don't necessarily need to implement __next__ in either case; just yield lines from it in your __iter__. Moreover, reading the file with a for loop implicitly handles EOF:
class MasterReader(object):
    def __init__(self, filename):
        self.f = open(filename)

    def __iter__(self):
        for line in self.f:
            yield line.strip()
        self.f.close()


class SubReader(object):
    def __init__(self, mr):
        self.mr = mr

    def __iter__(self):
        for filename in self.mr:
            with open(filename) as f:
                for line in f:
                    yield line.strip()
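Usage stays exactly the same as in the question:

mr = MasterReader("master")
sr = SubReader(mr)
for line in sr:
    print(line)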

Iterator example from Dive Into Python 3

I'm learning Python as my first language from http://www.diveintopython3.net/. In Chapter 7, http://www.diveintopython3.net/iterators.html, there is an example of how to use an iterator.
import re

def build_match_and_apply_functions(pattern, search, replace):
    def matches_rule(word):
        return re.search(pattern, word)
    def apply_rule(word):
        return re.sub(search, replace, word)
    return [matches_rule, apply_rule]

class LazyRules:
    rules_filename = 'plural6-rules.txt'

    def __init__(self):
        self.pattern_file = open(self.rules_filename, encoding='utf-8')
        self.cache = []

    def __iter__(self):
        self.cache_index = 0
        return self

    def __next__(self):
        self.cache_index += 1
        if len(self.cache) >= self.cache_index:
            return self.cache[self.cache_index - 1]
        if self.pattern_file.closed:
            raise StopIteration
        line = self.pattern_file.readline()
        if not line:
            self.pattern_file.close()
            raise StopIteration
        pattern, search, replace = line.split(None, 3)
        funcs = build_match_and_apply_functions(
            pattern, search, replace)
        self.cache.append(funcs)
        return funcs

rules = LazyRules()

def plural(noun):
    for matches_rule, apply_rule in rules:
        if matches_rule(noun):
            return apply_rule(noun)

if __name__ == '__main__':
    import sys
    if sys.argv[1:]:
        print(plural(sys.argv[1]))
    else:
        print(__doc__)
My question is: how does the 'for matches_rule, apply_rule in rules:' loop in the plural(noun) function know when to exit after the if condition is fulfilled? No StopIteration is raised for that condition. I would expect the for loop to continue until rules.cache is iterated completely.
Thank you for the help!
The return statement ends the function at that point, returning a value to the caller. This can be relied upon in almost any situation (if you have a try..except..else..finally structure, even a return statement won't prevent the finally block from being executed).
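To see the effect in isolation, here is a small sketch (unrelated to the plural code above) showing that return leaves the loop and the function immediately; the generator is simply abandoned rather than exhausted:

def numbers():
    for i in range(1000000):
        yield i

def first_over(limit):
    for n in numbers():
        if n > limit:
            return n  # leaves the for loop and the function right here

print(first_over(5))  # prints 6; the generator is never run to completion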

Scope of nested function declarations

I have the following code:
def function_reader(path):
    line_no = 0
    with open(path, "r") as myfile:
        def readline():
            line_no += 1
            return myfile.readline()
Python keeps returning:
UnboundLocalError: local variable 'line_no' referenced before assignment
when executing line_no +=1.
I understand that the problem is that nested function declarations have weird scoping in Python (though I do not understand why it was designed this way). I'm mostly wondering if there is a simple way to help Python resolve the reference, since I really like the functionality this would provide.
Unfortunately, there is not a way to do this in Python 2.x. Nested functions can only read names in the enclosing function, not reassign them.
One workaround would be to make line_no a list and then alter its single item:
def function_reader(path):
line_no = [0]
with open(path, "r") as myfile:
def readline():
line_no[0] += 1
return myfile.readline()
You would then access the line number via line_no[0]. Below is a demonstration:
>>> def outer():
...     data = [0]
...     def inner():
...         data[0] += 1
...     inner()
...     return data[0]
...
>>> outer()
1
>>>
This solution works because we are not reassigning the name line_no, only mutating the object that it references.
Note that in Python 3.x, this problem would be easily solved using the nonlocal statement:
def function_reader(path):
    line_no = 0
    with open(path, "r") as myfile:
        def readline():
            nonlocal line_no
            line_no += 1
            return myfile.readline()
It's hard to say what you're trying to achieve here by using closures. The problem with this approach is that you'll either end up with a ValueError: I/O operation on closed file if you return readline from the outer function, or get just the first line if you return readline() from it.
If all you wanted to do is call readline() repeatedly or loop over the file and also remember the current line number then better use a class:
class FileReader(object):
    def __init__(self, path):
        self.line_no = 0
        self.file = open(path)

    def __enter__(self):
        return self

    def __iter__(self):
        return self

    def next(self):
        line = next(self.file)
        self.line_no += 1
        return line

    def readline(self):
        return next(self)

    def __exit__(self, *args):
        self.file.close()
Usage:
with FileReader('file.txt') as f:
    print next(f)
    print next(f)
    print f.readline()
    print f.line_no   # prints 3
    for _ in xrange(3):
        print f.readline()
    print f.line_no   # prints 6
    for line in f:
        print line
        break
    print f.line_no   # prints 7
The more Pythonic way to get the next line and keep track of the line number is with the enumerate builtin:
with open(path, "r") as myfile:
    for no, line in enumerate(myfile, start=1):
        # process line
This will work in all current Python versions.
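For example, a minimal sketch that prints each line with its number (the path is a placeholder):

# "notes.txt" is a placeholder path.
with open("notes.txt", "r") as myfile:
    for no, line in enumerate(myfile, start=1):
        print("%d: %s" % (no, line.rstrip("\n")))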

How to break from a Python generator with open file handles

I'm writing a Python generator which looks like "cat". My specific use case is for a "grep like" operation. I want it to be able to break out of the generator if a condition is met:
summary = {}

for fn in cat("filelist.dat"):
    for line in cat(fn):
        if line.startswith("FOO"):
            summary[fn] = line
            break
So when break happens, I need the cat() generator to finish and close the file handle to fn.
I have to read 100k files with 30 GB of total data, and the FOO keyword happens in the header region, so it is important in this case that the cat() function stops reading the file ASAP.
There are other ways I can solve this problem, but I'm still interested to know how to get an early exit from a generator which has open file handles. Perhaps Python cleans them up right away and closes them when the generator is garbage collected?
Thanks,
Ian
Generators have a close method that raises GeneratorExit at the yield statement. If you specifically catch this exception, you can run some tear-down code:
import contextlib

with contextlib.closing(cat(fn)):
    ...
and then in cat:
try:
    ...
except GeneratorExit:
    # close the file
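Putting the two together, a minimal sketch of what a file-opening cat generator could look like (the body here is an assumption; the question doesn't show its implementation):

def cat(path):
    # Illustrative only: opens the file itself and guarantees it is closed,
    # whether the consumer exhausts the generator or breaks out early
    # (close()/GeneratorExit ends up running the finally block).
    f = open(path)
    try:
        for line in f:
            yield line.rstrip("\n")
    finally:
        f.close()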
If you'd like a simpler way to do this (without using the arcane close method on generators), just make cat take a file-like object instead of a string to open, and handle the file IO yourself:
for filename in filenames:
    with open(filename) as theFile:
        for line in cat(theFile):
            ...
However, you basically don't need to worry about any of this, because the garbage collection will handle it all. Still,
explicit is better than implicit
By implementing the context protocol and the iterator protocol in the same object, you can write pretty sweet code like this:
with cat("/etc/passwd") as lines:
    for line in lines:
        if "mail" in line:
            print line.strip()
            break
This is a sample implementation, tested with Python 2.5 on a Linux box. It reads the lines of /etc/passwd until it finds the one for user audio, and then stops:
from __future__ import with_statement

class cat(object):
    def __init__(self, fname):
        self.fname = fname

    def __enter__(self):
        print "[Opening file %s]" % (self.fname,)
        self.file_obj = open(self.fname, "rt")
        return self

    def __exit__(self, *exc_info):
        print "[Closing file %s]" % (self.fname,)
        self.file_obj.close()

    def __iter__(self):
        return self

    def next(self):
        line = self.file_obj.next().strip()
        print "[Read: %s]" % (line,)
        return line

def main():
    with cat("/etc/passwd") as lines:
        for line in lines:
            if "mail" in line:
                print line.strip()
                break

if __name__ == "__main__":
    import sys
    sys.exit(main())
Or even simpler:
with open("/etc/passwd", "rt") as f:
    for line in f:
        if "mail" in line:
            break
File objects implement the iterator protocol (see http://docs.python.org/library/stdtypes.html#file-objects)
Please also consider this example:
def itertest():
    try:
        for i in xrange(1000):
            print i
            yield i
    finally:
        print 'finally'

x = itertest()

for i in x:
    if i > 2:
        break

print 'del x'
del x
print 'exit'
0
1
2
3
del x
finally
exit
This shows that the finally block is run when the iterator is cleaned up. I think __del__ calls self.close(); see also: https://docs.python.org/2.7/reference/expressions.html#generator.close
There seems to be another possibility using try..finally (tested on Python 2.7.6):
def gen():
    i = 0
    try:
        while True:
            print 'yield %i' % i
            yield i
            i += 1
        print 'will never get here'
    finally:
        print 'done'

for i in gen():
    if i > 1:
        print 'break'
        break
    print i
Gives me the following printout:
yield 0
0
yield 1
1
yield 2
break
done

Utility that helps in file locking - expert tips wanted

I've written a subclass of file that a) provides methods to conveniently lock it (using fcntl, so it only supports Unix, which is OK for me at the moment) and b) asserts, when reading or writing, that the file is appropriately locked.
Now I'm not an expert at such stuff (I've just read one paper [de] about it) and would appreciate some feedback: Is it secure, are there race conditions, are there other things that could be done better … Here is the code:
from fcntl import flock, LOCK_EX, LOCK_SH, LOCK_UN, LOCK_NB

class LockedFile(file):
    """
    A wrapper around `file` providing locking. Requires a shared lock to read
    and an exclusive lock to write.

    Main differences:

    * Additional methods: lock_ex, lock_sh, unlock
    * Refuses to read when not locked, refuses to write when not locked
      exclusively.
    * mode cannot be `w` since then the file would be truncated before
      it could be locked.

    You have to lock the file yourself, it won't be done for you implicitly.
    Only you know what lock you need.

    Example usage::

        def get_config():
            f = LockedFile(CONFIG_FILENAME, 'r')
            f.lock_sh()
            config = parse_ini(f.read())
            f.close()

        def set_config(key, value):
            f = LockedFile(CONFIG_FILENAME, 'r+')
            f.lock_ex()
            config = parse_ini(f.read())
            config[key] = value
            f.truncate()
            f.write(make_ini(config))
            f.close()
    """
    def __init__(self, name, mode='r', *args, **kwargs):
        if 'w' in mode:
            raise ValueError('Cannot open file in `w` mode')
        super(LockedFile, self).__init__(name, mode, *args, **kwargs)
        self.locked = None

    def lock_sh(self, **kwargs):
        """
        Acquire a shared lock on the file. If the file is already locked
        exclusively, do nothing.

        :returns: Lock status from before the call (one of 'sh', 'ex', None).
        :param nonblocking: Don't wait for the lock to be available.
        """
        if self.locked == 'ex':
            return  # would implicitly remove the exclusive lock
        return self._lock(LOCK_SH, **kwargs)

    def lock_ex(self, **kwargs):
        """
        Acquire an exclusive lock on the file.

        :returns: Lock status from before the call (one of 'sh', 'ex', None).
        :param nonblocking: Don't wait for the lock to be available.
        """
        return self._lock(LOCK_EX, **kwargs)

    def unlock(self):
        """
        Release all locks on the file.

        Flushes if there was an exclusive lock.

        :returns: Lock status from before the call (one of 'sh', 'ex', None).
        """
        if self.locked == 'ex':
            self.flush()
        return self._lock(LOCK_UN)

    def _lock(self, mode, nonblocking=False):
        flock(self, mode | bool(nonblocking) * LOCK_NB)
        before = self.locked
        self.locked = {LOCK_SH: 'sh', LOCK_EX: 'ex', LOCK_UN: None}[mode]
        return before

    def _assert_read_lock(self):
        assert self.locked, "File is not locked"

    def _assert_write_lock(self):
        assert self.locked == 'ex', "File is not locked exclusively"

    def read(self, *args):
        self._assert_read_lock()
        return super(LockedFile, self).read(*args)

    def readline(self, *args):
        self._assert_read_lock()
        return super(LockedFile, self).readline(*args)

    def readlines(self, *args):
        self._assert_read_lock()
        return super(LockedFile, self).readlines(*args)

    def xreadlines(self, *args):
        self._assert_read_lock()
        return super(LockedFile, self).xreadlines(*args)

    def __iter__(self):
        self._assert_read_lock()
        return super(LockedFile, self).__iter__()

    def next(self):
        self._assert_read_lock()
        return super(LockedFile, self).next()

    def write(self, *args):
        self._assert_write_lock()
        return super(LockedFile, self).write(*args)

    def writelines(self, *args):
        self._assert_write_lock()
        return super(LockedFile, self).writelines(*args)

    def flush(self):
        self._assert_write_lock()
        return super(LockedFile, self).flush()

    def truncate(self, *args):
        self._assert_write_lock()
        return super(LockedFile, self).truncate(*args)

    def close(self):
        self.unlock()
        return super(LockedFile, self).close()
(the example in the docstring is also my current use case for this)
Thanks for reading all the way down here, and possibly even answering :)
I am also not an expert, but there is one thing you should change, and a couple others to consider:
First off, using assert this way is a bad idea: if Python is run with -O or -OO, asserts are stripped out and your two _assert_*_lock() methods would silently do nothing.
Second -- you need some tests. :) I took the liberty of adding a custom error class and writing a couple of tests. The first four pass, the last fails, which brings up the question: what should happen if the file is opened normally (as some other non-LockedFile object) and data is written to it?
Oh, and lastly -- the name LockableFile makes more sense to me, since the file can be in an unlocked state.
Here are the changes I made:
class LockedFileError(OSError):  # might want IOError instead
    pass
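The answer doesn't show the modified check methods themselves; one way they might be rewritten to raise the new exception instead of using assert (a sketch):

    def _assert_read_lock(self):
        if not self.locked:
            raise LockedFileError("File is not locked")

    def _assert_write_lock(self):
        if self.locked != 'ex':
            raise LockedFileError("File is not locked exclusively")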
if __name__ == '__main__':
    import unittest
    import tempfile
    import shutil
    import os

    class TestLockedFile(unittest.TestCase):
        def setUp(self):
            self.dir = tempfile.mkdtemp()
            self.testfile = testfile = os.path.join(self.dir, 'opened.txt')
            temp = open(testfile, 'w')
            temp.write('[global]\nsetting1=99\nsetting2=42\n')
            temp.close()

        def tearDown(self):
            shutil.rmtree(self.dir, ignore_errors=True)

        def test_01(self):
            "writes fail if not locked exclusively"
            testfile = self.testfile
            temp = LockedFile(testfile, 'r+')
            self.assertRaises(LockedFileError, temp.write, 'arbitrary data')
            temp.lock_sh()
            self.assertRaises(LockedFileError, temp.write, 'arbitrary data')

        def test_02(self):
            "reads fail if not locked"
            testfile = self.testfile
            temp = LockedFile(testfile, 'r')
            self.assertRaises(LockedFileError, temp.read)

        def test_03(self):
            "writes succeed if locked exclusively"
            testfile = self.testfile
            temp = LockedFile(testfile, 'r+')
            temp.lock_ex()
            temp.write('arbitrary data\n')

        def test_04(self):
            "reads succeed if locked"
            testfile = self.testfile
            temp = LockedFile(testfile, 'r')
            temp.lock_sh()
            temp.readline()
            temp.lock_ex()
            temp.readline()

        def test_05(self):
            "other writes fail if locked exclusively"
            testfile = self.testfile
            temp = LockedFile(testfile, 'r')
            temp.lock_ex()
            testing = open(testfile, 'r+')
            # not sure if this should be OSError, IOError, or something else...
            self.assertRaises(OSError, testing.write, 'this should fail\n')

    unittest.main()
Many more tests should be written to cover the various combinations of a LockedFile with reads, writes, and other non-LockedFile file objects trying to read/write to the same actual file.
