How to "chain" iterators? [duplicate] - python

I'm trying to chain iterators together: one iterator reads from a master file, and another takes each line of the master file and processes the file that line names.
The working code I have is as follows:
class MasterReader(object):
    def __init__(self, filename):
        self.f = open(filename, "r")
    def __iter__(self):
        return self
    def __next__(self):
        line = self.f.readline().strip()
        if line == "":
            raise StopIteration
        return line
class SubReader(object):
    def __init__(self, mr):
        self.mr = mr
    def __iter__(self):
        self._next()
        return self
    def _next(self):
        self.current = open(self.mr.__next__(), "r")
    def __next__(self):
        while True:
            line = self.current.readline().strip()
            if line == "":
                self._next()
                continue
            return line
mr = MasterReader("master")
sr = SubReader(mr)
for line in sr:
    print(line)
Where master is a file whose lines name other files:
file1
file2
file1 contains
1.a
1.b
1.c
file2 contains
2.a
2.b
2.c
The output is
1.a
1.b
1.c
2.a
2.b
2.c
Again, what I have works, but it feels wrong: there's a while loop in __next__, I'm manually checking for the end of each sub file, and I'm explicitly advancing the master file myself.
Is there a better/more pythonic way of doing this?
EDIT:
This is a simplified version of what I'm trying to accomplish. In the real version, SubReader is going to be threaded and I only want one MasterReader. Actually, this approach won't work for my threading project, but I want to make sure I'm generalizing iterators correctly before diving deeper into a mess.

You could use itertools.chain.from_iterable with the help of a small function that yields the stripped lines from each file:
from itertools import chain

def fgen(fname):
    with open(fname) as f:
        for line in f:
            yield line.strip()

for a in chain.from_iterable(fgen(line) for line in fgen('master.txt')):
    print(a)

Since the file object is itself an iterator, you don't necessarily need to implement __next__ in either class; just yield lines from it in your __iter__. Moreover, reading the file with a for loop implicitly handles EOF:
class MasterReader(object):
    def __init__(self, filename):
        self.f = open(filename)
    def __iter__(self):
        for line in self.f:
            yield line.strip()
        self.f.close()

class SubReader(object):
    def __init__(self, mr):
        self.mr = mr
    def __iter__(self):
        for filename in self.mr:  # self.mr, not the global mr
            with open(filename) as f:
                for line in f:
                    yield line.strip()
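A usage sketch for these rewritten classes, mirroring the question's driver code:
mr = MasterReader("master")
for line in SubReader(mr):
    print(line)  # prints 1.a ... 2.c, as in the question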

The usual answer not helping with TypeError [duplicate]

I'm getting a TypeError: 'list' object is not callable, but I can't see how the usual advice (you wrote list() where you meant list[...]) applies. I don't seem to be using either notation; I'm just calling methods on my object. I'm quite stuck and could use some help here.
import scheduler

def main():
    sched = scheduler.Scheduler()
    sched.line_list("/home/scabandari/Desktop/threads.txt")  # Error is caused by this line
    # sched.create_processList()

if __name__ == "__main__":
    main()
scheduler.py:
import process

class Scheduler:
    def __init__(self):
        self.line_list = []
        self.process_list = []

    # populates line_list[] from a "/location/.txt" file; each line reps a process object
    def line_list(self, file):
        f = open(file)
        getlines = f.readlines()
        for line in getlines:
            self.line_list.append(line)
        self.line_list.pop(0)

    # populates process_list[] from line_list[]
    def process_list(self):
        for line in self.line_list:
            temp_arr = line.split()
            self.process_list.append(process.Process(temp_arr[0], temp_arr[1],
                                                     temp_arr[2], temp_arr[3]))
        for proc in self.process_list:
            proc.print_process()
Note that when you define your class Scheduler as follows
class Scheduler:
    def __init__(self):
        self.line_list = []
        self.process_list = []

    def line_list(self, file):
        f = open(file)
        getlines = f.readlines()
        for line in getlines:
            self.line_list.append(line)
        self.line_list.pop(0)
    ...
and then instantiate it, i.e.
inst = Scheduler()
the definition of your method line_list is overridden when __init__ runs at instantiation: the assignment self.line_list = [] creates an instance attribute that shadows the method and rebinds the name to a list. (Note that the same collision exists between your process_list method and attribute.)
This means that, as melpomene mentions in the comments,
you need to decide whether you want line_list to be a method or a list. It can't be both.
Thus, what you may want to do is rename your method, giving it a name representative of what it actually does, e.g.
...
def populate_line_list(self, file):
    '''
    Method which populates line_list from a "/location/.txt" file;
    each line reps a process object.
    '''
    f = open(file)
    getlines = f.readlines()
    for line in getlines:
        self.line_list.append(line)
    self.line_list.pop(0)
    f.close()  # do not forget to close your file, or use a with statement
...
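As the closing comment suggests, a with statement is the safer option, since it closes the file even if an exception occurs; a minimal sketch of the same method rewritten that way:
def populate_line_list(self, file):
    '''Populate line_list from a text file; each line reps a process object.'''
    with open(file) as f:
        for line in f:
            self.line_list.append(line)
    self.line_list.pop(0)  # discard the first line, as the original does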
Finally you will be able to do
import scheduler

def main():
    sched = scheduler.Scheduler()
    sched.populate_line_list("/home/scabandari/Desktop/threads.txt")

if __name__ == "__main__":
    main()

How to use the same line of code in all functions?

I am a newbie in Python.
I wonder if it is possible for all functions to "inherit" the same line of code.
The line with open(filename, 'r') as f: is the same in all three functions. Is it possible to share this code without using classes?
I tried to find the answer on stackoverflow and python documentation, but with no luck.
def word_count(filename):
    with open(filename, 'r') as f:
        return len(f.read().split())

def line_count(filename):
    with open(filename, 'r') as f:
        return len(f.read().splitlines())

def character_count(filename):
    with open(filename, 'r') as f:
        return len(f.read())
The common code in your case is
with open(filename, 'r') as f:
    contents = f.read()
So just move it to its own function:
def get_file_contents(filename):
    with open(filename, 'r') as f:
        return f.read()

def word_count(filename):
    return len(get_file_contents(filename).split())

def line_count(filename):
    return len(get_file_contents(filename).splitlines())

def character_count(filename):
    return len(get_file_contents(filename))
What I've done in the past is split the common code out into another function. In your example,
with open(filename, 'r') as f:
    f.read()
is common to all of your functions, so I'd rewrite it like so:
def read_file(filename):
    with open(filename, 'r') as f:
        return f.read()

def word_count(filename):
    return len(read_file(filename).split())

def line_count(filename):
    return len(read_file(filename).splitlines())

def character_count(filename):
    return len(read_file(filename))
I would use a class:
class Count:
    """ Object holds everything count-related """
    def __init__(self, filename):
        """ specify filename in class instance """
        with open(filename, 'r') as f:
            self.content = f.read()
    def word_count(self):
        return len(self.content.split())
    def line_count(self):
        return len(self.content.splitlines())
    def character_count(self):
        return len(self.content)

file = Count("whatever.txt")
print(file.word_count())
print(file.line_count())
print(file.character_count())
What differs is only what you do after the file is opened, so if I were in your shoes, I would write a function that takes another function to be executed on the opened file.
Let's illustrate this with an example:
>>> def operate_file(filename, func):
...     with open(filename, 'r') as f:
...         return func(f)
>>> def line_count(f):
...     return len(f.read().splitlines())
>>> def word_count(f):
...     return len(f.read().split())
>>> def character_count(f):
...     return len(f.read())
>>> print operate_file('/tmp/file.txt', line_count)
1200
>>> print operate_file('/tmp/file.txt', word_count)
2800
>>> print operate_file('/tmp/file.txt', character_count)
29750
I would recommend decorators. It's sort of like making the repeated line of code into a function, but since you are going to call that function on each input anyway, a decorator lets you write each function as if f were the input.
The @open_file decorator is shorthand for word_count = open_file(word_count).
Here is a good place to read more about Python decorators.
def open_file(func):
    def wrapped_func(filename):
        with open(filename, 'r') as f:
            return func(f)
    return wrapped_func

@open_file
def word_count(f):
    return len(f.read().split())

@open_file
def line_count(f):
    return len(f.read().splitlines())

@open_file
def character_count(f):
    return len(f.read())
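After decoration, each function is once again called with a filename, e.g. (hypothetical file name):
print(word_count('sample.txt'))   # number of words in sample.txt
print(line_count('sample.txt'))   # number of lines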
It depends on what you want to do with the results of your three functions. Every function opens the same file, so it is opened three times just to get three different properties.
One good solution would be a class, but another would be to rearrange your three functions into a single one that returns a dictionary or named tuple with the results.
It would look something like this:
def file_count(filename):
    with open(filename, 'r') as f:
        content = f.read()
    properties = {}
    properties['words'] = len(content.split())
    properties['lines'] = len(content.splitlines())
    properties['chars'] = len(content)
    return properties
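A quick usage sketch (hypothetical filename):
props = file_count('example.txt')
print(props)  # {'words': ..., 'lines': ..., 'chars': ...}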

Scope of nested function declarations

I have the following code:
def function_reader(path):
    line_no = 0
    with open(path, "r") as myfile:
        def readline():
            line_no += 1
            return myfile.readline()
Python keeps returning:
UnboundLocalError: local variable 'line_no' referenced before assignment
when executing line_no += 1.
I understand that the problem is that nested function definitions have their own scoping rules in Python (though I do not understand why it was designed this way). I'm mostly wondering if there is a simple way to help Python resolve the reference, since I really like the functionality this would provide.
Unfortunately, there is no way to do this directly in Python 2.x: nested functions can only read names from the enclosing function, not reassign them.
One workaround would be to make line_no a list and then alter its single item:
def function_reader(path):
    line_no = [0]
    with open(path, "r") as myfile:
        def readline():
            line_no[0] += 1
            return myfile.readline()
You would then access the line number via line_no[0]. Below is a demonstration:
>>> def outer():
...     data = [0]
...     def inner():
...         data[0] += 1
...     inner()
...     return data[0]
...
>>> outer()
1
>>>
This solution works because we are not reassigning the name line_no, only mutating the object that it references.
Note that in Python 3.x, this problem would be easily solved using the nonlocal statement:
def function_reader(path):
    line_no = 0
    with open(path, "r") as myfile:
        def readline():
            nonlocal line_no
            line_no += 1
            return myfile.readline()
It's hard to say what you're trying to achieve here by using closures, but there is a problem with this approach: if you return readline from the outer function, you'll get a ValueError: I/O operation on closed file (the with block closes the file as soon as the outer function returns), and if you return readline() you'll only ever get the first line.
If all you want is to call readline() repeatedly, or to loop over the file while also remembering the current line number, it is better to use a class:
class FileReader(object):
    def __init__(self, path):
        self.line_no = 0
        self.file = open(path)
    def __enter__(self):
        return self
    def __iter__(self):
        return self
    def next(self):
        line = next(self.file)
        self.line_no += 1
        return line
    def readline(self):
        return next(self)
    def __exit__(self, *args):
        self.file.close()
Usage:
with FileReader('file.txt') as f:
    print next(f)
    print next(f)
    print f.readline()
    print f.line_no  # prints 3
    for _ in xrange(3):
        print f.readline()
    print f.line_no  # prints 6
    for line in f:
        print line
        break
    print f.line_no  # prints 7
The more Pythonic way to get each line and keep track of the line number is the enumerate builtin:
with open(path, "r") as myfile:
    for no, line in enumerate(myfile, start=1):
        # process line
This will work in all current Python versions.
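If the line number is also needed after the loop, the loop variable survives it; a small sketch with a hypothetical filename (%-formatting keeps the print valid in both Python 2 and 3):
line_no = 0  # in case the file is empty
with open("data.txt") as myfile:
    for line_no, line in enumerate(myfile, start=1):
        print("%d: %s" % (line_no, line.strip()))
print("total lines: %d" % line_no)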

Python Custom Iterator: Close a file on StopIteration

I have written an iterator class that opens a file in its __init__:
def __init__(self, path):
    self.file = open(path, "r")
How do I close that file automatically when the iteration is finished?
Complete class:
class Parse(object):
    """A generator that iterates through a CC-CEDICT formatted file, returning
    a tuple of parsed results (Traditional, Simplified, Pinyin, English)"""
    def __init__(self, path):
        self.file = open(path, "r")
    def __iter__(self):
        return self
    def __is_comment(self, line):
        return line.startswith("#")
    def next(self):
        # This block ignores comments.
        line = self.file.readline()
        while line and self.__is_comment(line):
            line = self.file.readline()
        if line:
            working = line.rstrip().split(" ")
            trad, simp = working[0], working[1]
            working = " ".join(working[2:]).split("]")
            pinyin = working[0][1:]
            english = working[1][1:]
            return trad, simp, pinyin, english
        else:
            raise StopIteration()
A better way to write the whole thing would be to keep the opening and the iteration in one place:
class Parse(object):
    """A generator that iterates through a CC-CEDICT formatted file, returning
    a tuple of parsed results (Traditional, Simplified, Pinyin, English)"""
    def __init__(self, path):
        self.path = path
    def __is_comment(self, line):
        return line.startswith("#")
    def __iter__(self):
        with open(self.path) as f:
            for line in f:
                if self.__is_comment(line):
                    continue
                working = line.rstrip().split(" ")
                trad, simp = working[0], working[1]
                working = " ".join(working[2:]).split("]")
                pinyin = working[0][1:]
                english = working[1][1:]
                yield trad, simp, pinyin, english
This will wait to open the file until you really need it, and will automatically close it when done. It's also less code.
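Usage is then just a for loop over an instance; the file is opened on first use and closed when iteration finishes (hypothetical filename):
for entry in Parse("cedict.txt"):
    print(entry)  # (trad, simp, pinyin, english)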
If you really want to get into the "generators are awesome!" mindset:
def skip_comments(f):
    for line in f:
        if not line.startswith('#'):
            yield line
...
def __iter__(self):
    with open(self.path) as f:
        for line in skip_comments(f):
            working = ....
You need to explicitly close it as soon as StopIteration is raised. In this case, simply call .close() when you raise StopIteration yourself.
def next(self):
    # This block ignores comments.
    line = self.file.readline()
    while line and self.__is_comment(line):
        line = self.file.readline()
    if line:
        working = line.rstrip().split(" ")
        trad, simp = working[0], working[1]
        working = " ".join(working[2:]).split("]")
        pinyin = working[0][1:]
        english = working[1][1:]
        return trad, simp, pinyin, english
    else:
        self.file.close()
        raise StopIteration()
Since no other code in your .next() method could trigger a StopIteration, this suffices.
If you did use next() on another iterator inside your own .next(), you'd have to catch StopIteration with an except StopIteration: handler, close the file, and re-raise the exception.
This only handles the StopIteration case. If you want to handle other situations (not exhausting the iterator), you'll need to deal with them separately. Making your class a context manager as well could help with that: users of your iterator would then use the object in a with statement before iterating over it, and when the with suite is exited the file is closed regardless of how far iteration got. You may want to mark your iterator as 'done' in that case as well:
_closed = False

def next(self):
    if self._closed:
        raise StopIteration
    line = self.file.readline()
    while line and self.__is_comment(line):
        line = self.file.readline()
    if line:
        working = line.rstrip().split(" ")
        trad, simp = working[0], working[1]
        working = " ".join(working[2:]).split("]")
        pinyin = working[0][1:]
        english = working[1][1:]
        return trad, simp, pinyin, english
    else:
        self.file.close()
        self._closed = True
        raise StopIteration()

def __enter__(self):
    return self

def __exit__(self, type_, value, tb):
    self.file.close()  # multiple calls to .close() are fine
    self._closed = True
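With these additions merged into the Parse class above, usage as a context manager guarantees the file is closed even on an early exit (hypothetical filename):
with Parse("cedict.txt") as p:
    for entry in p:
        print(entry)
        break  # __exit__ still closes the file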

How to break from a Python generator with open file handles

I'm writing a Python generator which works like "cat". My specific use case is a "grep-like" operation. I want to be able to break out of the generator when a condition is met:
summary = {}
for fn in cat("filelist.dat"):
    for line in cat(fn):
        if line.startswith("FOO"):
            summary[fn] = line
            break
So when break happens, I need the cat() generator to finish and close the file handle to fn.
I have to read 100k files with 30 GB of total data, and the FOO keyword happens in the header region, so it is important in this case that the cat() function stops reading the file ASAP.
There are other ways I can solve this problem, but I'm still interested to know how to get an early exit from a generator which has open file handles. Perhaps Python cleans them up right away and closes them when the generator is garbage collected?
Thanks,
Ian
Generators have a close method that raises GeneratorExit at the yield statement. If you specifically catch this exception, you can run some tear-down code:
import contextlib

with contextlib.closing(cat(fn)):
    ...
and then in cat:
try:
    ...
except GeneratorExit:
    # close the file
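A minimal sketch of cat written this way (the newline handling is an assumption; note that after catching GeneratorExit the generator must not yield again):
def cat(path):
    f = open(path)
    try:
        for line in f:
            yield line.rstrip("\n")
    except GeneratorExit:
        # raised at the paused yield when close() is called early
        f.close()
        raise
    f.close()  # reached on normal exhaustion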
If you'd like a simpler way to do this (without using the arcane close method on generators), just make cat take a file-like object instead of a string to open, and handle the file IO yourself:
for filename in filenames:
    with open(filename) as theFile:
        for line in cat(theFile):
            ...
However, you basically don't need to worry about any of this, because the garbage collection will handle it all. Still,
explicit is better than implicit
By implementing the context manager protocol and the iterator protocol in the same object, you can write pretty sweet code like this:
with cat("/etc/passwd") as lines:
    for line in lines:
        if "mail" in line:
            print line.strip()
            break
This is a sample implementation, tested with Python 2.5 on a Linux box. It reads the lines of /etc/passwd until it finds one containing "mail", and then stops:
from __future__ import with_statement

class cat(object):
    def __init__(self, fname):
        self.fname = fname
    def __enter__(self):
        print "[Opening file %s]" % (self.fname,)
        self.file_obj = open(self.fname, "rt")
        return self
    def __exit__(self, *exc_info):
        print "[Closing file %s]" % (self.fname,)
        self.file_obj.close()
    def __iter__(self):
        return self
    def next(self):
        line = self.file_obj.next().strip()
        print "[Read: %s]" % (line,)
        return line

def main():
    with cat("/etc/passwd") as lines:
        for line in lines:
            if "mail" in line:
                print line.strip()
                break

if __name__ == "__main__":
    import sys
    sys.exit(main())
Or even simpler:
with open("/etc/passwd", "rt") as f:
for line in f:
if "mail" in line:
break
File objects implement the iterator protocol (see http://docs.python.org/library/stdtypes.html#file-objects)
Please also consider this example:
def itertest():
    try:
        for i in xrange(1000):
            print i
            yield i
    finally:
        print 'finally'

x = itertest()
for i in x:
    if i > 2:
        break

print 'del x'
del x
print 'exit'
0
1
2
3
del x
finally
exit
It shows that finally runs only once the iterator is cleaned up; I think __del__ calls self.close(). See also: https://docs.python.org/2.7/reference/expressions.html#generator.close
There seems to be another possibility using try..finally (tested on Python 2.7.6):
def gen():
    i = 0
    try:
        while True:
            print 'yield %i' % i
            yield i
            i += 1
        print 'will never get here'
    finally:
        print 'done'

for i in gen():
    if i > 1:
        print 'break'
        break
    print i
Gives me the following printout:
yield 0
0
yield 1
1
yield 2
break
done
