I have a very very large text file (much larger than can fit in memory). What I would like to do is use something similar to:
for record in myFile:
process_record();
with the added trick that my records are separated by blank lines (with all kinds of stuff in between). For example...
data1
data2,data3,moredata
anotherrecord,otherstuff
yippee
kaiyay
mom
aThird,record:here
How would one iterate through the file in python where each loop iteration accesses a single record from the file?
You can do it with a generator function:
def records(textfile):
record_lines = []
for line in textfile:
if line != '\n':
record_lines.append(line)
else:
yield ''.join(record_lines)
record_lines = []
yield ''.join(record_lines)
for record in records(the_file):
process(record)
You could create an iterator that joins the lines until you find a blank line.
class MyIter:
def __init__(self, infile):
self.infile=infile
def __iter__(self):
return self
def next(self):
lines = []
for line in infile:
line = line.strip()
if len(line) > 0:
lines.append(line)
else:
break
if len(lines)==0:
raise StopIteration
else:
return ",".join(lines)
and try it with
for line in MyIter(infile):
print line
Related
In python it is easy to read and parse a csv file and process line-by-line:
reader = csv.reader(open("my_csv_file.csv"))
for row in reader:
# row is an array or dict
parsed_data = my_data_parser(row)
where my_data_parser is my own piece of logic that takes input data, parses and does logic.
If my parser fails, I would like to log the entire original line of csv file, but it seems that from the csv reader i have no more access to it.
Is it possible to retrieve the original raw line data?
It doesn't seem like the csv.reader() exposes the file object it's iterating, however, you could use the reader's line_num attribute to achieve what you want.
For example:
import csv
file = open("my_csv_file.csv")
lines = file.readlines()
reader = csv.reader(lines)
for row in reader:
# row is an array or dict
try:
parsed_data = my_data_parser(row)
except MyDataParserError:
print(f"ERROR in line number {reader.line_num}")
print("Full line:")
print(lines[reader.line_num])
file.close()
Alternative
If you'd like to avoid always loading the file into memory, you could instead keep your initial way of reading the file and only read the whole file into memory if an error occurred:
import csv
reader = csv.reader(open("my_csv_file.csv"))
for row in reader:
# row is an array or dict
try:
parsed_data = my_data_parser(row)
except MyDataParserError:
# Only read the whole file into memory when an error occurred.
file = open("my_csv_file.csv")
lines = file.readlines()
file.close()
print(f"ERROR in line number {reader.line_num}")
print("Full line:")
print(lines[reader.line_num])
You can access the row line number with
reader.line_num
But there seems to be no direct way to access the actual line (says doc). Here is iterative method that avoids reading the whole file to memory at any step:
import csv
class MyException(Exception):
pass
def super_logic(line): # Some silly logic to get test code running
if len(line) != 2 or line[1] != '1':
raise MyException("Invalid value")
print("Process: %s" % line)
class LastLineReader:
def __init__(self, fn ):
self.fid = open(fn)
def __iter__(self):
return self
def __next__(self):
line = self.fid.readline() # Read single line and cache it local object
if len(line) == 0:
raise StopIteration()
self.current_line = line.strip()
return line
reader_with_lines = LastLineReader( "my_csv_file.csv" )
reader = csv.reader( reader_with_lines )
for line in reader:
try:
super_logic(line)
except MyException as e:
print("Got exception: %s at line '%s'" % ( e, reader_with_lines.current_line ))
(Edited: removed other solutions as they are also visible on other ppl posts)
As alternative to reader.line_num
for index, row in enumerate(reader):
print(i + 1, row)
I'm trying to find a simple way to chain file-like objects. I have a single CSV file which is split into a number of segments on disk. I'd like to be able to pass them to csv.DictReader without having to make a concatenated temporary first.
Something like:
files = map(io.open, filenames)
for row in csv.DictReader(io.chain(files)):
print(row[column_name])
But I haven't been able to find anything like io.chain. If I were parsing it myself, I could do something like:
from itertools import chain
def lines(fp):
for line in fp.readlines():
yield line
a = open('segment-1.dat')
b = open('segment-2.dat')
for line in chain(lines(a), lines(b)):
row = line.strip().split(',')
However DictReader needs something it can call read() on, so this method doesn't work. I can iterate over the files, copying the fieldnames property from the previous reader, but I was hoping for something which let me put all the processing within a single loop body.
An iterable might help
from io import BytesIO
a = BytesIO(b"1st file 1st line \n1st file 2nd line")
b = BytesIO(b"2nd file 1st line \n2nd file 2nd line")
class Reader:
def __init__(self, *files):
self.files = files
self.current_idx = 0
def __iter__(self):
return self
def __next__(self):
f = self.files[self.current_idx]
for line in f:
return line
else:
if self.current_idx < len(self.files) - 1:
self.current_idx += 1
return next (self)
raise StopIteration("feed me more files")
r = Reader(a, b)
for l in r:
print(l)
Result:
b'1st file 1st line \n'
b'1st file 2nd line'
b'2nd file 1st line \n'
b'2nd file 2nd line'
Edit:
:D then there are standard library goodies.
https://docs.python.org/3.7/library/fileinput.html
with fileinput.input(files=('spam.txt', 'eggs.txt')) as f:
for line in f:
process(line)
You could create a class that's an iterator that returns a string each time its __next__() method is called (quoting the docs).
import csv
class ChainedCSVfiles:
def __init__(self, filenames):
self.filenames = filenames
def __iter__(self):
return next(self)
def __next__(self):
for filename in self.filenames:
with open(filename, 'r', newline='') as csvfile:
for line in csvfile:
yield line
filenames = 'segment-1.dat', 'segment-2.dat'
reader = csv.DictReader(ChainedCSVfiles(filenames),
fieldnames=('field1', 'field2', 'field3'))
for row in reader:
print(row)
Can someone help me identify what im doing wrong below:
def SeeIfExactRangeIsFound():
with open(logfile) as input_data:
mylist = []
for line in input_data:
if BeginSearchDVar in line: # Or whatever test is needed
print line.strip()
#mylist.append((line.strip()))
#return mylist
break
for line in input_data: # This keeps reading the file
if line.strip() == EndinSearchD:
break
print line
#mylist.append((line))
#return mylist
#SeeIfExactRangeIsFound()
LRange = SeeIfExactRangeIsFound()
print LRange
I'm looping through a file and only printing out sections of that file. As in, I start printing content of logfile when a specific pattern is found in the line being read. and continue printing all lines after that first line until a line containing the pattern found in EndingSearchD variable is found.
this works with the "print". but as you can see, I want to store the output of the SeeIfExactRangeIsFound function in a variable and use the content of that variable later on.
My problem is, despite my attempts (commented out below) to try different ways to accomplish my goal, none of it seems to work. I feel I'm so close to the answer but I spent 2 hours on this and can't figure it out.
any ideas?
a version matching your description instead of your code
I start printing content of logfile when a specific pattern is found
in the line being read. and continue printing all lines after that
first line until a line containing the pattern found in EndingSearchD
variable is found.
def SeeIfExactRangeIsFound():
with open(logfile) as input_file:
input_data = input_file.readlines()
mylist = []
allow_yielding = False
for line in input_data:
if BeginSearchDVar in line:
allow_yielding = True
if allow_yielding:
yield line
if line.strip() == EndinSearchD:
break
LRange = SeeIfExactRangeIsFound()
print LRange
You almost got it, but your return statement is not in the proper scope:
def SeeIfExactRangeIsFound():
with open(logfile) as input_data:
mylist = []
for line in input_data:
if BeginSearchDVar in line: # Or whatever test is needed
print line.strip()
mylist.append((line.strip()))
break
for line in input_data: # This keeps reading the file
if line.strip() == EndinSearchD:
break
print line
mylist.append((line))
return mylist
as bonus, you can easily transform this into a generator:
def SeeIfExactRangeIsFound():
with open(logfile) as input_data:
for line in input_data:
if BeginSearchDVar in line: # Or whatever test is needed
yield line.strip()
for line in input_data: # This keeps reading the file
if line.strip() == EndinSearchD:
break
yield line
and consume it like:
results = [x for x in def SeeIfExactRangeIsFound()]
I'm having some trouble optimizing this part of code.
It works, but seems unnecessary slow.
The function searches after a searchString in a file starting on line line_nr and returns the line number for first hit.
import linecache
def searchStr(fileName, searchString, line_nr = 1, linesInFile):
# The above string is the input to this function
# line_nr is needed to search after certain lines.
# linesInFile is total number of lines in the file.
while line_nr < linesInFile + 1:
line = linecache.getline(fileName, line_nr)
has_match = line.find(searchString)
if has_match >= 0:
return line_nr
break
line_nr += 1
I've tried something along these lines, but never managed to implement the "start on a certain line number"-input.
Edit: The usecase. I'm post processing analysis files containing text and numbers that are split into different sections with headers. The headers on line_nr are used to break out chunks of the data for further processing.
Example of call:
startOnLine = searchStr(fileName, 'Header 1', 1, 10000000):
endOnLine = searchStr(fileName, 'Header 2', startOnLine, 10000000):
Why don't you start with simplest possible implementation ?
def search_file(filename, target, start_at = 0):
with open(filename) as infile:
for line_no, line in enumerate(infile):
if line_no < start_at:
continue
if line.find(target) >= 0:
return line_no
return None
I guess your file is like:
Header1 data11 data12 data13..
name1 value1 value2 value3...
...
...
Header2 data21 data22 data23..
nameN valueN1 valueN2 valueN3..
...
Does the 'Header' string contains any constant formats(i.e: all start with '#' or sth). If so, you can read the line directly, judge if the line contains this format (i.e: if line[0]=='#') and write different code for different kinds of lines(difination line and data line in your example).
Record class:
class Record:
def __init__(self):
self.data={}
self.header={}
def set_header(self, line):
...
def add_data(self, line):
...
iterate part:
def parse(p_file):
record = None
for line in p_file:
if line[0] == "#":
if record : yield record
else:
record = Record()
record.set_header(line)
else:
record.add_data(line)
yield record
main func:
data_file = open(...)
for rec in parse(data_file):
...
I know how to do it for a TXT file, but now I am having some trouble doing it for a CSV file.
How can I read a CSV file from the bottom in Python?
Pretty much the same way as for a text file: read the whole thing into a list and then go backwards:
import csv
with open('test.csv', 'r') as textfile:
for row in reversed(list(csv.reader(textfile))):
print ', '.join(row)
If you want to get fancy, you could write a lot of code that reads blocks starting at the end of the file and working backwards, emitting a line at a time, and then feed that to csv.reader, but that will only work with a file that can be seeked, i.e. disk files but not standard input.
Some of us have files that do not fit into memory, could anyone come with a solution that does not require storing the entire file in memory?
That's a bit trickier. Luckily, all csv.reader expects is an iterator-like object that returns a string (line) per call to next(). So we grab the technique Darius Bacon presented in "Most efficient way to search the last x lines of a file in python" to read the lines of a file backwards, without having to pull in the whole file:
import os
def reversed_lines(file):
"Generate the lines of file in reverse order."
part = ''
for block in reversed_blocks(file):
for c in reversed(block):
if c == '\n' and part:
yield part[::-1]
part = ''
part += c
if part: yield part[::-1]
def reversed_blocks(file, blocksize=4096):
"Generate blocks of file's contents in reverse order."
file.seek(0, os.SEEK_END)
here = file.tell()
while 0 < here:
delta = min(blocksize, here)
here -= delta
file.seek(here, os.SEEK_SET)
yield file.read(delta)
and feed reversed_lines into the code to reverse the lines before they get to csv.reader, removing the need for reversed and list:
import csv
with open('test.csv', 'r') as textfile:
for row in csv.reader(reversed_lines(textfile)):
print ', '.join(row)
There is a more Pythonic solution possible, which doesn't require a character-by-character reversal of the block in memory (hint: just get a list of indices where there are line ends in the block, reverse it, and use it to slice the block), and uses chain out of itertools to glue the line clusters from successive blocks together, but that's left as an exercise for the reader.
It's worth noting that the reversed_lines() idiom above only works if the columns in the CSV file don't contain newlines.
Aargh! There's always something. Luckily, it's not too bad to fix this:
def reversed_lines(file):
"Generate the lines of file in reverse order."
part = ''
quoting = False
for block in reversed_blocks(file):
for c in reversed(block):
if c == '"':
quoting = not quoting
elif c == '\n' and part and not quoting:
yield part[::-1]
part = ''
part += c
if part: yield part[::-1]
Of course, you'll need to change the quote character if your CSV dialect doesn't use ".
Building on #mike-desimone 's answer. Here's a solution that provides the same structure as a python file object but is read in reverse, line by line:
import os
class ReversedFile(object):
def __init__(self, f, mode='r'):
"""
Wraps a file object with methods that make it be read in reverse line-by-line
if ``f`` is a filename opens a new file object
"""
if mode != 'r':
raise ValueError("ReversedFile only supports read mode (mode='r')")
if not type(f) == file:
# likely a filename
f = open(f)
self.file = f
self.lines = self._reversed_lines()
def _reversed_lines(self):
"Generate the lines of file in reverse order."
part = ''
for block in self._reversed_blocks():
for c in reversed(block):
if c == '\n' and part:
yield part[::-1]
part = ''
part += c
if part: yield part[::-1]
def _reversed_blocks(self, blocksize=4096):
"Generate blocks of file's contents in reverse order."
file = self.file
file.seek(0, os.SEEK_END)
here = file.tell()
while 0 < here:
delta = min(blocksize, here)
here -= delta
file.seek(here, os.SEEK_SET)
yield file.read(delta)
def __getattribute__(self, name):
"""
Allows for the underlying file attributes to come through
"""
try:
# ReversedFile attribute
return super(ReversedFile, self).__getattribute__(name)
except AttributeError:
# self.file attribute
return getattr(self.file, name)
def __iter__(self):
"""
Creates iterator
"""
return self
def seek(self):
raise NotImplementedError('ReversedFile does not support seek')
def next(self):
"""
Next item in the sequence
"""
return self.lines.next()
def read(self):
"""
Returns the entire contents of the file reversed line by line
"""
contents = ''
for line in self:
contents += line
return contents
def readline(self):
"""
Returns the next line from the bottom
"""
return self.next()
def readlines(self):
"""
Returns all remaining lines from the bottom of the file in reverse
"""
return [x for x in self]
Go for it. This is simple program to reverse the rows from a CSV file.
import csv
BC_file = open('Master.csv', 'rb')
BC_reader = csv.reader(BC_file)
next(BC_reader)
for row in reversed(list(BC_reader)):
print row[0]