I'm trying to find a simple way to chain file-like objects. I have a single CSV file which is split into a number of segments on disk. I'd like to be able to pass them to csv.DictReader without having to make a concatenated temporary first.
Something like:
# NOTE: io.chain is the API this question wishes existed; it is not part of
# the standard library.
files = map(io.open, filenames)
reader = csv.DictReader(io.chain(files))
for row in reader:
    print(row[column_name])
But I haven't been able to find anything like io.chain. If I were parsing it myself, I could do something like:
from itertools import chain
def lines(fp):
    """Yield the lines of the open file object ``fp`` one at a time.

    The file is iterated lazily instead of through ``readlines()``, so the
    whole file is never held in memory at once.
    """
    yield from fp
# Open both segments in a single ``with`` so they are closed even if the
# loop body raises.
with open('segment-1.dat') as a, open('segment-2.dat') as b:
    for line in chain(lines(a), lines(b)):
        row = line.strip().split(',')
However DictReader needs something it can call read() on, so this method doesn't work. I can iterate over the files, copying the fieldnames property from the previous reader, but I was hoping for something which let me put all the processing within a single loop body.
An iterable might help
from io import BytesIO
a = BytesIO(b"1st file 1st line \n1st file 2nd line")
b = BytesIO(b"2nd file 1st line \n2nd file 2nd line")


class Reader:
    """Iterator that yields the lines of several file objects back to back.

    ``files`` are already-open, iterable file objects. They are consumed in
    the order given; empty files are skipped.
    """

    def __init__(self, *files):
        self.files = files
        self.current_idx = 0  # index of the file currently being read

    def __iter__(self):
        return self

    def __next__(self):
        # A loop (not recursion) moves on to the next file, so a long run of
        # empty files cannot exhaust the stack and ``Reader()`` with no files
        # ends cleanly instead of raising IndexError.
        while self.current_idx < len(self.files):
            for line in self.files[self.current_idx]:
                return line
            self.current_idx += 1
        raise StopIteration("feed me more files")


r = Reader(a, b)
for l in r:
    print(l)
Result:
b'1st file 1st line \n'
b'1st file 2nd line'
b'2nd file 1st line \n'
b'2nd file 2nd line'
Edit:
:D then there are standard library goodies.
https://docs.python.org/3.7/library/fileinput.html
# Leaving the ``with`` block closes the fileinput stream.
with fileinput.input(files=('spam.txt', 'eggs.txt')) as stream:
    for line in stream:
        process(line)
You could create a class that's an iterator that returns a string each time its __next__() method is called (quoting the docs).
import csv
class ChainedCSVfiles:
    """Iterator over the lines of several CSV files, as if they were one.

    ``filenames`` is an iterable of paths. Each file is opened only once the
    previous one is exhausted, and closed as soon as its last line is read.
    ``next()`` on an instance returns one line (a ``str``), which is what
    ``csv.reader`` / ``csv.DictReader`` expect from their input.
    """

    def __init__(self, filenames):
        self.filenames = filenames
        self._lines = None  # line generator, created on the first next()

    def __iter__(self):
        return self

    def __next__(self):
        # Previously this method was itself a generator function, so next()
        # returned a generator object rather than a line.
        if self._lines is None:
            self._lines = self._read_lines()
        return next(self._lines)

    def _read_lines(self):
        """Yield every line of every file, in order."""
        for filename in self.filenames:
            with open(filename, 'r', newline='') as csvfile:
                yield from csvfile
filenames = 'segment-1.dat', 'segment-2.dat'
# The chained object supplies lines; DictReader turns them into dict rows.
chained_lines = ChainedCSVfiles(filenames)
reader = csv.DictReader(chained_lines, fieldnames=('field1', 'field2', 'field3'))
for row in reader:
    print(row)
Related
In python it is easy to read and parse a csv file and process line-by-line:
# ``with`` closes the file; newline='' is how the csv docs say to open it.
with open("my_csv_file.csv", newline='') as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        # row is a list holding the column values of one record
        parsed_data = my_data_parser(row)
where my_data_parser is my own piece of logic that takes input data, parses and does logic.
If my parser fails, I would like to log the entire original line of the CSV file, but it seems that I no longer have access to it from the csv reader.
Is it possible to retrieve the original raw line data?
It doesn't seem like the csv.reader() exposes the file object it's iterating, however, you could use the reader's line_num attribute to achieve what you want.
For example:
import csv
# Keep every physical line so the raw text of a failing row can be printed.
with open("my_csv_file.csv", newline='') as file:
    lines = file.readlines()

reader = csv.reader(lines)
for row in reader:
    # row is a list of column values
    try:
        parsed_data = my_data_parser(row)
    except MyDataParserError:
        print(f"ERROR in line number {reader.line_num}")
        print("Full line:")
        # line_num counts the lines read so far (1-based) while the list is
        # 0-based, so the row just parsed ends at index line_num - 1.
        print(lines[reader.line_num - 1])
Alternative
If you'd like to avoid always loading the file into memory, you could instead keep your initial way of reading the file and only read the whole file into memory if an error occurred:
import csv
with open("my_csv_file.csv", newline='') as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        # row is a list of column values
        try:
            parsed_data = my_data_parser(row)
        except MyDataParserError:
            # Only read the whole file into memory when an error occurred.
            with open("my_csv_file.csv", newline='') as file:
                lines = file.readlines()
            print(f"ERROR in line number {reader.line_num}")
            print("Full line:")
            # line_num is 1-based, list indices are 0-based.
            print(lines[reader.line_num - 1])
You can access the row line number with
reader.line_num
But there seems to be no direct way to access the actual line (according to the docs). Here is an iterative method that avoids reading the whole file into memory at any step:
import csv
class MyException(Exception):
    """Raised by ``super_logic`` when a row fails validation."""


def super_logic(line):
    """Validate and "process" one parsed CSV row (demo logic only).

    A row is accepted when it has exactly two fields and the second one is
    the string ``'1'``; accepted rows are printed.

    Raises:
        MyException: if the row does not meet that condition.
    """
    if len(line) != 2 or line[1] != '1':
        raise MyException("Invalid value")
    # f-string instead of ``%``: ``%`` would try to unpack a tuple row into
    # several format arguments and fail.
    print(f"Process: {line}")
class LastLineReader:
    """Line iterator over a file that remembers the last line it returned.

    Hand an instance to ``csv.reader``; after each row, ``current_line``
    holds the stripped text of the most recent physical line read (``None``
    before anything has been read). The file is closed once it is exhausted.
    """

    def __init__(self, fn):
        self.fid = open(fn)
        # Defined up front so it can be inspected before the first read.
        self.current_line = None

    def __iter__(self):
        return self

    def __next__(self):
        if self.fid.closed:
            # Already exhausted: keep signalling the end instead of reading
            # from a closed file.
            raise StopIteration()
        line = self.fid.readline()  # read a single line and cache it below
        if not line:
            self.fid.close()
            raise StopIteration()
        self.current_line = line.strip()
        return line
# The wrapper feeds lines to csv.reader and keeps the latest raw line around
# so it can be reported when the row fails.
reader_with_lines = LastLineReader("my_csv_file.csv")
reader = csv.reader(reader_with_lines)
for line in reader:
    try:
        super_logic(line)
    except MyException as e:
        raw_line = reader_with_lines.current_line
        print("Got exception: %s at line '%s'" % (e, raw_line))
(Edited: removed other solutions as they are also visible on other ppl posts)
As an alternative to reader.line_num:
# The loop variable is ``index`` (the original printed an undefined ``i``);
# start=1 gives the same 1-based numbering as ``index + 1``.
for index, row in enumerate(reader, start=1):
    print(index, row)
I have a big csv file that I need to process and it's done this way (very simplified):
import csv
from csv import excel
def _get_dialect():
    """Return an instance of a comma-delimited dialect derived from ``excel``."""

    class CustomDialect(excel):
        delimiter = ','

    dialect = CustomDialect()
    return dialect
class DictIter:
    """Iterator over the rows of ``test.csv`` as dicts keyed col1/col2.

    The underlying file is closed automatically once the last row has been
    returned.
    """

    def __init__(self):
        # Keep a handle on the file so it can be closed when exhausted;
        # newline='' is how the csv docs say to open reader input.
        self._file = open('test.csv', newline='')
        self.reader = csv.DictReader(self._file,
                                     fieldnames=['col1', 'col2'],
                                     dialect=_get_dialect())

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.reader)
        except StopIteration:
            self._file.close()
            raise
# Print every row produced by the iterator.
items = DictIter()
for item in items:
    print(item)
There are better ways to do it but that's what I have right now.
Now I would like to split processing into chunks and found this simple solution that should work for me:
def gen_chunks(reader, chunksize=500):
    """Yield successive lists of at most ``chunksize`` items from ``reader``.

    Every chunk is a new list, so chunks a caller keeps are not emptied when
    the generator advances. An empty ``reader`` produces no chunks at all.

    Raises:
        ValueError: if ``chunksize`` is smaller than 1.
    """
    if chunksize < 1:
        raise ValueError("chunksize must be at least 1")
    chunk = []
    for line in reader:
        chunk.append(line)
        if len(chunk) == chunksize:
            yield chunk
            chunk = []  # start a fresh list; the yielded one stays intact
    if chunk:
        yield chunk
However I'm lacking some generator knowledge to combine these two pieces of code together. Basically I want something like this:
import csv
from csv import excel
def gen_chunks(reader, chunksize=500):
    """Yield successive lists of at most ``chunksize`` items from ``reader``.

    Every chunk is a new list, so chunks a caller keeps are not emptied when
    the generator advances. An empty ``reader`` produces no chunks at all.

    Raises:
        ValueError: if ``chunksize`` is smaller than 1.
    """
    if chunksize < 1:
        raise ValueError("chunksize must be at least 1")
    chunk = []
    for line in reader:
        chunk.append(line)
        if len(chunk) == chunksize:
            yield chunk
            chunk = []  # start a fresh list; the yielded one stays intact
    if chunk:
        yield chunk
def _get_dialect():
    """Return an instance of a comma-delimited dialect derived from ``excel``."""

    class CustomDialect(excel):
        delimiter = ','

    dialect = CustomDialect()
    return dialect
class DictIter:
    """Iterator over the rows of ``test.csv``, read chunk by chunk.

    Rows are pulled from the file one chunk at a time (see ``gen_chunks``)
    and handed out one row per ``next()`` call.
    """

    def __init__(self):
        self.reader = csv.DictReader(open('test.csv'),
                                     fieldnames=['col1', 'col2'],
                                     dialect=_get_dialect())
        # One generator for the iterator's whole lifetime. Putting ``yield``
        # directly in __next__ made every next() call return a brand-new
        # generator object instead of a row.
        self._items = self._iter_items()

    def __iter__(self):
        return self

    def __next__(self):
        return next(self._items)

    def _iter_items(self):
        """Yield the rows of each chunk in turn."""
        for chunk in gen_chunks(self.reader):
            for item in chunk:
                yield item
# Print every row produced by the iterator.
items = DictIter()
for item in items:
    print(item)
It might be a somewhat clumsy approach, but I want to implement split processing with minimal changes to the current structure. What I'm trying to achieve is to keep the current implementation with the iterator class, but process one chunk at a time and yield the next chunk when I'm done with the previous one.
Your solution looks like an overly complicated way of doing:
import csv
# DictReader is itself iterable, so its rows can be printed directly.
with open('test.csv', newline='') as f:
    reader = csv.DictReader(f, fieldnames=['col1', 'col2'])
    for item in reader:
        print(item)
Your chunking still returns one item at a time. The default delimiter of the excel dialect is comma, and excel is the default dialect.
Note newline='' is the documented way of opening the file passed to a csv reader or writer object.
If you have a real reason for chunking (multiprocessing?) you should state that and show that attempt if it doesn't work.
I know how to do it for a TXT file, but now I am having some trouble doing it for a CSV file.
How can I read a CSV file from the bottom in Python?
Pretty much the same way as for a text file: read the whole thing into a list and then go backwards:
import csv
# The whole file is parsed into a list first, then walked backwards.
with open('test.csv', 'r', newline='') as textfile:
    for row in reversed(list(csv.reader(textfile))):
        print(', '.join(row))
If you want to get fancy, you could write a lot of code that reads blocks starting at the end of the file and working backwards, emitting a line at a time, and then feed that to csv.reader, but that will only work with a file that can be seeked, i.e. disk files but not standard input.
Some of us have files that do not fit into memory; could anyone come up with a solution that does not require storing the entire file in memory?
That's a bit trickier. Luckily, all csv.reader expects is an iterator-like object that returns a string (line) per call to next(). So we grab the technique Darius Bacon presented in "Most efficient way to search the last x lines of a file in python" to read the lines of a file backwards, without having to pull in the whole file:
import os
def reversed_lines(file):
    """Generate the lines of file in reverse order.

    ``file`` must be seekable. Lines keep their trailing newline, exactly as
    they appear in the file.
    """
    # Characters of the line being assembled, collected back to front; a
    # list + join avoids rebuilding a string for every character.
    chars = []
    for block in reversed_blocks(file):
        for c in reversed(block):
            # A newline ends the line *before* it, so flush what has been
            # collected (if anything) and start the next line with it.
            if c == '\n' and chars:
                yield ''.join(reversed(chars))
                chars = []
            chars.append(c)
    if chars:
        yield ''.join(reversed(chars))
def reversed_blocks(file, blocksize=4096):
    """Generate blocks of file's contents in reverse order.

    ``file`` must be seekable. The last block of the file is yielded first;
    the characters inside each block are in normal order.

    NOTE(review): offsets are computed arithmetically, which Python only
    guarantees for binary files; text-mode files with multi-byte encodings
    may misbehave -- confirm against your input.

    Raises:
        ValueError: if ``blocksize`` is smaller than 1 (the loop could never
            reach the start of the file).
    """
    if blocksize < 1:
        raise ValueError("blocksize must be at least 1")
    file.seek(0, os.SEEK_END)
    here = file.tell()
    while 0 < here:
        delta = min(blocksize, here)
        here -= delta
        file.seek(here, os.SEEK_SET)
        yield file.read(delta)
and feed reversed_lines into the code to reverse the lines before they get to csv.reader, removing the need for reversed and list:
import csv
# reversed_lines feeds csv.reader the file's lines last-to-first.
with open('test.csv', 'r') as textfile:
    for row in csv.reader(reversed_lines(textfile)):
        print(', '.join(row))
There is a more Pythonic solution possible, which doesn't require a character-by-character reversal of the block in memory (hint: just get a list of indices where there are line ends in the block, reverse it, and use it to slice the block), and uses chain out of itertools to glue the line clusters from successive blocks together, but that's left as an exercise for the reader.
It's worth noting that the reversed_lines() idiom above only works if the columns in the CSV file don't contain newlines.
Aargh! There's always something. Luckily, it's not too bad to fix this:
def reversed_lines(file, quotechar='"'):
    """Generate the lines of file in reverse order, honouring quoted fields.

    A newline that falls between a pair of ``quotechar`` characters does not
    end a line, so CSV records with embedded newlines stay in one piece.

    Args:
        file: seekable file object to read.
        quotechar: the CSV dialect's quote character (default ``"``).
    """
    chars = []  # characters of the current line, collected back to front
    quoting = False  # True while scanning inside a quoted field
    for block in reversed_blocks(file):
        for c in reversed(block):
            if c == quotechar:
                quoting = not quoting
            elif c == '\n' and chars and not quoting:
                yield ''.join(reversed(chars))
                chars = []
            chars.append(c)
    if chars:
        yield ''.join(reversed(chars))
Of course, you'll need to change the quote character if your CSV dialect doesn't use ".
Building on @mike-desimone's answer, here's a solution that provides the same structure as a Python file object but is read in reverse, line by line:
import os
class ReversedFile(object):
    """Read-only file wrapper that reads its lines from the bottom up.

    Works on Python 3 (the original relied on the Python 2 ``file`` type and
    ``generator.next()``). Attributes not defined here are forwarded to the
    wrapped file object.
    """

    def __init__(self, f, mode='r'):
        """
        Wraps a file object with methods that make it be read in reverse
        line-by-line; if ``f`` is a filename, opens a new file object.

        Raises:
            ValueError: if ``mode`` is anything other than ``'r'``.
        """
        if mode != 'r':
            raise ValueError("ReversedFile only supports read mode (mode='r')")
        if isinstance(f, (str, bytes, os.PathLike)):
            # a filename rather than an open file
            f = open(f)
        self.file = f
        self.lines = self._reversed_lines()

    def _reversed_lines(self):
        "Generate the lines of file in reverse order."
        chars = []  # characters of the current line, collected back to front
        for block in self._reversed_blocks():
            for c in reversed(block):
                if c == '\n' and chars:
                    yield ''.join(reversed(chars))
                    chars = []
                chars.append(c)
        if chars:
            yield ''.join(reversed(chars))

    def _reversed_blocks(self, blocksize=4096):
        "Generate blocks of file's contents in reverse order."
        file = self.file
        file.seek(0, os.SEEK_END)
        here = file.tell()
        while 0 < here:
            delta = min(blocksize, here)
            here -= delta
            file.seek(here, os.SEEK_SET)
            yield file.read(delta)

    def __getattr__(self, name):
        """
        Allows for the underlying file attributes to come through.

        Only called when normal attribute lookup has already failed.
        """
        if name == 'file':
            # ``file`` itself is missing (e.g. __init__ failed): avoid
            # recursing into this method forever.
            raise AttributeError(name)
        return getattr(self.file, name)

    def __iter__(self):
        """
        Creates iterator
        """
        return self

    def seek(self, *args, **kwargs):
        """Always raises: seeking is not supported, whatever the arguments."""
        raise NotImplementedError('ReversedFile does not support seek')

    def __next__(self):
        """
        Next item in the sequence
        """
        return next(self.lines)

    # Python 2 spelling, kept so existing ``obj.next()`` callers still work.
    next = __next__

    def read(self):
        """
        Returns the entire contents of the file reversed line by line
        """
        return ''.join(self)

    def readline(self):
        """
        Returns the next line from the bottom, or '' once the top of the
        file has been reached (like a regular file at EOF).
        """
        return next(self.lines, '')

    def readlines(self):
        """
        Returns all remaining lines from the bottom of the file in reverse
        """
        return list(self)
Go for it. This is a simple program to reverse the rows of a CSV file.
import csv
# Text mode with newline='' is what csv.reader needs on Python 3 ('rb'
# would hand it bytes); ``with`` closes the file afterwards.
with open('Master.csv', newline='') as BC_file:
    BC_reader = csv.reader(BC_file)
    next(BC_reader, None)  # skip the header row; tolerate an empty file
    for row in reversed(list(BC_reader)):
        print(row[0])
I have a very very large text file (much larger than can fit in memory). What I would like to do is use something similar to:
# Desired usage: each iteration handles one blank-line-separated record.
for record in myFile:
    process_record()
with the added trick that my records are separated by blank lines (with all kinds of stuff in between). For example...
data1
data2,data3,moredata
anotherrecord,otherstuff
yippee
kaiyay
mom
aThird,record:here
How would one iterate through the file in python where each loop iteration accesses a single record from the file?
You can do it with a generator function:
def records(textfile):
    """Yield the blank-line-separated records of ``textfile`` as strings.

    ``textfile`` is any iterable of lines. Each record is the concatenation
    of its lines, newlines included. Runs of blank lines, a trailing blank
    line, and an empty input produce no empty records.
    """
    record_lines = []
    for line in textfile:
        if line != '\n':
            record_lines.append(line)
        elif record_lines:
            # A blank line closes the record collected so far.
            yield ''.join(record_lines)
            record_lines = []
    if record_lines:
        yield ''.join(record_lines)
# Handle one record per iteration.
record_stream = records(the_file)
for record in record_stream:
    process(record)
You could create an iterator that joins the lines until you find a blank line.
class MyIter:
    """Iterator that joins blank-line-separated groups of lines.

    ``infile`` is an open file (or any line iterator). Each ``next()`` call
    returns the stripped, non-empty lines of one group joined with commas.
    """

    def __init__(self, infile):
        self.infile = infile

    def __iter__(self):
        return self

    def __next__(self):
        lines = []
        # Read from the instance's own file (the original used a global
        # ``infile`` by mistake).
        for line in self.infile:
            line = line.strip()
            if line:
                lines.append(line)
            elif lines:
                # A blank line ends the group -- but only once it has content,
                # so leading or repeated blank lines do not stop iteration.
                break
        if not lines:
            raise StopIteration
        return ",".join(lines)

    # Python 2 spelling, kept so existing ``obj.next()`` callers still work.
    next = __next__
and try it with
# print() call form so the snippet runs on Python 3.
for line in MyIter(infile):
    print(line)
I have a function I am using to read in files of a particular format. My function looks like this:
import csv
from collections import namedtuple
def read_file(f, name, header=True):
    """Yield the rows of the tab-separated file ``f`` as ``Data`` tuples.

    Args:
        f: path of the file to read.
        name: unused by this function; kept so existing callers still work.
        header: when true, the first row is skipped.

    Yields:
        ``Data(id, name, q, start, end, sym)`` namedtuples, one per row.
    """
    gene_data = namedtuple("Data", 'id, name, q, start, end, sym')
    with open(f, mode="r", newline="") as infile:
        reader = csv.reader(infile, delimiter="\t")
        if header:
            # The default keeps an empty file from raising StopIteration
            # inside this generator (a RuntimeError under PEP 479).
            next(reader, None)
        for row in reader:
            # The original called an undefined name ``data`` here.
            yield gene_data(*row)
I also have another type of file that I would like to read in with this function. However, the other file type needs a few slight parsing steps before I can use the read_file function. For example, trailing periods need to be stripped from column q and the characters atr need to be appended to the id column. Obviously, I could create a new function, or add some optional arguments to the existing function, but is there a simple way to modify this function so that it can be used to read in additional file types? I was thinking of something along the lines of a decorator.
IMHO, the most Pythonic way would be converting the function to a base class, split file operations into methods and overriding these methods in new classes based on your base class.
Having such a monolithic function that takes a filename instead of an open file is by itself not very Pythonic. You are trying to implement a stream processor here (file stream -> line stream -> CSV record stream -> [transformator ->] data stream), so using a generator is actually a good idea. I'd slightly refactor this to be a bit more modular:
import csv
from collections import namedtuple
def csv_rows(infile, header):
    """Return a csv reader over the tab-separated rows of ``infile``.

    Args:
        infile: open text file (or any iterable of lines).
        header: when true, the first row is consumed and discarded.

    Returns:
        The ``csv.reader`` positioned after the header, if one was skipped.
    """
    reader = csv.reader(infile, delimiter="\t")
    if header:
        # The default keeps an empty input from raising StopIteration here,
        # which would surface as a RuntimeError in generator callers.
        next(reader, None)
    return reader
def data_sets(infile, header):
    """Yield each row of ``infile`` (via ``csv_rows``) as a ``Data`` tuple."""
    record_type = namedtuple("Data", 'id, name, q, start, end, sym')
    rows = csv_rows(infile, header)
    for fields in rows:
        yield record_type(*fields)
def read_file_type1(infile, header=True):
    """Return an iterator over the raw ``Data`` records of ``infile``."""
    # This file type needs no post-processing: the caller gets the data
    # objects exactly as ``data_sets`` produces them.
    records = data_sets(infile, header)
    return records
def read_file_type2(infile, header=True):
    """Return an iterator over the transformed records of ``infile``."""
    # This file type must be pre-processed: every record goes through
    # ``transform_data_set`` before the caller sees it. A generator
    # expression keeps that lazy (a filter condition could be added here).
    records = data_sets(infile, header)
    return (transform_data_set(record) for record in records)
# Usage sample (print() call form so it runs on Python 3):
with open("...", "r") as f:
    for obj in read_file_type1(f):
        print(obj)
As you can see, we have to pass the header argument all the way through the function chain. This is a strong hint that an object-oriented approach would be appropriate here. The fact that we obviously face a hierarchical type structure here (basic data file, type1, type2) supports this.
I suggest you create a row iterator like the following:
# Intended usage: MyFile is both a context manager and an iterable of rows.
with MyFile('f') as f:
    for entry in f:
        foo(entry)
You can do this by implementing a class for your own files with the following traits:
with ( http://docs.python.org/reference/compound_stmts.html#the-with-statement )
container ( http://docs.python.org/reference/datamodel.html#emulating-container-types )
In addition, you may create a function open_my_file(filename) that determines the file type and returns the appropriate file object to work with. This might be a slightly enterprise-style approach, but it is worth implementing if you're dealing with multiple file types.
The object-oriented way would be this:
class GeneDataReader:
    """Iterable over the records of a tab-separated gene data file.

    Iterating yields one ``GeneData(id, name, q, start, end, sym)`` tuple per
    row. Subclasses adapt other file flavours by overriding
    ``preprocess_row``.
    """

    _GeneData = namedtuple('GeneData', 'id, name, q, start, end, sym')

    def __init__(self, filename, has_header=True):
        self._ignore_1st_row = has_header
        self._filename = filename

    def __iter__(self):
        # ``self`` was missing from the original signature, which made every
        # attempt to iterate raise TypeError.
        for row in self._tsv_by_row():
            yield self._GeneData(*self.preprocess_row(row))

    def _tsv_by_row(self):
        """Yield the raw rows (lists of strings), skipping the header if set."""
        with open(self._filename, 'r', newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            if self._ignore_1st_row:
                next(reader, None)  # default: tolerate an empty file
            yield from reader

    def preprocess_row(self, row):
        """Return ``row`` unchanged; override in derived classes."""
        return row
class SpecializedGeneDataReader(GeneDataReader):
    """Reader for the second file flavour, which needs its rows cleaned up."""

    def preprocess_row(self, row):
        """Append 'atr' to the id column and strip trailing periods from q.

        The row is modified in place and also returned.
        """
        row[0] = row[0] + 'atr'
        row[2] = row[2].rstrip('.')
        return row
The simplest way would be to modify your currently working code with an extra argument.
def read_file(name, is_special=False, has_header=True):
    """Yield the rows of the tab-separated file ``name`` as ``Data`` tuples.

    Args:
        name: path of the file to read.
        is_special: when true, 'atr' is appended to the id column and
            trailing periods are stripped from the q column of every row.
        has_header: when true, the first row is skipped.

    Yields:
        ``Data(id, name, q, start, end, sym)`` namedtuples, one per row.
    """
    Data = namedtuple("Data", 'id, name, q, start, end, sym')
    with open(name, 'r', newline='') as infile:
        reader = csv.reader(infile, delimiter='\t')
        if has_header:
            # The default keeps an empty file from raising StopIteration
            # inside this generator (a RuntimeError under PEP 479).
            next(reader, None)
        for row in reader:
            if is_special:
                row[0] += 'atr'
                row[2] = row[2].rstrip('.')
            yield Data(*row)
If you are looking for something less nested but still procedure based:
def tsv_by_row(name, has_header=True):
    """Yield the rows of the tab-separated file ``name`` as lists of strings.

    Args:
        name: path of the file to read.
        has_header: when true, the first row is skipped.
    """
    # Open ``name`` -- the original opened an undefined variable ``f``.
    with open(name, 'r', newline='') as infile:
        reader = csv.reader(infile, delimiter='\t')
        if has_header:
            next(reader, None)  # default: tolerate an empty file
        yield from reader
from collections import namedtuple

# Record type shared by both readers below. The original referred to it as
# ``gene_data`` in one function and ``GeneData`` in the other, and defined
# neither.
GeneData = namedtuple('GeneData', 'id, name, q, start, end, sym')


def gene_data_from_vanilla_file(name, has_header=True):
    """Yield the rows of the plain file ``name`` as ``GeneData`` tuples."""
    for row in tsv_by_row(name, has_header):
        yield GeneData(*row)


def gene_data_from_special_file(name, has_header=True):
    """Yield ``GeneData`` tuples from the special file flavour.

    Before conversion, 'atr' is appended to the id column and trailing
    periods are stripped from the q column of every row.
    """
    for row in tsv_by_row(name, has_header):
        row[0] += 'atr'
        row[2] = row[2].rstrip('.')
        yield GeneData(*row)
How about passing a callback function to read_file()
In the spirit of Niklas B.'s answer:
import csv, functools
from collections import namedtuple
def consumer(func):
    """Decorator that primes a generator-based coroutine.

    Calling the decorated function creates the generator and advances it to
    its first ``yield``, so callers can ``send()`` to it straight away.
    """
    @functools.wraps(func)
    def start(*args, **kwargs):
        g = func(*args, **kwargs)
        next(g)  # built-in next() works on Python 2.6+ and 3; g.next() does not
        return g
    return start
def csv_rows(infile, header, dest):
    """Push every tab-separated row of ``infile`` into the coroutine ``dest``.

    Args:
        infile: open text file (or any iterable of lines).
        header: when true, the first row is skipped.
        dest: object with a ``send(row)`` method that receives each row.
    """
    # The keyword is ``delimiter``; the original's misspelling raised
    # TypeError.
    reader = csv.reader(infile, delimiter='\t')
    if header:
        next(reader, None)  # default: tolerate an empty input
    for line in reader:
        dest.send(line)
# The decorator must really be applied (it had degraded to a comment):
# without priming, the first send() to the generator raises TypeError.
@consumer
def data_sets(dest):
    """Coroutine: turn each row it receives into a ``Data`` tuple for ``dest``."""
    gene_data = namedtuple("Data", 'id, name, q, start, end, sym')
    while True:
        row = (yield)
        dest.send(gene_data(*row))
def read_file_1(fn, header=True):
    """Run the csv_rows -> data_sets -> sink pipeline and return the results.

    NOTE(review): ``fn`` is passed to ``csv_rows`` as its ``infile``; the
    name suggests a filename but an open file looks expected -- confirm.
    """
    collected, sink = getsink()
    pipeline = data_sets(sink)
    csv_rows(fn, header, pipeline)
    return collected
def getsink():
    """Create a collecting coroutine.

    Returns:
        A ``(results, sink)`` pair: every value sent to ``sink`` is appended
        to the ``results`` list.
    """
    r = []

    # Applied as a real decorator (it had degraded to a comment) so the
    # coroutine is primed and ready for send().
    @consumer
    def _sink():
        while True:
            x = (yield)
            r.append(x)

    return (r, _sink())
# Applied as a real decorator (it had degraded to a comment) so the
# coroutine is primed and ready for send().
@consumer
def transform_data_sets(dest):
    """Coroutine: forward a transformed copy of each item it receives to ``dest``."""
    while True:
        data = (yield)
        dest.send(data[::-1])  # or whatever
def read_file_2(fn, header=True):
    """Like ``read_file_1``, with ``transform_data_sets`` added before the sink.

    NOTE(review): ``fn`` is passed to ``csv_rows`` as its ``infile``; the
    name suggests a filename but an open file looks expected -- confirm.
    """
    collected, sink = getsink()
    transformer = transform_data_sets(sink)
    pipeline = data_sets(transformer)
    csv_rows(fn, header, pipeline)
    return collected