Read CSV file in chunks using generator - python

I have a big csv file that I need to process and it's done this way (very simplified):
import csv
from csv import excel

def _get_dialect():
    class CustomDialect(excel):
        delimiter = ','
    return CustomDialect()

class DictIter:
    def __init__(self):
        self.reader = csv.DictReader(open('test.csv'),
                                     fieldnames=['col1', 'col2'],
                                     dialect=_get_dialect())

    def __iter__(self):
        return self

    def __next__(self):
        return self.reader.__next__()

items = DictIter()
for item in items:
    print(item)
There are better ways to do it but that's what I have right now.
Now I would like to split processing into chunks and found this simple solution that should work for me:
def gen_chunks(reader, chunksize=500):
    chunk = []
    for i, line in enumerate(reader):
        if (i % chunksize == 0 and i > 0):
            yield chunk
            del chunk[:]
        chunk.append(line)
    yield chunk
However I'm lacking some generator knowledge to combine these two pieces of code together. Basically I want something like this:
import csv
from csv import excel

def gen_chunks(reader, chunksize=500):
    chunk = []
    for i, line in enumerate(reader):
        if (i % chunksize == 0 and i > 0):
            yield chunk
            del chunk[:]
        chunk.append(line)
    yield chunk

def _get_dialect():
    class CustomDialect(excel):
        delimiter = ','
    return CustomDialect()

class DictIter:
    def __init__(self):
        self.reader = csv.DictReader(open('test.csv'),
                                     fieldnames=['col1', 'col2'],
                                     dialect=_get_dialect())

    def __iter__(self):
        return self

    def __next__(self):
        for chunk in gen_chunks(self.reader):
            for item in chunk:
                yield item

items = DictIter()
for item in items:
    print(item)
It might be a somewhat clumsy approach, but I want to implement the chunked processing with minimal changes to the current structure. What I'm trying to achieve is to keep the current iterator-class implementation, but process one chunk at a time and yield the next chunk only when I'm done with the previous one.

Your solution looks like an overly complicated way of doing:
import csv

with open('test.csv', newline='') as f:
    reader = csv.DictReader(f, fieldnames=['col1', 'col2'])
    for item in reader:
        print(item)
Your chunking still returns one item at a time. The default delimiter of the excel dialect is comma, and excel is the default dialect.
Note newline='' is the documented way of opening the file passed to a csv reader or writer object.
If you have a real reason for chunking (multiprocessing?) you should state that and show that attempt if it doesn't work.
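If chunked iteration really is the goal, one minimal sketch (keeping the iterator-class shape, and assuming the same test.csv and fieldnames as above; ChunkIter is an illustrative name) hands back a whole list of rows per __next__ call, pulling up to chunksize rows from the underlying DictReader with itertools.islice:
import csv
from itertools import islice

class ChunkIter:
    def __init__(self, filename, chunksize=500):
        self._file = open(filename, newline='')
        self.reader = csv.DictReader(self._file, fieldnames=['col1', 'col2'])
        self.chunksize = chunksize

    def __iter__(self):
        return self

    def __next__(self):
        # grab the next chunk of rows; stop once the reader is exhausted
        chunk = list(islice(self.reader, self.chunksize))
        if not chunk:
            self._file.close()
            raise StopIteration
        return chunk

for chunk in ChunkIter('test.csv'):
    for item in chunk:
        print(item)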

Related

Python csv get original raw data line

In python it is easy to read and parse a csv file and process line-by-line:
reader = csv.reader(open("my_csv_file.csv"))
for row in reader:
    # row is an array or dict
    parsed_data = my_data_parser(row)
where my_data_parser is my own piece of logic that takes the input data, parses it, and applies my logic.
If my parser fails, I would like to log the entire original line of the CSV file, but it seems that from the csv reader I no longer have access to it.
Is it possible to retrieve the original raw line data?
It doesn't seem like csv.reader() exposes the file object it's iterating; however, you could use the reader's line_num attribute to achieve what you want.
For example:
import csv

file = open("my_csv_file.csv")
lines = file.readlines()
reader = csv.reader(lines)
for row in reader:
    # row is an array or dict
    try:
        parsed_data = my_data_parser(row)
    except MyDataParserError:
        print(f"ERROR in line number {reader.line_num}")
        print("Full line:")
        # line_num is 1-based, so subtract one to index into lines
        print(lines[reader.line_num - 1])
file.close()
Alternative
If you'd like to avoid always loading the file into memory, you could instead keep your initial way of reading the file and only read the whole file into memory if an error occurred:
import csv

reader = csv.reader(open("my_csv_file.csv"))
for row in reader:
    # row is an array or dict
    try:
        parsed_data = my_data_parser(row)
    except MyDataParserError:
        # Only read the whole file into memory when an error occurred.
        file = open("my_csv_file.csv")
        lines = file.readlines()
        file.close()
        print(f"ERROR in line number {reader.line_num}")
        print("Full line:")
        print(lines[reader.line_num - 1])
You can access the row line number with
reader.line_num
But there seems to be no direct way to access the actual line (according to the docs). Here is an iterative method that avoids reading the whole file into memory at any step:
import csv

class MyException(Exception):
    pass

def super_logic(line):  # Some silly logic to get test code running
    if len(line) != 2 or line[1] != '1':
        raise MyException("Invalid value")
    print("Process: %s" % line)

class LastLineReader:
    def __init__(self, fn):
        self.fid = open(fn)

    def __iter__(self):
        return self

    def __next__(self):
        line = self.fid.readline()  # Read a single line and cache it on the object
        if len(line) == 0:
            raise StopIteration()
        self.current_line = line.strip()
        return line

reader_with_lines = LastLineReader("my_csv_file.csv")
reader = csv.reader(reader_with_lines)
for line in reader:
    try:
        super_logic(line)
    except MyException as e:
        print("Got exception: %s at line '%s'" % (e, reader_with_lines.current_line))
(Edited: removed other solutions as they are also visible in other people's posts)
As an alternative to reader.line_num:
for index, row in enumerate(reader):
    print(index + 1, row)
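A lighter-weight variant of the line-caching idea above, sketched as a plain generator instead of a class (lines_with_memory and the one-element last_line holder are illustrative names; my_data_parser and MyDataParserError come from the question):
import csv

def lines_with_memory(f, holder):
    # yield lines unchanged, stashing the latest raw line in holder[0]
    for line in f:
        holder[0] = line.rstrip('\n')
        yield line

last_line = ['']
with open("my_csv_file.csv", newline='') as f:
    reader = csv.reader(lines_with_memory(f, last_line))
    for row in reader:
        try:
            parsed_data = my_data_parser(row)
        except MyDataParserError:
            print("Parser failed on raw line: %r" % last_line[0])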

How to chain file objects in Python?

I'm trying to find a simple way to chain file-like objects. I have a single CSV file which is split into a number of segments on disk. I'd like to be able to pass them to csv.DictReader without having to make a concatenated temporary first.
Something like:
files = map(io.open, filenames)
for row in csv.DictReader(io.chain(files)):
    print(row[column_name])
But I haven't been able to find anything like io.chain. If I were parsing it myself, I could do something like:
from itertools import chain

def lines(fp):
    for line in fp.readlines():
        yield line

a = open('segment-1.dat')
b = open('segment-2.dat')
for line in chain(lines(a), lines(b)):
    row = line.strip().split(',')
However DictReader needs something it can call read() on, so this method doesn't work. I can iterate over the files, copying the fieldnames property from the previous reader, but I was hoping for something which let me put all the processing within a single loop body.
An iterable might help
from io import BytesIO

a = BytesIO(b"1st file 1st line \n1st file 2nd line")
b = BytesIO(b"2nd file 1st line \n2nd file 2nd line")

class Reader:
    def __init__(self, *files):
        self.files = files
        self.current_idx = 0

    def __iter__(self):
        return self

    def __next__(self):
        f = self.files[self.current_idx]
        for line in f:
            return line
        else:
            if self.current_idx < len(self.files) - 1:
                self.current_idx += 1
                return next(self)
            raise StopIteration("feed me more files")

r = Reader(a, b)
for l in r:
    print(l)
Result:
b'1st file 1st line \n'
b'1st file 2nd line'
b'2nd file 1st line \n'
b'2nd file 2nd line'
Edit:
:D then there are standard library goodies.
https://docs.python.org/3.7/library/fileinput.html
import fileinput

with fileinput.input(files=('spam.txt', 'eggs.txt')) as f:
    for line in f:
        process(line)
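Since csv.DictReader only needs an iterable of lines, the fileinput object can be handed to it directly; a short sketch, assuming the segment filenames from the question and made-up field names:
import csv
import fileinput

with fileinput.input(files=('segment-1.dat', 'segment-2.dat')) as f:
    reader = csv.DictReader(f, fieldnames=('field1', 'field2', 'field3'))
    for row in reader:
        print(row['field1'])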
You could create a class that's an iterator that returns a string each time its __next__() method is called (quoting the docs).
import csv

class ChainedCSVfiles:
    def __init__(self, filenames):
        self.filenames = filenames

    def __iter__(self):
        return next(self)

    def __next__(self):
        for filename in self.filenames:
            with open(filename, 'r', newline='') as csvfile:
                for line in csvfile:
                    yield line

filenames = 'segment-1.dat', 'segment-2.dat'
reader = csv.DictReader(ChainedCSVfiles(filenames),
                        fieldnames=('field1', 'field2', 'field3'))
for row in reader:
    print(row)
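For what it's worth, read() is not actually required: csv.DictReader accepts any iterable of strings, so the itertools.chain the question reaches for also works directly. A sketch with the same placeholder filenames and field names:
import csv
from itertools import chain

a = open('segment-1.dat', newline='')
b = open('segment-2.dat', newline='')
reader = csv.DictReader(chain(a, b),
                        fieldnames=('field1', 'field2', 'field3'))
for row in reader:
    print(row)
a.close()
b.close()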

Iterator using itertools is skipping a line

I have the feeling that my question is related to Why does takewhile() skip the first line?
I haven't found satisfactory answers in there though.
My examples below use the following modules
import csv
from itertools import takewhile
Here is my problem.
I have a csv file which I want to parse using itertools.
For instance, I want to separate the header from the content.
This is spotted by the presence of a keyword in the first column.
Here is file.csv example
a, content
b, content
KEYWORD, something else
c, let's continue
The first two lines compose the header of the file.
The KEYWORD line separates it from the content: the last line.
Even if it is not properly part of the content, I want to parse the separation row.
with open('file.csv', 'rb') as f:
    reader = csv.reader(f)
    header = takewhile(lambda x: x[0] != 'KEYWORD', reader)
    for row in header:
        print(row)
    print('End of header')
    for row in reader:
        print(row)
I was not expecting this, but the KEYWORD line is skipped.
As you will see in the following output:
['a', ' content']
['b', ' content']
End of header
['c', " let's continue"]
I have tried simulating the csv reader to see if it was coming from there.
But apparently not.
The following code produces the same behavior.
l = [['a', 'content'],
     ['b', 'content'],
     ['KEYWORD', 'something else'],
     ['c', "let's continue"]]

i = iter(l)
header = takewhile(lambda x: x[0] != 'KEYWORD', i)
for row in header:
    print(row)
print('End of header')
for row in i:
    print(row)
How can I use the feature of takewhile while preventing the following for loop from skipping the unparsed line?
As I understand it, the first for loop calls next on the iterator to test its content.
The second calls next once again to gather the value,
and the separation row is hence skipped.
I think you will have to restructure - takewhile isn't a good fit for what you are doing. The problem is that takewhile has to read the line starting 'KEYWORD' to determine that it has reached a line it shouldn't take, and once the line is read the file's "read head" is at the start of the next line. Similarly, with iter, takewhile has already consumed (but discarded) the line starting 'KEYWORD' when you start for row in i.
One alternative would be something like:
header = []
content = []
target = header
for row in reader:
    if row[0] == 'KEYWORD':
        target = content
    target.append(row)
You can write your own takewhile like this.
def takewhile(predicate, iterable):
    for x in iterable:
        yield x
        if not predicate(x):
            break
test:
>>> list(takewhile(lambda x: x != 3, range(10)))
[0, 1, 2, 3]
Thanks to @jonrsharpe, I came to question myself on some tricks to code this.
Here is what I reached:
class RewindableFile(file):
    def __init__(self, *args, **kwargs):
        nb_backup = kwargs.pop('nb_backup', 1)
        super(RewindableFile, self).__init__(*args, **kwargs)
        self._nb_backup = nb_backup
        self._backups = []
        self._time_anchor = 0

    def next(self):
        if self._time_anchor >= 0:
            item = super(RewindableFile, self).next()
            self._backup(item)
            return item
        else:
            item = self._forward()
            return item

    def rewind(self):
        self._time_anchor = self._time_anchor - 1
        time_bound = min(self._nb_backup, len(self._backups))
        if self._time_anchor < -time_bound:
            raise Exception('You have gone too far in history...')

    def __iter__(self):
        return self

    def _backup(self, row):
        self._backups.append(row)
        extra_items = len(self._backups) - self._nb_backup
        if extra_items > 0:
            del self._backups[0:extra_items]

    def _forward(self):
        item = self._backups[self._time_anchor]
        self._time_anchor = self._time_anchor + 1
        return item
And here is how I use it:
with RewindableFile('csv.csv', 'rb') as f:
    def test_kwd_and_rewind(x):
        if x[0] != 'KEYWORD':
            return True
        else:
            f.rewind()
            return False

    reader = csv.reader(f)
    header = takewhile(test_kwd_and_rewind, reader)
    for row in header:
        print(row)
    print('End of header')
    for row in reader:
        print(row)
I could have also overloaded the read and readline functions to save the jump, but I don't need them here.
jonrsharpe has it right. This isn't quite a job for takewhile. itertools also has a groupby function which can more easily handle the splitting. The LastHeader class below keeps a record of the last header line passed through the check method, and returns a reference to it each time check is called.
This lets you run through the file a single time, without having to backtrack.
from itertools import groupby

class LastHeader():
    """Checks for new header strings. For use with groupby"""
    def __init__(self, sentinel='#'):
        self.sentinel = sentinel
        self.lastheader = ''

    def check(self, line):
        if line.startswith(self.sentinel):
            self.lastheader = line
        return self.lastheader

with open(fname, 'r') as fobj:
    lastheader = LastHeader(sentinel)
    for headerline, readlines in groupby(fobj, lastheader.check):
        foo(headerline)
        for line in readlines:
            bar(line)
where foo and bar are whatever processing you need to do on the headers and data.
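Adapted to the KEYWORD file from the original question, a minimal sketch (SectionTracker is an illustrative name; the key it returns flips from 'header' to 'content' only after the separator row has been handed out, so nothing is consumed and lost):
import csv
from itertools import groupby

class SectionTracker:
    def __init__(self):
        self.section = 'header'

    def __call__(self, row):
        current = self.section
        if row and row[0] == 'KEYWORD':
            # the separator row stays in the header group; everything after it is content
            self.section = 'content'
        return current

with open('file.csv', newline='') as f:
    for section, rows in groupby(csv.reader(f), SectionTracker()):
        print('--', section, '--')
        for row in rows:
            print(row)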

How to read a CSV file in reverse order in Python?

I know how to do it for a TXT file, but now I am having some trouble doing it for a CSV file.
How can I read a CSV file from the bottom in Python?
Pretty much the same way as for a text file: read the whole thing into a list and then go backwards:
import csv

with open('test.csv', 'r') as textfile:
    for row in reversed(list(csv.reader(textfile))):
        print(', '.join(row))
If you want to get fancy, you could write a lot of code that reads blocks starting at the end of the file and working backwards, emitting a line at a time, and then feed that to csv.reader, but that will only work with a file that can be seeked, i.e. disk files but not standard input.
Some of us have files that do not fit into memory; could anyone come up with a solution that does not require storing the entire file in memory?
That's a bit trickier. Luckily, all csv.reader expects is an iterator-like object that returns a string (line) per call to next(). So we grab the technique Darius Bacon presented in "Most efficient way to search the last x lines of a file in python" to read the lines of a file backwards, without having to pull in the whole file:
import os

def reversed_lines(file):
    "Generate the lines of file in reverse order."
    part = ''
    for block in reversed_blocks(file):
        for c in reversed(block):
            if c == '\n' and part:
                yield part[::-1]
                part = ''
            part += c
    if part: yield part[::-1]

def reversed_blocks(file, blocksize=4096):
    "Generate blocks of file's contents in reverse order."
    file.seek(0, os.SEEK_END)
    here = file.tell()
    while 0 < here:
        delta = min(blocksize, here)
        here -= delta
        file.seek(here, os.SEEK_SET)
        yield file.read(delta)
and feed reversed_lines into the code to reverse the lines before they get to csv.reader, removing the need for reversed and list:
import csv

with open('test.csv', 'r') as textfile:
    for row in csv.reader(reversed_lines(textfile)):
        print(', '.join(row))
There is a more Pythonic solution possible, which doesn't require a character-by-character reversal of the block in memory (hint: just get a list of indices where there are line ends in the block, reverse it, and use it to slice the block), and uses chain out of itertools to glue the line clusters from successive blocks together, but that's left as an exercise for the reader.
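For the curious, here is one possible reading of that hint, offered as a sketch rather than a definitive implementation: it reuses reversed_blocks from above, slices each block at its newline positions instead of reversing it character by character, and glues the line that straddles a block boundary together with a carried-over tail string (rather than itertools.chain). It can be dropped into the csv.reader call above in place of reversed_lines, and it shares the quoted-newline limitation discussed next.
def reversed_lines_sliced(file):
    "Generate the lines of file in reverse order by slicing blocks at newlines."
    tail = ''                              # partial line carried over from the following block
    for block in reversed_blocks(file):
        block += tail
        # positions just past each newline in this block
        ends = [i + 1 for i, c in enumerate(block) if c == '\n']
        prev = len(block)
        for end in reversed(ends):
            if end < prev:                 # skip the empty slice after a trailing newline
                yield block[end:prev]
            prev = end
        tail = block[:prev]                # start of a line that continues in the previous block
    if tail:
        yield tail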
It's worth noting that the reversed_lines() idiom above only works if the columns in the CSV file don't contain newlines.
Aargh! There's always something. Luckily, it's not too bad to fix this:
def reversed_lines(file):
    "Generate the lines of file in reverse order."
    part = ''
    quoting = False
    for block in reversed_blocks(file):
        for c in reversed(block):
            if c == '"':
                quoting = not quoting
            elif c == '\n' and part and not quoting:
                yield part[::-1]
                part = ''
            part += c
    if part: yield part[::-1]
Of course, you'll need to change the quote character if your CSV dialect doesn't use ".
Building on @mike-desimone's answer, here's a solution that provides the same structure as a Python file object but is read in reverse, line by line:
import os

class ReversedFile(object):
    def __init__(self, f, mode='r'):
        """
        Wraps a file object with methods that make it be read in reverse line-by-line.
        If ``f`` is a filename, opens a new file object.
        """
        if mode != 'r':
            raise ValueError("ReversedFile only supports read mode (mode='r')")
        if not type(f) == file:
            # likely a filename
            f = open(f)
        self.file = f
        self.lines = self._reversed_lines()

    def _reversed_lines(self):
        "Generate the lines of file in reverse order."
        part = ''
        for block in self._reversed_blocks():
            for c in reversed(block):
                if c == '\n' and part:
                    yield part[::-1]
                    part = ''
                part += c
        if part: yield part[::-1]

    def _reversed_blocks(self, blocksize=4096):
        "Generate blocks of file's contents in reverse order."
        file = self.file
        file.seek(0, os.SEEK_END)
        here = file.tell()
        while 0 < here:
            delta = min(blocksize, here)
            here -= delta
            file.seek(here, os.SEEK_SET)
            yield file.read(delta)

    def __getattribute__(self, name):
        """
        Allows for the underlying file attributes to come through.
        """
        try:
            # ReversedFile attribute
            return super(ReversedFile, self).__getattribute__(name)
        except AttributeError:
            # self.file attribute
            return getattr(self.file, name)

    def __iter__(self):
        """
        Creates iterator.
        """
        return self

    def seek(self):
        raise NotImplementedError('ReversedFile does not support seek')

    def next(self):
        """
        Next item in the sequence.
        """
        return self.lines.next()

    def read(self):
        """
        Returns the entire contents of the file reversed line by line.
        """
        contents = ''
        for line in self:
            contents += line
        return contents

    def readline(self):
        """
        Returns the next line from the bottom.
        """
        return self.next()

    def readlines(self):
        """
        Returns all remaining lines from the bottom of the file in reverse.
        """
        return [x for x in self]
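A quick usage sketch, under the same Python 2 assumptions as the class above (the filename is illustrative):
import csv

# ReversedFile is the class defined above
reversed_file = ReversedFile('test.csv')
for row in csv.reader(reversed_file):
    print(', '.join(row))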
Go for it. This is a simple program that reverses the rows of a CSV file.
import csv

BC_file = open('Master.csv', 'r')
BC_reader = csv.reader(BC_file)
next(BC_reader)  # skip the header row
for row in reversed(list(BC_reader)):
    print(row[0])

What is the most Pythonic way to modify the function of a function?

I have a function I am using to read in files of a particular format. My function looks likes this:
import csv
from collections import namedtuple

def read_file(f, name, header=True):
    with open(f, mode="r") as infile:
        reader = csv.reader(infile, delimiter="\t")
        if header is True:
            next(reader)
        gene_data = namedtuple("Data", 'id, name, q, start, end, sym')
        for row in reader:
            row = gene_data(*row)
            yield row
I also have another type of file that I would like to read in with this function. However, the other file type needs a few slight parsing steps before I can use the read_file function. For example, trailing periods need to be stripped from column q and the characters atr need to be appended to the id column. Obviously, I could create a new function, or add some optional arguments to the existing function, but is there a simple way to modify this function so that it can be used to read in additional file type(s)? I was thinking of something along the lines of a decorator?
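Before the answers below, here is one hedged sketch of the decorator idea the question mentions (transform_rows, fix_special_format, and read_special_file are illustrative names, not part of the original code). It leaves read_file untouched and post-processes each yielded record, which works for these particular tweaks because they can be applied after the namedtuple is built:
import functools

def transform_rows(transform):
    # decorator factory: apply `transform` to every record the wrapped generator yields
    def decorator(gen_func):
        @functools.wraps(gen_func)
        def wrapper(*args, **kwargs):
            for record in gen_func(*args, **kwargs):
                yield transform(record)
        return wrapper
    return decorator

def fix_special_format(record):
    # the tweaks described above, applied after the namedtuple is built
    return record._replace(id=record.id + 'atr', q=record.q.rstrip('.'))

read_special_file = transform_rows(fix_special_format)(read_file)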
IMHO, the most Pythonic way would be to convert the function into a base class, split the file operations into methods, and override those methods in new classes derived from your base class.
Having such a monolithic function that takes a filename instead of an open file is by itself not very Pythonic. You are trying to implement a stream processor here (file stream -> line stream -> CSV record stream -> [transformator ->] data stream), so using a generator is actually a good idea. I'd slightly refactor this to be a bit more modular:
import csv
from collections import namedtuple

def csv_rows(infile, header):
    reader = csv.reader(infile, delimiter="\t")
    if header: next(reader)
    return reader

def data_sets(infile, header):
    gene_data = namedtuple("Data", 'id, name, q, start, end, sym')
    for row in csv_rows(infile, header):
        yield gene_data(*row)

def read_file_type1(infile, header=True):
    # for this file type, we only need to pass the caller the raw
    # data objects
    return data_sets(infile, header)

def read_file_type2(infile, header=True):
    # for this file type, we have to pre-process the data sets
    # before yielding them. A good way to express this is using a
    # generator expression (we could also add a filtering condition here)
    return (transform_data_set(x) for x in data_sets(infile, header))

# Usage sample:
with open("...", "r") as f:
    for obj in read_file_type1(f):
        print(obj)
As you can see, we have to pass the header argument all the way through the function chain. This is a strong hint that an object-oriented approach would be appropriate here. The fact that we obviously face a hierarchical type structure here (basic data file, type1, type2) supports this.
I suggest you create a row iterator like the following:
with MyFile('f') as f:
    for entry in f:
        foo(entry)
You can do this by implementing a class for your own files with the following traits:
with ( http://docs.python.org/reference/compound_stmts.html#the-with-statement )
container ( http://docs.python.org/reference/datamodel.html#emulating-container-types )
Next to it you may create a function open_my_file(filename) that determines the file type and returns an appropriate file object to work with. This might be a slightly enterprise-style approach, but it is worth implementing if you're dealing with multiple file types, as sketched below.
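A minimal sketch of such a dispatcher, assuming hypothetical GeneFile and SpecialGeneFile wrapper classes that implement the with-statement and iteration traits listed above:
def open_my_file(filename):
    # illustrative heuristic: pick the reader class from the file extension
    if filename.endswith('.special.tsv'):
        return SpecialGeneFile(filename)   # applies the extra parsing steps
    return GeneFile(filename)              # plain tab-separated gene data

# the caller never needs to know which concrete class it got
with open_my_file('data.special.tsv') as f:
    for entry in f:
        foo(entry)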
The object-oriented way would be this:
class GeneDataReader:
    _GeneData = namedtuple('GeneData', 'id, name, q, start, end, sym')

    def __init__(self, filename, has_header=True):
        self._ignore_1st_row = has_header
        self._filename = filename

    def __iter__(self):
        for row in self._tsv_by_row():
            yield self._GeneData(*self.preprocess_row(row))

    def _tsv_by_row(self):
        with open(self._filename, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            if self._ignore_1st_row:
                next(reader)
            for row in reader:
                yield row

    def preprocess_row(self, row):
        # does nothing. override in derived classes
        return row

class SpecializedGeneDataReader(GeneDataReader):
    def preprocess_row(self, row):
        row[0] += 'atr'
        row[2] = row[2].rstrip('.')
        return row
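Usage is then symmetrical for both formats (a sketch; the filenames are placeholders):
for record in GeneDataReader('plain.tsv'):
    print(record.id, record.sym)

for record in SpecializedGeneDataReader('special.tsv'):
    print(record.id, record.q)   # id now ends with 'atr', q has its trailing '.' stripped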
The simplest way would be to modify your currently working code with an extra argument.
def read_file(name, is_special=False, has_header=True):
    with open(name, 'r') as infile:
        reader = csv.reader(infile, delimiter='\t')
        if has_header:
            next(reader)
        Data = namedtuple("Data", 'id, name, q, start, end, sym')
        for row in reader:
            if is_special:
                row[0] += 'atr'
                row[2] = row[2].rstrip('.')
            row = Data(*row)
            yield row
If you are looking for something less nested but still procedure-based:
def tsv_by_row(name, has_header=True):
    with open(name, 'r') as infile:
        reader = csv.reader(infile, delimiter='\t')
        if has_header: next(reader)
        for row in reader:
            yield row

GeneData = namedtuple("Data", 'id, name, q, start, end, sym')

def gene_data_from_vanilla_file(name, has_header=True):
    for row in tsv_by_row(name, has_header):
        yield GeneData(*row)

def gene_data_from_special_file(name, has_header=True):
    for row in tsv_by_row(name, has_header):
        row[0] += 'atr'
        row[2] = row[2].rstrip('.')
        yield GeneData(*row)
How about passing a callback function to read_file()?
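One hedged sketch of what that could look like (the row_filter parameter and special_filter helper are illustrative additions, not from the original answer):
import csv
from collections import namedtuple

def read_file(f, header=True, row_filter=lambda row: row):
    # pass every raw row through the supplied callback before building the namedtuple
    with open(f, mode="r") as infile:
        reader = csv.reader(infile, delimiter="\t")
        if header:
            next(reader)
        gene_data = namedtuple("Data", 'id, name, q, start, end, sym')
        for row in reader:
            yield gene_data(*row_filter(row))

def special_filter(row):
    # the pre-processing steps from the question
    row[0] += 'atr'
    row[2] = row[2].rstrip('.')
    return row

# plain files:   read_file('genes.tsv')
# special files: read_file('genes_special.tsv', row_filter=special_filter)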
In the spirit of Niklas B.'s answer:
import csv, functools
from collections import namedtuple

def consumer(func):
    @functools.wraps(func)
    def start(*args, **kwargs):
        g = func(*args, **kwargs)
        next(g)   # prime the coroutine
        return g
    return start

def csv_rows(infile, header, dest):
    reader = csv.reader(infile, delimiter='\t')
    if header: next(reader)
    for line in reader:
        dest.send(line)

@consumer
def data_sets(dest):
    gene_data = namedtuple("Data", 'id, name, q, start, end, sym')
    while 1:
        row = (yield)
        dest.send(gene_data(*row))

def read_file_1(fn, header=True):
    results, sink = getsink()
    with open(fn) as infile:
        csv_rows(infile, header, data_sets(sink))
    return results

def getsink():
    r = []
    @consumer
    def _sink():
        while 1:
            x = (yield)
            r.append(x)
    return (r, _sink())

@consumer
def transform_data_sets(dest):
    while True:
        data = (yield)
        dest.send(data[::-1])  # or whatever

def read_file_2(fn, header=True):
    results, sink = getsink()
    with open(fn) as infile:
        csv_rows(infile, header, data_sets(transform_data_sets(sink)))
    return results
