I have a function which processes binary data from a file using file.read(len) method. However, my file is huge and is cut into many smaller files 50 MBytes each. Is there some wrapper class that feeds many files into a buffered stream, and provides a read() method?
The fileinput.FileInput class can do something like this, but it supports only line-by-line reading (readline() with no arguments) and has no read(len) for specifying the number of bytes to read.
It's quite easy to concatenate iterables with itertools.chain:
from itertools import chain

def read_by_chunks(file_objects, block_size=1024):
    readers = (iter(lambda f=f: f.read(block_size), '') for f in file_objects)
    return chain.from_iterable(readers)
You can then do:
for chunk in read_by_chunks([f1, f2, f3, f4], 4096):
    handle(chunk)
to process the files in sequence while reading them in chunks of 4096 bytes.
If you need to provide an object with a read method because some other function expects that, you can write a very simple wrapper:
class ConcatFiles(object):
    def __init__(self, files, block_size):
        self._reader = read_by_chunks(files, block_size)

    def __iter__(self):
        return self._reader

    def read(self):
        return next(self._reader, '')
This however only uses a fixed block size. It's possible to support the block_size parameter for the read by doing something like:
def read(self, block_size=None):
    # Assumes __init__ also stored the default: self._block_size = block_size
    block_size = block_size or self._block_size
    total_read = 0
    chunks = []
    for chunk in self._reader:
        chunks.append(chunk)
        total_read += len(chunk)
        if total_read > block_size:
            contents = ''.join(chunks)
            self._reader = chain([contents[block_size:]], self._reader)
            return contents[:block_size]
    return ''.join(chunks)
Note: if you are reading in binary mode you should replace the empty strings '' in the code with empty bytes b''.
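For example, a minimal binary-mode sketch of the same generator (the name read_by_chunks_binary is just illustrative), assuming the files were opened with 'rb':

from itertools import chain

def read_by_chunks_binary(file_objects, block_size=1024):
    # Same idea as read_by_chunks, but the sentinel is b'' because
    # read() on a file opened in binary mode returns bytes.
    readers = (iter(lambda f=f: f.read(block_size), b'') for f in file_objects)
    return chain.from_iterable(readers)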
Instead of converting the list of streams into a generator - as some of the other answers do - you can chain the streams together and then use the file interface:
import io

def chain_streams(streams, buffer_size=io.DEFAULT_BUFFER_SIZE):
    """
    Chain an iterable of streams together into a single buffered stream.
    Usage:
        def generate_open_file_streams():
            for file in filenames:
                yield open(file, 'rb')
        f = chain_streams(generate_open_file_streams())
        f.read()
    """
    class ChainStream(io.RawIOBase):
        def __init__(self):
            self.leftover = b''
            self.stream_iter = iter(streams)
            try:
                self.stream = next(self.stream_iter)
            except StopIteration:
                self.stream = None

        def readable(self):
            return True

        def _read_next_chunk(self, max_length):
            # Return 0 or more bytes from the current stream, first returning all
            # leftover bytes. If the stream is closed returns b''
            if self.leftover:
                return self.leftover
            elif self.stream is not None:
                return self.stream.read(max_length)
            else:
                return b''

        def readinto(self, b):
            buffer_length = len(b)
            chunk = self._read_next_chunk(buffer_length)
            while len(chunk) == 0:
                # move to next stream
                if self.stream is not None:
                    self.stream.close()
                try:
                    self.stream = next(self.stream_iter)
                    chunk = self._read_next_chunk(buffer_length)
                except StopIteration:
                    # No more streams to chain together
                    self.stream = None
                    return 0  # indicate EOF
            output, self.leftover = chunk[:buffer_length], chunk[buffer_length:]
            b[:len(output)] = output
            return len(output)

    return io.BufferedReader(ChainStream(), buffer_size=buffer_size)
Then use it as any other file/stream:
f = chain_streams(open_files_or_chunks)
f.read(len)
I'm not familiar with anything in the standard library that performs that function, so, in case there is none:
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

class ConcatenatedFiles(object):
    def __init__(self, file_objects):
        self.fds = list(reversed(file_objects))

    def read(self, size=None):
        remaining = size
        data = StringIO()
        while self.fds and (remaining is None or remaining > 0):
            data_read = self.fds[-1].read(remaining or -1)
            if remaining is None or len(data_read) < remaining:  # exhausted file
                self.fds.pop()
            if remaining is not None:
                remaining -= len(data_read)
            data.write(data_read)
        return data.getvalue()
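For illustration, a usage sketch (f1, f2 and handle are placeholders for your own open files and processing):

reader = ConcatenatedFiles([f1, f2])
while True:
    data = reader.read(1024)  # read() returns '' once every file is exhausted
    if not data:
        break
    handle(data)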
Another method would be to use a generator:
def read_iter(streams, block_size=1024):
    for stream in streams:
        # yield successive blocks of up to block_size bytes, not single characters
        for chunk in iter(lambda: stream.read(block_size), ''):
            yield chunk
# open file handles
file1 = open('f1.txt', 'r')
file2 = open('f2.txt', 'r')
fileOut = open('out.txt', 'w')

# concatenate files 1 & 2
for chunk in read_iter([file1, file2]):
    # process chunk (in this case, just concatenate to output)
    fileOut.write(chunk)

# close files
file1.close()
file2.close()
fileOut.close()
This shouldn't consume any memory beyond what the base script and a single chunk require; each chunk is passed straight from one file's reader to the output file's writer, repeating until all streams are exhausted.
If you need this behaviour in a class, it could easily be built into a container class, as Bakuriu describes.
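If you do want the generator wrapped in a file-like object, a minimal sketch in the spirit of Bakuriu's ConcatFiles could look like this (ChunkReader is just an illustrative name):

class ChunkReader(object):
    """Thin file-like wrapper around read_iter(); read() hands back one chunk at a time."""
    def __init__(self, streams, block_size=1024):
        self._gen = read_iter(streams, block_size)

    def __iter__(self):
        return self._gen

    def read(self):
        # Return the next chunk, or '' once every stream is exhausted.
        return next(self._gen, '')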
Related
I have a large file that is 143 MB in size. I want to split it into smaller files of 2.5 MB each, put them into a directory, and return the file names. The way I'm attempting to do this is with a generator:
def gen_read(filename, chunk=1024*8):
    with open(filename, "rb") as f:
        for part in iter(lambda: f.read(chunk), b''):
            yield part
The goal is to take this generator, read the file in parts, and write each part to a temporary file until that file reaches 2.5 MB, then increment the numeric extension of the temporary file so I end up with a sort of list of them. I'm trying to do so via this function:
API_TEMP_FILE_PATH = "/tmp"

def random_filename(length=10):
    s = ""
    acceptable = string.ascii_letters
    for _ in range(length):
        s += random.choice(acceptable)
    return s

def split_file(filename, bytes_limit=2621440):
    split_files = []
    file_ext_number = 1
    tmp_filename = random_filename(length=32)
    do_break = False
    while not do_break:
        file_path = "{}/{}_split_file.part_{}".format(API_TEMP_FILE_PATH, tmp_filename, file_ext_number)
        stream = gen_read(filename)
        for part in next(stream):
            if not part:
                do_break = True
            if os.path.exists(file_path):
                size = os.stat(file_path).st_size
                if size > bytes_limit:
                    file_ext_number += 1
            with open(file_path, 'wb') as dest:
                dest.write(part)
    searcher = re.compile('{}\_split\_file\.part\_\d(\d+)?'.format(tmp_filename))
    for filename in os.listdir(API_TEMP_FILE_PATH):
        if searcher.search(filename) is not None:
            split_files.append("{}/{}".format(API_TEMP_FILE_PATH, filename))
    return split_files
The issue I'm running into is that my generator is only producing 1 "character" at a time (can be seen by adding print(repr(part)) right underneath the for part):
...
'\x10'
'\x00'
'\x00'
'\x00'
'\x00'
'\x05'
'\x00'
'\x00'
'\x10'
...
Because of that, the file size never changes from 1 byte. What am I doing wrong that keeps this file-split function from working as expected?
I figured it out: instead of using a generator, I just read a set amount from the file at a time:
def split_file(filename, bytes_limit=2621440):
    split_files = []
    file_ext_number = 1
    tmp_filename = random_filename(length=32)
    file_path = "{}/{}_split_file.part_".format(API_TEMP_FILE_PATH, tmp_filename)
    with open(filename, "rb") as source:
        byte = source.read(bytes_limit)
        while byte:
            open(file_path + "{}".format(file_ext_number), 'wb').write(byte)
            byte = source.read(bytes_limit)
            file_ext_number += 1
    searcher = re.compile('{}\_split\_file\.part\_\d(\d+)?'.format(tmp_filename))
    for filename in os.listdir(API_TEMP_FILE_PATH):
        if searcher.search(filename) is not None:
            split_files.append("{}/{}".format(API_TEMP_FILE_PATH, filename))
    return split_files
It produces all the correct files.
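For what it's worth, the original generator approach could also have worked by iterating gen_read() directly instead of looping over the bytes of a single next() call. An untested sketch, reusing gen_read, random_filename and API_TEMP_FILE_PATH from above:

def split_file_with_generator(filename, bytes_limit=2621440):
    # Consume gen_read() chunk by chunk, rolling over to a new part file
    # whenever the current one would exceed the size limit.
    tmp_filename = random_filename(length=32)
    file_ext_number = 1
    written = 0
    dest = open("{}/{}_split_file.part_{}".format(
        API_TEMP_FILE_PATH, tmp_filename, file_ext_number), 'wb')
    for part in gen_read(filename):
        if written + len(part) > bytes_limit:
            dest.close()
            file_ext_number += 1
            written = 0
            dest = open("{}/{}_split_file.part_{}".format(
                API_TEMP_FILE_PATH, tmp_filename, file_ext_number), 'wb')
        dest.write(part)
        written += len(part)
    dest.close()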
I'm trying to read a huge csv.gz file from a URL in chunks and write it into a database on the fly. I have to do all of this in memory; no data can exist on disk.
I have the generator function below, which turns the response chunks into DataFrame objects.
It works using the requests library's response.raw as input to pd.read_csv, but it appears unreliable and can sometimes throw a timeout error: urllib3.exceptions.ProtocolError: ('Connection broken: OSError("(10054, \'WSAECONNRESET\')",)', OSError("(10054, 'WSAECONNRESET')",))
response = session.get(target, stream=True)
df_it = pd.read_csv(response.raw, compression='gzip', chunksize=10**6,
                    header=None, dtype=str, names=columns, parse_dates=['datetime'])
for i, df in enumerate(self.process_df(df_it)):
    if df.empty:
        continue
    if (i % 10) == 0:
        time.sleep(10)
    yield df
I decided to use iter_content instead, as I read it should be more reliable. I have implemented the below functionality, but I'm getting this error: EOFError: Compressed file ended before the end-of-stream marker was reached.
I think it's because I'm passing in a compressed bytes object, but I'm not sure how to give pandas.read_csv an object it will accept.
response = session.get(target, stream=True)
for chunk in response.iter_content(chunk_size=10**6):
    file_obj = io.BytesIO()
    file_obj.write(chunk)
    file_obj.seek(0)
    df_it = pd.read_csv(file_obj, compression='gzip', dtype=str,
                        header=None, names=columns, parse_dates=['datetime'])
    for i, df in enumerate(self.process_df(df_it)):
        if df.empty:
            continue
        if (i % 10) == 0:
            time.sleep(10)
        yield df
Any ideas greatly appreciated !
Thanks
You may wish to try this:
import io

def iterable_to_stream(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
    """
    Lets you use an iterable (e.g. a generator) that yields bytestrings as a read-only
    input stream.
    The stream implements Python 3's newer I/O API (available in Python 2's io module).
    For efficiency, the stream is buffered.
    """
    class IterStream(io.RawIOBase):
        def __init__(self):
            self.leftover = None

        def readable(self):
            return True

        def readinto(self, b):
            try:
                l = len(b)  # We're supposed to return at most this much
                chunk = self.leftover or next(iterable)
                output, self.leftover = chunk[:l], chunk[l:]
                b[:len(output)] = output
                return len(output)
            except StopIteration:
                return 0  # indicate EOF

    return io.BufferedReader(IterStream(), buffer_size=buffer_size)
Then
response = session.get(target, stream=True)
response.raw.decode_content = True  # let urllib3 decode the Content-Encoding (gzip) for us
df = pd.read_csv(iterable_to_stream(response.iter_content()), sep=';')
I use this to stream csv files in odsclient. It seems to work, although I did not try with gz compression.
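If the response is not transparently decompressed for you, one untested variation is to wrap the stream in gzip.GzipFile before handing it to pandas (session, target, columns and process are placeholders from the question):

import gzip
import pandas as pd

response = session.get(target, stream=True)
compressed = iterable_to_stream(response.iter_content())
# GzipFile pulls compressed bytes from the buffered stream as needed and
# hands the decompressed csv data to pandas.
with gzip.GzipFile(fileobj=compressed) as plain:
    for df in pd.read_csv(plain, chunksize=10**6, header=None,
                          dtype=str, names=columns):
        process(df)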
Source: https://stackoverflow.com/a/20260030/7262247
I am looking to get the first N mb of a file. Here is a basic implementation:
def get_first_n_mb(self, file=None, n=5):
    """
    Will return the first 5 (or N) MB of the passed file
    """
    file = file or self.file
    with open(file, 'rb') as fp:
        file_data = self.file_first_n_mb = fp.read(int(1e6 * n))  # read() needs an int
    return file_data
However, the user may pass a large number, such as n = 1000, in which case we would want to chunk the read. What would be a good 'size' to do the chunk, or would the above approach still work? How could it be improved?
read() is permitted to return less than the amount you asked for. You should call it in a loop until you reach the amount requested or EOF. You need to keep reducing the amount you need to read by the size of the last read.
def get_first_n_mb(self, file=None, n=5):
    file = file or self.file
    amt = int(1e6 * n)
    file_data = b''
    with open(file, 'rb') as fp:
        while amt > 0:
            block = fp.read(amt)
            if not block:  # EOF: read() returns b'' at end of file rather than raising
                break
            file_data += block
            amt -= len(block)
    return file_data
For ordinary files read() will normally return as much as you request, as long as the file is that long. But other types of streams will often return less (e.g. reading from a terminal will usually just return one line).
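If you would rather cap memory per read call regardless of n, a sketch that reads in fixed-size blocks (1 MB here, an arbitrary choice) until the limit is hit:

def get_first_n_mb_chunked(path, n=5, block_size=10**6):
    """Read at most n MB from path, block_size bytes at a time."""
    limit = int(1e6 * n)
    chunks = []
    read_so_far = 0
    with open(path, 'rb') as fp:
        while read_so_far < limit:
            block = fp.read(min(block_size, limit - read_so_far))
            if not block:  # EOF before reaching the limit
                break
            chunks.append(block)
            read_so_far += len(block)
    return b''.join(chunks)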
try:
    content = open("/tmp/out").read()
except:
    content = ""
Can I make this any shorter or more elegant? I have to do it for more than one file, so I want something shorter.
Is writing a function the only shorter way to do it?
What I actually want is this, but I want to concatenate "" if there is any exception:
lines = (open("/var/log/log.1").read() + open("/var/log/log").read()).split("\n")
Yes, you'll have to write something like
def get_contents(filename):
    try:
        with open(filename) as f:
            return f.read()
    except EnvironmentError:
        return ''

lines = (get_contents('/var/log/log.1')
         + get_contents('/var/log/log')).split('\n')
NlightNFotis raises a valid point: if the files are big, you don't want to do this. Maybe you'd write a line generator that accepts a list of filenames:
def get_lines(filenames):
    for fname in filenames:
        try:
            with open(fname) as f:
                for line in f:
                    yield line
        except EnvironmentError:
            continue

...

for line in get_lines(["/var/log/log.1", "/var/log/log"]):
    do_stuff(line)
Another way is to use the standard fileinput.FileInput class (thanks, J.F. Sebastian):
import fileinput
import os

def eat_errors(f, mode):
    try:
        return open(f, mode)
    except IOError:
        return open(os.devnull)

for line in fileinput.FileInput(["/var/log/log.1", "/var/log/log"], openhook=eat_errors):
    do_stuff(line)
This code monkey-patches the built-in open with a replacement that returns a FakeFile, which always yields an "empty" string, whenever open raises an IOError.
Whilst it's more code than you'd really want to write for the problem at hand, it does mean that you have a reusable context manager for faking open if the need arises again (probably twice in the next decade)
with monkey_patched_open():
    ...
Actual code.
#!/usr/bin/env python

from contextlib import contextmanager
from StringIO import StringIO

################################################################################
class FakeFile(StringIO):
    def __init__(self):
        StringIO.__init__(self)
        self.count = 0

    def read(self, n=-1):
        return "<empty#1>"

    def readlines(self, sizehint=0):
        return ["<empty#2>"]

    def next(self):
        if self.count == 0:
            self.count += 1
            return "<empty#3>"
        else:
            raise StopIteration

################################################################################
@contextmanager
def monkey_patched_open():
    global open
    old_open = open
    def new_fake_open(filename, mode="r"):
        try:
            fh = old_open(filename, mode)
        except IOError:
            fh = FakeFile()
        return fh
    open = new_fake_open
    try:
        yield
    finally:
        open = old_open

################################################################################
with monkey_patched_open():
    for line in open("NOSUCHFILE"):
        print "NOSUCHFILE->", line
    print "Other", open("MISSING").read()
    print "OK", open(__file__).read()[:30]
Running the above gives:
NOSUCHFILE-> <empty#3>
Other <empty#1>
OK #!/usr/bin/env python
from co
I left in the "empty" strings just to show what was happening.
StringIO would have sufficed just to read it once but I thought the OP was looking to keep reading from file, hence the need for FakeFile - unless someone knows of a better mechanism.
I know some see monkey patching as the act of a scoundrel.
You could try the following, but it's probably not the best:
import os

def chk_file(filename):
    if os.stat(filename).st_size == 0:
        return ""
    else:
        with open(filename) as f:
            return f.readlines()

if __name__ == "__main__":
    print chk_file("foobar.txt")  # populated file
    print chk_file("bar.txt")     # empty file
    print chk_file("spock.txt")   # populated
It works. You can wrap it with your try-except, if you want.
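If you do wrap it, a minimal sketch (the name chk_file_safe is illustrative; missing files then also come back as ""):

def chk_file_safe(filename):
    try:
        return chk_file(filename)
    except (OSError, IOError):  # os.stat or open failed, e.g. the file is missing
        return ""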
You could define a function to catch errors:
from itertools import chain

def readlines(filename):
    try:
        with open(filename) as file:
            return file.readlines()  # or just `file` to return an iterator
    except EnvironmentError:
        return []

files = (readlines(name) for name in ["/var/log/1", "/var/log/2"])
lines = list(chain.from_iterable(files))
I know how to do it for a TXT file, but now I am having some trouble doing it for a CSV file.
How can I read a CSV file from the bottom in Python?
Pretty much the same way as for a text file: read the whole thing into a list and then go backwards:
import csv

with open('test.csv', 'r') as textfile:
    for row in reversed(list(csv.reader(textfile))):
        print ', '.join(row)
If you want to get fancy, you could write a lot of code that reads blocks starting at the end of the file and working backwards, emitting a line at a time, and then feed that to csv.reader, but that will only work with a file that can be seeked, i.e. disk files but not standard input.
Some of us have files that do not fit into memory; could anyone come up with a solution that does not require storing the entire file in memory?
That's a bit trickier. Luckily, all csv.reader expects is an iterator-like object that returns a string (line) per call to next(). So we grab the technique Darius Bacon presented in "Most efficient way to search the last x lines of a file in python" to read the lines of a file backwards, without having to pull in the whole file:
import os

def reversed_lines(file):
    "Generate the lines of file in reverse order."
    part = ''
    for block in reversed_blocks(file):
        for c in reversed(block):
            if c == '\n' and part:
                yield part[::-1]
                part = ''
            part += c
    if part: yield part[::-1]

def reversed_blocks(file, blocksize=4096):
    "Generate blocks of file's contents in reverse order."
    file.seek(0, os.SEEK_END)
    here = file.tell()
    while 0 < here:
        delta = min(blocksize, here)
        here -= delta
        file.seek(here, os.SEEK_SET)
        yield file.read(delta)
and feed reversed_lines into the code to reverse the lines before they get to csv.reader, removing the need for reversed and list:
import csv

with open('test.csv', 'r') as textfile:
    for row in csv.reader(reversed_lines(textfile)):
        print ', '.join(row)
There is a more Pythonic solution possible, which doesn't require a character-by-character reversal of the block in memory (hint: just get a list of indices where there are line ends in the block, reverse it, and use it to slice the block), and uses chain out of itertools to glue the line clusters from successive blocks together, but that's left as an exercise for the reader.
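For the curious, here is one rough, lightly-tested sketch of that idea: find the newline positions in each block, slice the block on them, and use chain.from_iterable to glue the per-block line groups together (it shares the embedded-newline caveat discussed below):

from itertools import chain

def reversed_lines_sliced(file):
    "Sketch: yield lines in reverse order by slicing blocks on newline positions."
    def line_groups():
        carry = ''  # text belonging to a line that starts in an earlier block
        for block in reversed_blocks(file):
            block += carry
            # Positions just after every '\n' in the block.
            cuts = [i + 1 for i, c in enumerate(block) if c == '\n']
            if not cuts:
                carry = block              # no complete line yet; keep accumulating
                continue
            carry = block[:cuts[0]]        # tail of a line that starts further back
            bounds = zip(cuts, cuts[1:] + [len(block)])
            # Complete lines in this block, last one first.
            yield [block[a:b] for a, b in reversed(list(bounds)) if a < b]
        if carry:
            yield [carry]                  # the very first line of the file
    return chain.from_iterable(line_groups())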
It's worth noting that the reversed_lines() idiom above only works if the columns in the CSV file don't contain newlines.
Aargh! There's always something. Luckily, it's not too bad to fix this:
def reversed_lines(file):
    "Generate the lines of file in reverse order."
    part = ''
    quoting = False
    for block in reversed_blocks(file):
        for c in reversed(block):
            if c == '"':
                quoting = not quoting
            elif c == '\n' and part and not quoting:
                yield part[::-1]
                part = ''
            part += c
    if part: yield part[::-1]
Of course, you'll need to change the quote character if your CSV dialect doesn't use ".
Building on @mike-desimone's answer, here's a solution that provides the same interface as a Python file object but is read in reverse, line by line:
import os

class ReversedFile(object):
    def __init__(self, f, mode='r'):
        """
        Wraps a file object with methods that make it be read in reverse line-by-line.
        If ``f`` is a filename, opens a new file object.
        """
        if mode != 'r':
            raise ValueError("ReversedFile only supports read mode (mode='r')")
        if not type(f) == file:
            # likely a filename
            f = open(f)
        self.file = f
        self.lines = self._reversed_lines()

    def _reversed_lines(self):
        "Generate the lines of file in reverse order."
        part = ''
        for block in self._reversed_blocks():
            for c in reversed(block):
                if c == '\n' and part:
                    yield part[::-1]
                    part = ''
                part += c
        if part: yield part[::-1]

    def _reversed_blocks(self, blocksize=4096):
        "Generate blocks of file's contents in reverse order."
        file = self.file
        file.seek(0, os.SEEK_END)
        here = file.tell()
        while 0 < here:
            delta = min(blocksize, here)
            here -= delta
            file.seek(here, os.SEEK_SET)
            yield file.read(delta)

    def __getattribute__(self, name):
        """
        Allows for the underlying file attributes to come through.
        """
        try:
            # ReversedFile attribute
            return super(ReversedFile, self).__getattribute__(name)
        except AttributeError:
            # self.file attribute
            return getattr(self.file, name)

    def __iter__(self):
        """
        Creates iterator.
        """
        return self

    def seek(self):
        raise NotImplementedError('ReversedFile does not support seek')

    def next(self):
        """
        Next item in the sequence.
        """
        return self.lines.next()

    def read(self):
        """
        Returns the entire contents of the file reversed line by line.
        """
        contents = ''
        for line in self:
            contents += line
        return contents

    def readline(self):
        """
        Returns the next line from the bottom.
        """
        return self.next()

    def readlines(self):
        """
        Returns all remaining lines from the bottom of the file in reverse.
        """
        return [x for x in self]
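A quick usage sketch ('test.csv' is a placeholder), feeding the wrapper straight to csv.reader:

import csv

rev = ReversedFile('test.csv')
for row in csv.reader(rev):
    print(', '.join(row))  # rows come out last-to-first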
Go for it. This is a simple program to reverse the rows of a CSV file:
import csv

BC_file = open('Master.csv', 'rb')
BC_reader = csv.reader(BC_file)
next(BC_reader)  # skip the header row
for row in reversed(list(BC_reader)):
    print row[0]