Read first N mb of a file - python

I am looking to get the first N mb of a file. Here is a basic implementation:
def get_first_n_mb(self, file=None, n=5):
"""
Will return the first 5 (or N) MB of the passed file
"""
file = file or self.file
with open(file, 'rb') as fp:
file_data = self.file_first_n_mb = fp.read(1e6 * n)
return file_data
However, the user may pass a large number, such as n = 1000, in which case we would want to chunk the read. What would be a good 'size' to do the chunk, or would the above approach still work? How could it be improved?

read() is permitted to return less than the amount you asked for. You should call it in a loop until you reach the amount requested or EOF. You need to keep reducing the amount you need to read by the size of the last read.
def get_first_n_mb(self, file=None, n=5):
file = file or self.file
amt = 1e6 * n
file_data = ''
with open(file, 'rb') as fp:
while amt > 0:
try:
block = fp.read(amt)
file_data += block
amt -= len(block)
except EOFError:
break
return file_data
For ordinary files read() will normally return as much as you request, as long as the file is that long. But other types of streams will often return less (e.g. reading from a terminal will usually just return one line).

Related

python3: Read huge file (~ 800 Gb), split the lines by condition, and append them to the end of new files in a memory-efficient way

I’m learning python 3, and I’m dealing with a huge txt file (~800Gb).
The enclosed function 'kmers_dic' while it read the main file, if the condition in if statement is satisfied, it should append the line in the previously created files (these files are 1024 and they are named with content of the kmers variable). The function work fine with a subset of the principal file, but when I run the code using the main file, my job is killed because I reached a memory usage limit.
def OpenFiles(i):
'''
A switch to handle file opening and reduce duplicated code
'''
open_method = {
"gz": gzip.open,
"norm": open
}
return open_method[i]
def rows(f, chunksize=102400, sep='\n'):
"""
Read a file where the row separator is '\n' lazily.
Default chunk size: 102400kB 100Mb.
Usage:
>>> with open('big.csv') as f:
>>> for r in rows(f):
>>> process(r)
"""
curr_row = ''
while True:
chunk = f.read(chunksize)
if chunk == '': # End of file
break
while True:
i = chunk.find(sep)
if i == -1:
break
yield curr_row + chunk[:i]
curr_row = ''
chunk = chunk[i+1:]
curr_row += chunk
def kmers_dic(input_file,kmers,out_dir):
'''
file writing by kmers
'''
#kmers_dic = set()
count_line=0
count_line_1=0
if input_file.endswith('.gz'):
nano_read = OpenFiles('gz')
else:
nano_read = OpenFiles('norm')
with nano_read(input_file, 'rt') as nano_f:
chunk = rows(nano_f,chunksize=2024,sep='\n')
for line in chunk:
count_line+=1
count_line_1+=1
sys.stdout.write('%s\r' % count_line)
sys.stdout.flush()
line = line.strip('\n')
line = line.split()
if line[2] in kmers:
kmer = line[2]
Out_f_name = out_dir+line[2]+'.lib'
file1 = open(Out_f_name, 'a')
##file1.write('\t'.join(line) + '\n') # print entire line
file1.write('\t'.join(line[1:4:]+line[6:9:]+line[9:13:]+line[15:]) + '\n')
file1.close()
print("lines: ",count_line_1)
I'm not understanding where is the issue.
Can you help me ?
Thanks in advance!
Best.
curr_row += chunk causes you keep all chunks in memory until you run out of free memory.

Splitting a large file into multiple other smaller files using a generator

I have a large file that is 143mb in size. I want to split the file into smaller files that are 2.5mb in size, put them into a directory and return the file names. The way I'm attempting to do this is with a generator:
def gen_read(filename, chunk=1024*8):
with open(filename, "rb") as f:
for part in iter(lambda: f.read(chunk), b''):
yield part
The goal is to take this generator and read the file into parts from there write each part into a temporary filename until the file is 2.5mb in size and add to the extension of the temporary file to have a sort of list of them. I'm trying to do so via this function:
API_TEMP_FILE_PATH = "/tmp"
def random_filename(length=10):
s = ""
acceptable = string.ascii_letters
for _ in range(length):
s += random.choice(acceptable)
return s
def split_file(filename, bytes_limit=2621440):
split_files = []
file_ext_number = 1
tmp_filename = random_filename(length=32)
do_break = False
while not do_break:
file_path = "{}/{}_split_file.part_{}".format(API_TEMP_FILE_PATH, tmp_filename, file_ext_number)
stream = gen_read(filename)
for part in next(stream):
if not part:
do_break = True
if os.path.exists(file_path):
size = os.stat(file_path).st_size
if size > bytes_limit:
file_ext_number += 1
with open(file_path, 'wb') as dest:
dest.write(part)
searcher = re.compile('{}\_split\_file\.part\_\d(\d+)?'.format(tmp_filename))
for filename in os.listdir(API_TEMP_FILE_PATH):
if searcher.search(filename) is not None:
split_files.append("{}/{}".format(API_TEMP_FILE_PATH, filename))
return split_files
The issue I'm running into is that my generator is only producing 1 "character" at a time (can be seen by adding print(repr(part)) right underneath the for part):
...
'\x10'
'\x00'
'\x00'
'\x00'
'\x00'
'\x05'
'\x00'
'\x00'
'\x10'
...
As for that, the file size never changes from 1. What am I doing wrong to where this file split function isn't working as expected?
I figured it out, instead of using a generator I just read into the file a certain amount:
def split_file(filename, bytes_limit=2621440):
split_files = []
file_ext_number = 1
tmp_filename = random_filename(None, length=32, is_pcap=False)
file_path = "{}/{}_split_file.part_".format(API_TEMP_FILE_PATH, tmp_filename)
with open(filename, "rb") as source:
byte = source.read(bytes_limit)
while byte:
open(file_path + "{}".format(file_ext_number), 'wb').write(byte)
byte = source.read(bytes_limit)
file_ext_number += 1
searcher = re.compile('{}\_split\_file\.part\_\d(\d+)?'.format(tmp_filename))
for filename in os.listdir(API_TEMP_FILE_PATH):
if searcher.search(filename) is not None:
split_files.append("{}/{}".format(API_TEMP_FILE_PATH, filename))
return split_files
It produces all the correct files

Get filesize and md5 hash in one pass, reading the file only once

How can I open a file, calculate its md5 hash and filesize, while only scanning through the file one time?
Right now I'm doing:
def getMD5Hash(fname):
""" Returns an md5 hash
"""
try:
with open(fname,'rb') as fo:
md5 = hashlib.md5()
chunk_sz = md5.block_size * 128
data = fo.read(chunk_sz)
while data:
md5.update(data)
data = fo.read(chunk_sz)
md5hash = base64.urlsafe_b64encode(md5.digest()).decode('UTF-8').rstrip('=\n')
except IOError:
md5hash = None
return md5hash
size = os.path.getsize(fname)
hash = getMD5Hash(fname)
But, from what I understand, this requires two passes of the file and could be more efficient.
A file does not have to be scanned to get its length. The filesystem knows how big a file is.
If you insist on doing it manually, set size = 0 then do size += len(data) inside your while loop.
Of course your getMD5Hash() is now getMD5Hash_and_size().

Stream multiple files into a readable object in Python

I have a function which processes binary data from a file using file.read(len) method. However, my file is huge and is cut into many smaller files 50 MBytes each. Is there some wrapper class that feeds many files into a buffered stream, and provides a read() method?
Class fileinput.FileInput can do such a thing, but it supports only line-by-line reading (method readline() with no arguments) and does not have read(len) with specifying number of bytes to read.
It's quite easy to concatenate iterables with itertools.chain:
from itertools import chain
def read_by_chunks(file_objects, block_size=1024):
readers = (iter(lambda f=f: f.read(block_size), '') for f in file_objects)
return chain.from_iterable(readers)
You can then do:
for chunk in read_by_chunks([f1, f2, f3, f4], 4096):
handle(chunk)
To process the files in sequence while reading it by chunks of 4096 bytes.
If you need to provide an object with a read method because some other function expects that you can write a very simple wrapper:
class ConcatFiles(object):
def __init__(self, files, block_size):
self._reader = read_by_chunks(files, block_size)
def __iter__(self):
return self._reader
def read(self):
return next(self._reader, '')
This however only uses a fixed block size. It's possible to support the block_size parameter for the read by doing something like:
def read(self, block_size=None):
block_size = block_size or self._block_size
total_read = 0
chunks = []
for chunk in self._reader:
chunks.append(chunk)
total_read += len(chunk)
if total_read > block_size:
contents = ''.join(chunks)
self._reader = chain([contents[block_size:]], self._reader)
return contents[:block_size]
return ''.join(chunks)
Note: if you are reading in binary mode you should replace the empty strings '' in the code with empty bytes b''.
Instead of converting the list of streams into a generator - as some of the other answers do - you can chain the streams together and then use the file interface:
def chain_streams(streams, buffer_size=io.DEFAULT_BUFFER_SIZE):
"""
Chain an iterable of streams together into a single buffered stream.
Usage:
def generate_open_file_streams():
for file in filenames:
yield open(file, 'rb')
f = chain_streams(generate_open_file_streams())
f.read()
"""
class ChainStream(io.RawIOBase):
def __init__(self):
self.leftover = b''
self.stream_iter = iter(streams)
try:
self.stream = next(self.stream_iter)
except StopIteration:
self.stream = None
def readable(self):
return True
def _read_next_chunk(self, max_length):
# Return 0 or more bytes from the current stream, first returning all
# leftover bytes. If the stream is closed returns b''
if self.leftover:
return self.leftover
elif self.stream is not None:
return self.stream.read(max_length)
else:
return b''
def readinto(self, b):
buffer_length = len(b)
chunk = self._read_next_chunk(buffer_length)
while len(chunk) == 0:
# move to next stream
if self.stream is not None:
self.stream.close()
try:
self.stream = next(self.stream_iter)
chunk = self._read_next_chunk(buffer_length)
except StopIteration:
# No more streams to chain together
self.stream = None
return 0 # indicate EOF
output, self.leftover = chunk[:buffer_length], chunk[buffer_length:]
b[:len(output)] = output
return len(output)
return io.BufferedReader(ChainStream(), buffer_size=buffer_size)
Then use it as any other file/stream:
f = chain_streams(open_files_or_chunks)
f.read(len)
I'm not familiar with anything in the standard library that performs that function, so, in case there is none:
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
class ConcatenatedFiles( object ):
def __init__(self, file_objects):
self.fds= list(reversed(file_objects))
def read( self, size=None ):
remaining= size
data= StringIO()
while self.fds and (remaining>0 or remaining is None):
data_read= self.fds[-1].read(remaining or -1)
if len(data_read)<remaining or remaining is None: #exhausted file
self.fds.pop()
if not remaining is None:
remaining-=len(data_read)
data.write(data_read)
return data.getvalue()
Another method would be to use a generator:
def read_iter(streams, block_size=1024):
for stream in streams:
for chunk in stream.read(block_size):
yield chunk
# open file handles
file1 = open('f1.txt', 'r')
file2 = open('f2.txt', 'r')
fileOut = open('out.txt', 'w')
# concatenate files 1 & 2
for chunk in read_iter([file1, file2]):
# process chunk (in this case, just concatenate to output)
fileOut.write(chunk)
# close files
file1.close()
file2.close()
fileOut.close()
This shouldn't consume any memory beyond that required by the base script, and the chunk size; it's passing each chunk straight from one file reader, to the writer of another, then repeating until all streams are complete.
If you need this behaviour in a class, this could easily be build into a container class, as Bakuriu describes.

How to read a CSV file in reverse order in Python?

I know how to do it for a TXT file, but now I am having some trouble doing it for a CSV file.
How can I read a CSV file from the bottom in Python?
Pretty much the same way as for a text file: read the whole thing into a list and then go backwards:
import csv
with open('test.csv', 'r') as textfile:
for row in reversed(list(csv.reader(textfile))):
print ', '.join(row)
If you want to get fancy, you could write a lot of code that reads blocks starting at the end of the file and working backwards, emitting a line at a time, and then feed that to csv.reader, but that will only work with a file that can be seeked, i.e. disk files but not standard input.
Some of us have files that do not fit into memory, could anyone come with a solution that does not require storing the entire file in memory?
That's a bit trickier. Luckily, all csv.reader expects is an iterator-like object that returns a string (line) per call to next(). So we grab the technique Darius Bacon presented in "Most efficient way to search the last x lines of a file in python" to read the lines of a file backwards, without having to pull in the whole file:
import os
def reversed_lines(file):
"Generate the lines of file in reverse order."
part = ''
for block in reversed_blocks(file):
for c in reversed(block):
if c == '\n' and part:
yield part[::-1]
part = ''
part += c
if part: yield part[::-1]
def reversed_blocks(file, blocksize=4096):
"Generate blocks of file's contents in reverse order."
file.seek(0, os.SEEK_END)
here = file.tell()
while 0 < here:
delta = min(blocksize, here)
here -= delta
file.seek(here, os.SEEK_SET)
yield file.read(delta)
and feed reversed_lines into the code to reverse the lines before they get to csv.reader, removing the need for reversed and list:
import csv
with open('test.csv', 'r') as textfile:
for row in csv.reader(reversed_lines(textfile)):
print ', '.join(row)
There is a more Pythonic solution possible, which doesn't require a character-by-character reversal of the block in memory (hint: just get a list of indices where there are line ends in the block, reverse it, and use it to slice the block), and uses chain out of itertools to glue the line clusters from successive blocks together, but that's left as an exercise for the reader.
It's worth noting that the reversed_lines() idiom above only works if the columns in the CSV file don't contain newlines.
Aargh! There's always something. Luckily, it's not too bad to fix this:
def reversed_lines(file):
"Generate the lines of file in reverse order."
part = ''
quoting = False
for block in reversed_blocks(file):
for c in reversed(block):
if c == '"':
quoting = not quoting
elif c == '\n' and part and not quoting:
yield part[::-1]
part = ''
part += c
if part: yield part[::-1]
Of course, you'll need to change the quote character if your CSV dialect doesn't use ".
Building on #mike-desimone 's answer. Here's a solution that provides the same structure as a python file object but is read in reverse, line by line:
import os
class ReversedFile(object):
def __init__(self, f, mode='r'):
"""
Wraps a file object with methods that make it be read in reverse line-by-line
if ``f`` is a filename opens a new file object
"""
if mode != 'r':
raise ValueError("ReversedFile only supports read mode (mode='r')")
if not type(f) == file:
# likely a filename
f = open(f)
self.file = f
self.lines = self._reversed_lines()
def _reversed_lines(self):
"Generate the lines of file in reverse order."
part = ''
for block in self._reversed_blocks():
for c in reversed(block):
if c == '\n' and part:
yield part[::-1]
part = ''
part += c
if part: yield part[::-1]
def _reversed_blocks(self, blocksize=4096):
"Generate blocks of file's contents in reverse order."
file = self.file
file.seek(0, os.SEEK_END)
here = file.tell()
while 0 < here:
delta = min(blocksize, here)
here -= delta
file.seek(here, os.SEEK_SET)
yield file.read(delta)
def __getattribute__(self, name):
"""
Allows for the underlying file attributes to come through
"""
try:
# ReversedFile attribute
return super(ReversedFile, self).__getattribute__(name)
except AttributeError:
# self.file attribute
return getattr(self.file, name)
def __iter__(self):
"""
Creates iterator
"""
return self
def seek(self):
raise NotImplementedError('ReversedFile does not support seek')
def next(self):
"""
Next item in the sequence
"""
return self.lines.next()
def read(self):
"""
Returns the entire contents of the file reversed line by line
"""
contents = ''
for line in self:
contents += line
return contents
def readline(self):
"""
Returns the next line from the bottom
"""
return self.next()
def readlines(self):
"""
Returns all remaining lines from the bottom of the file in reverse
"""
return [x for x in self]
Go for it. This is simple program to reverse the rows from a CSV file.
import csv
BC_file = open('Master.csv', 'rb')
BC_reader = csv.reader(BC_file)
next(BC_reader)
for row in reversed(list(BC_reader)):
print row[0]

Categories

Resources