def streamView(request):
    # zip_path = './pages/data.zip'
    zip_path = './pages/data/test.txt'
    start_index = 0
    end_index = 50
    with open(zip_path, 'rb', 50) as fin:
        fin.seek(start_index)
        data = fin.read(end_index - start_index)
        response = HttpResponse(fin, content_type="application/zip")
        response["Content-Disposition"] = f"attachment; filename={zip_path}"
        return response
So I'm using the above code to read just the first 50 bytes of a file and return them. However, it returns the entire file to the user.
Ideally, I would like to send start and end parameters to this function, read only between those two points, and send that data.
How can I achieve this?
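The file comes back whole because the response is built from the file object fin rather than from the data that was just read. A minimal sketch of one way to do this, assuming a Django view and that start and end arrive as query parameters (the parameter names and the view name here are illustrative):

from django.http import HttpResponse

def stream_view(request):
    # hypothetical usage: /stream/?start=0&end=50
    start_index = int(request.GET.get("start", 0))
    end_index = int(request.GET.get("end", 50))
    zip_path = './pages/data/test.txt'
    with open(zip_path, 'rb') as fin:
        fin.seek(start_index)
        # read only the requested range, not the whole file
        data = fin.read(end_index - start_index)
    response = HttpResponse(data, content_type="application/octet-stream")
    response["Content-Disposition"] = f"attachment; filename={zip_path}"
    return response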
I have a large file that is 143 MB in size. I want to split it into smaller files that are 2.5 MB each, put them into a directory, and return the file names. The way I'm attempting to do this is with a generator:
def gen_read(filename, chunk=1024*8):
    with open(filename, "rb") as f:
        for part in iter(lambda: f.read(chunk), b''):
            yield part
The goal is to take this generator, read the file in parts, and write each part to a temporary file until that file reaches 2.5 MB, incrementing the extension of the temporary file so that I end up with a sort of list of them. I'm trying to do so via this function:
API_TEMP_FILE_PATH = "/tmp"

def random_filename(length=10):
    s = ""
    acceptable = string.ascii_letters
    for _ in range(length):
        s += random.choice(acceptable)
    return s

def split_file(filename, bytes_limit=2621440):
    split_files = []
    file_ext_number = 1
    tmp_filename = random_filename(length=32)
    do_break = False
    while not do_break:
        file_path = "{}/{}_split_file.part_{}".format(API_TEMP_FILE_PATH, tmp_filename, file_ext_number)
        stream = gen_read(filename)
        for part in next(stream):
            if not part:
                do_break = True
            if os.path.exists(file_path):
                size = os.stat(file_path).st_size
                if size > bytes_limit:
                    file_ext_number += 1
            with open(file_path, 'wb') as dest:
                dest.write(part)
    searcher = re.compile('{}\_split\_file\.part\_\d(\d+)?'.format(tmp_filename))
    for filename in os.listdir(API_TEMP_FILE_PATH):
        if searcher.search(filename) is not None:
            split_files.append("{}/{}".format(API_TEMP_FILE_PATH, filename))
    return split_files
The issue I'm running into is that my generator is only producing one "character" at a time (as can be seen by adding print(repr(part)) right underneath the for part line):
...
'\x10'
'\x00'
'\x00'
'\x00'
'\x00'
'\x05'
'\x00'
'\x00'
'\x10'
...
Because of that, the file size never changes from 1. What am I doing wrong that keeps this file split function from working as expected?
I figured it out: instead of using a generator, I just read a certain amount from the file at a time:
def split_file(filename, bytes_limit=2621440):
    split_files = []
    file_ext_number = 1
    tmp_filename = random_filename(length=32)
    file_path = "{}/{}_split_file.part_".format(API_TEMP_FILE_PATH, tmp_filename)
    with open(filename, "rb") as source:
        byte = source.read(bytes_limit)
        while byte:
            open(file_path + "{}".format(file_ext_number), 'wb').write(byte)
            byte = source.read(bytes_limit)
            file_ext_number += 1
    searcher = re.compile('{}\_split\_file\.part\_\d(\d+)?'.format(tmp_filename))
    for filename in os.listdir(API_TEMP_FILE_PATH):
        if searcher.search(filename) is not None:
            split_files.append("{}/{}".format(API_TEMP_FILE_PATH, filename))
    return split_files
It produces all the correct files.
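For reference, the generator version failed because next(stream) returns the first chunk itself, so the for loop iterates over that single chunk byte by byte instead of over the stream of chunks. A sketch of a working generator-based variant, assuming the gen_read and random_filename helpers above (split_file_gen is a hypothetical name):

def split_file_gen(filename, bytes_limit=2621440):
    split_files = []
    file_ext_number = 1
    tmp_filename = random_filename(length=32)
    dest = None
    written = 0
    # iterate over the generator itself, not over one chunk from next()
    for part in gen_read(filename):
        if dest is None or written >= bytes_limit:
            if dest is not None:
                dest.close()
                file_ext_number += 1
            path = "{}/{}_split_file.part_{}".format(API_TEMP_FILE_PATH, tmp_filename, file_ext_number)
            dest = open(path, 'wb')
            split_files.append(path)
            written = 0
        dest.write(part)
        written += len(part)
    if dest is not None:
        dest.close()
    return split_files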
I have code that is working for me in a Cinema 4D project. It is able to read 4 different data points and send outputs to the main project. Currently it reads all of the lines of the CSV file, and I would like to pick one line and pull the data from that line only.
import c4d
import csv

def main():
    path = Spreadsheet  # Spreadsheet is an input filename path
    with open(path, 'rb') as csv_file:
        readed = csv.DictReader(csv_file, delimiter=',')
        for i, row in enumerate(readed):
            try:
                Xcord = float(row["hc_x"])
                Ycord = float(row["hc_y"])
                Langle = float(row["launch_angle"])
                Lspeed = float(row["launch_speed"])
            except:
                print "Error while reading - {}".format(row)
                continue
            global Output1
            global Output2
            global Output3
            global Output4
            Output1 = Xcord
            Output2 = Ycord
            Output3 = Langle
            Output4 = Lspeed
This is about the first thing I have tried to code, so thanks.
csv.DictReader (under Python 3) requires that you open the file in text mode with newline="" in order for it to parse the file correctly:
with open(path, newline="") as csv_file:
    readed = csv.DictReader(csv_file, delimiter=',')
You also don't have any condition to stop reading the file.
row_to_stop = 5
for i, row in enumerate(readed):
    if i == row_to_stop:
        Xcord = float(row["hc_x"])
        Ycord = float(row["hc_y"])
        Langle = float(row["launch_angle"])
        Lspeed = float(row["launch_speed"])
        break
If you only care about one line, don't look up and type cast values until you reach the line you care about.
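Putting both pieces together, a minimal sketch of the whole lookup, assuming Python 3 and the same column names (read_row and row_to_stop are illustrative):

import csv

def read_row(path, row_to_stop=5):
    # text mode with newline="" as the csv docs recommend
    with open(path, newline="") as csv_file:
        reader = csv.DictReader(csv_file, delimiter=',')
        for i, row in enumerate(reader):
            if i == row_to_stop:
                # convert values only once we reach the row we care about
                return (float(row["hc_x"]), float(row["hc_y"]),
                        float(row["launch_angle"]), float(row["launch_speed"]))
    return None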
I would like to pick one line and pull the data from that line only
The code below will return a specific line (by index). You will have to split it and grab the data yourself.
def get_interesting_line(file_name: str, idx: int):
    cnt = 0
    with open(file_name) as f:
        while cnt != idx:
            f.readline()
            cnt += 1
        return f.readline().strip()

# usage example below
print(get_interesting_line('data.txt', 7))
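An equivalent approach, for what it's worth, uses itertools.islice to skip straight to the wanted index; note that next() will raise StopIteration if the file has fewer lines than idx:

from itertools import islice

def get_interesting_line(file_name: str, idx: int):
    with open(file_name) as f:
        # islice yields only line idx; earlier lines are consumed lazily
        return next(islice(f, idx, idx + 1)).strip()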
I am looking to get the first N MB of a file. Here is a basic implementation:
def get_first_n_mb(self, file=None, n=5):
    """
    Will return the first 5 (or N) MB of the passed file
    """
    file = file or self.file
    with open(file, 'rb') as fp:
        # read() needs an int; 1e6 * n is a float
        file_data = self.file_first_n_mb = fp.read(int(1e6 * n))
    return file_data
However, the user may pass a large number, such as n = 1000, in which case we would want to chunk the read. What would be a good size for the chunk, or would the above approach still work? How could it be improved?
read() is permitted to return less than the amount you asked for. You should call it in a loop until you reach the amount requested or EOF. You need to keep reducing the amount you need to read by the size of the last read.
def get_first_n_mb(self, file=None, n=5):
    file = file or self.file
    amt = int(1e6 * n)
    file_data = b''  # bytes, since the file is opened in binary mode
    with open(file, 'rb') as fp:
        while amt > 0:
            block = fp.read(amt)
            if not block:  # read() returns b'' at EOF rather than raising
                break
            file_data += block
            amt -= len(block)
    return file_data
For ordinary files read() will normally return as much as you request, as long as the file is that long. But other types of streams will often return less (e.g. reading from a terminal will usually just return one line).
My error is that unpack requires a string argument of length 1, but the script cannot supply such an argument when it reaches the end of the file. How do I read to the end of the file, converting binary data to int data, without having that error pop up?
ecgSS = []
ecgFB = []
try:
    print("Beginning snipping of ECG data from the holter file...")
    # Get size of file in bytes
    file_size = os.path.getsize(args.filename)
    # Read holter file into memory
    holter = open(args.filename, 'rb')
    ecgCount = 0
    while ecgCount <= file_size:
        packetID = struct.unpack('B', holter.read(1))[0]
        packetSS = struct.unpack('H', holter.read(2))[0]
        packetFB = struct.unpack('H', holter.read(2))[0]
        if(packetID == 0):
            ecgCount += 1
            ecgSS.append(packetSS)
            ecgFB.append(packetFB)
    # Close the file stream
    holter.close()
You have to make sure that the file has enough data before reading. Each iteration of the while loop reads 5 bytes, so the loop should run at most file_size / 5 times (equivalently, you could advance the count by 5 per read and compare it against file_size).
A simple fix is to change the loop condition to
while ecgCount < file_size // 5:
With that fix, you also need two counters: one for the number of records read from the file and one for the valid records. As I see it, you only account for data with packetID == 0, which is a kind of validation, so that needs its own counter. Call it validCount; your program will then look like:
ecgSS = []
ecgFB = []
try:
    print("Beginning snipping of ECG data from the holter file...")
    # Get size of file in bytes
    file_size = os.path.getsize(args.filename)
    # Read holter file into memory
    holter = open(args.filename, 'rb')
    ecgCount = 0
    validCount = 0
    while ecgCount < file_size // 5:
        packetID = struct.unpack('B', holter.read(1))[0]
        packetSS = struct.unpack('H', holter.read(2))[0]
        packetFB = struct.unpack('H', holter.read(2))[0]
        ecgCount += 1
        if(packetID == 0):
            validCount += 1
            ecgSS.append(packetSS)
            ecgFB.append(packetFB)
    # Close the file stream
    holter.close()
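Another way to sidestep the error entirely is to read each 5-byte record in a single call and stop on a short read, so unpack never sees a truncated buffer. A sketch assuming the same record layout, with standard field sizes and native byte order:

record = struct.Struct('=BHH')  # packetID, packetSS, packetFB: 5 bytes

with open(args.filename, 'rb') as holter:
    while True:
        chunk = holter.read(record.size)
        if len(chunk) < record.size:  # EOF or trailing partial record
            break
        packetID, packetSS, packetFB = record.unpack(chunk)
        if packetID == 0:
            ecgSS.append(packetSS)
            ecgFB.append(packetFB)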
How can I open a file and calculate its MD5 hash and file size while only scanning through the file one time?
Right now I'm doing:
def getMD5Hash(fname):
    """ Returns an md5 hash
    """
    try:
        with open(fname, 'rb') as fo:
            md5 = hashlib.md5()
            chunk_sz = md5.block_size * 128
            data = fo.read(chunk_sz)
            while data:
                md5.update(data)
                data = fo.read(chunk_sz)
        md5hash = base64.urlsafe_b64encode(md5.digest()).decode('UTF-8').rstrip('=\n')
    except IOError:
        md5hash = None
    return md5hash
size = os.path.getsize(fname)
hash = getMD5Hash(fname)
But, from what I understand, this requires two passes of the file and could be more efficient.
A file does not have to be scanned to get its length. The filesystem knows how big a file is.
If you insist on doing it manually, set size = 0 then do size += len(data) inside your while loop.
Of course your getMD5Hash() is now getMD5Hash_and_size().
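A minimal sketch of that combined version, accumulating the size inside the existing hashing loop (getMD5Hash_and_size is the name suggested above):

import base64
import hashlib

def getMD5Hash_and_size(fname):
    """ Returns (md5hash, size) while scanning the file only once
    """
    try:
        md5 = hashlib.md5()
        size = 0
        chunk_sz = md5.block_size * 128
        with open(fname, 'rb') as fo:
            data = fo.read(chunk_sz)
            while data:
                md5.update(data)
                size += len(data)  # accumulate the size as we hash
                data = fo.read(chunk_sz)
        md5hash = base64.urlsafe_b64encode(md5.digest()).decode('UTF-8').rstrip('=\n')
    except IOError:
        return None, None
    return md5hash, size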