Conflict Between Reaching EOF and Using Unpack - python

The error I get is that unpack requires a string argument of length 1, but read() no longer returns such an argument once the script reaches the end of the file. How do I keep converting the binary data to int values all the way to the end of the file without that error popping up?
ecgSS = []
ecgFB = []
try:
    print("Beginning snipping of ECG data from the holter file...")
    #Get size of file in bytes
    file_size = os.path.getsize(args.filename)
    #Read holter file into memory
    holter = open(args.filename, 'rb')
    ecgCount = 0
    while ecgCount <= file_size:
        packetID = struct.unpack('B', holter.read(1))[0]
        packetSS = struct.unpack('H', holter.read(2))[0]
        packetFB = struct.unpack('H', holter.read(2))[0]
        if(packetID == 0):
            ecgCount += 1
            ecgSS.append(packetSS)
            ecgFB.append(packetFB)
    #Close the file stream
    holter.close()

You have to make sure that the file has enough data left before reading. On each iteration of the while loop you read 5 bytes, so you have to make sure there are at least 5 bytes remaining before you read; equivalently, the count has to advance by one 5-byte record per iteration.
A simple fix is to change the loop condition to
while ecgCount < file_size/5:
With that fix you also need two counters: one for the number of records in the file and one for the valid records. As far as I can see, you only keep data with packetID == 0, which is a kind of validation, so that needs its own counter. Call it validCount; your program will then look like:
ecgSS = []
ecgFB = []
try:
    print("Beginning snipping of ECG data from the holter file...")
    #Get size of file in bytes
    file_size = os.path.getsize(args.filename)
    #Read holter file into memory
    holter = open(args.filename, 'rb')
    ecgCount = 0
    validCount = 0
    while ecgCount < file_size/5:
        packetID = struct.unpack('B', holter.read(1))[0]
        packetSS = struct.unpack('H', holter.read(2))[0]
        packetFB = struct.unpack('H', holter.read(2))[0]
        ecgCount += 1
        if(packetID == 0):
            validCount += 1
            ecgSS.append(packetSS)
            ecgFB.append(packetFB)
    #Close the file stream
    holter.close()
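If you would rather not depend on the file size at all, here is a minimal alternative sketch (my suggestion, not part of the original answer): read whole 5-byte records and stop on the first short read. It reuses args.filename from the question; the '=BHH' format matches the three separate native-order unpacks above.

import struct

ecgSS = []
ecgFB = []
validCount = 0
with open(args.filename, 'rb') as holter:      # args.filename as in the question
    while True:
        record = holter.read(5)                # one packet: B (ID) + H (SS) + H (FB)
        if len(record) < 5:                    # EOF, or a truncated trailing packet
            break
        packetID, packetSS, packetFB = struct.unpack('=BHH', record)
        if packetID == 0:
            validCount += 1
            ecgSS.append(packetSS)
            ecgFB.append(packetFB)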

Related

python3: Read huge file (~ 800 Gb), split the lines by condition, and append them to the end of new files in a memory-efficient way

I'm learning Python 3, and I'm dealing with a huge txt file (~800 GB).
The enclosed function 'kmers_dic' reads the main file and, whenever the condition in the if statement is satisfied, appends the line to one of the previously created files (there are 1024 of them, named after the contents of the kmers variable). The function works fine on a subset of the main file, but when I run the code on the full file, my job is killed because I hit a memory usage limit.
def OpenFiles(i):
    '''
    A switch to handle file opening and reduce duplicated code
    '''
    open_method = {
        "gz": gzip.open,
        "norm": open
    }
    return open_method[i]

def rows(f, chunksize=102400, sep='\n'):
    """
    Read a file where the row separator is '\n' lazily.
    Default chunk size: 102400kB 100Mb.
    Usage:
    >>> with open('big.csv') as f:
    >>>     for r in rows(f):
    >>>         process(r)
    """
    curr_row = ''
    while True:
        chunk = f.read(chunksize)
        if chunk == '': # End of file
            break
        while True:
            i = chunk.find(sep)
            if i == -1:
                break
            yield curr_row + chunk[:i]
            curr_row = ''
            chunk = chunk[i+1:]
        curr_row += chunk

def kmers_dic(input_file, kmers, out_dir):
    '''
    file writing by kmers
    '''
    #kmers_dic = set()
    count_line = 0
    count_line_1 = 0
    if input_file.endswith('.gz'):
        nano_read = OpenFiles('gz')
    else:
        nano_read = OpenFiles('norm')
    with nano_read(input_file, 'rt') as nano_f:
        chunk = rows(nano_f, chunksize=2024, sep='\n')
        for line in chunk:
            count_line += 1
            count_line_1 += 1
            sys.stdout.write('%s\r' % count_line)
            sys.stdout.flush()
            line = line.strip('\n')
            line = line.split()
            if line[2] in kmers:
                kmer = line[2]
                Out_f_name = out_dir + line[2] + '.lib'
                file1 = open(Out_f_name, 'a')
                ##file1.write('\t'.join(line) + '\n') # print entire line
                file1.write('\t'.join(line[1:4:] + line[6:9:] + line[9:13:] + line[15:]) + '\n')
                file1.close()
    print("lines: ", count_line_1)
I don't understand where the issue is.
Can you help me?
Thanks in advance!
Best.
curr_row += chunk causes you to keep all chunks in memory until you run out of free memory.
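A memory-friendlier sketch (my suggestion, not part of the answer above) is to drop the custom rows() generator entirely: a text-mode file object in Python is already a lazy line iterator, so kmers_dic can stream the file one line at a time. The names below mirror the question; kmers and out_dir are assumed to exist as before.

import gzip
import sys

def kmers_dic(input_file, kmers, out_dir):
    opener = gzip.open if input_file.endswith('.gz') else open
    count_line = 0
    with opener(input_file, 'rt') as nano_f:
        for line in nano_f:                    # lazy: one line in memory at a time
            count_line += 1
            fields = line.split()
            if len(fields) > 2 and fields[2] in kmers:
                out_name = out_dir + fields[2] + '.lib'
                with open(out_name, 'a') as out_f:
                    out_f.write('\t'.join(fields[1:4] + fields[6:9] + fields[9:13] + fields[15:]) + '\n')
    print("lines:", count_line)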

Splitting a large file into multiple other smaller files using a generator

I have a large file that is 143 MB in size. I want to split it into smaller files that are 2.5 MB each, put them into a directory, and return the file names. The way I'm attempting to do this is with a generator:
def gen_read(filename, chunk=1024*8):
    with open(filename, "rb") as f:
        for part in iter(lambda: f.read(chunk), b''):
            yield part
The goal is to take this generator, read the file in parts, and write each part into a temporary file until that file reaches 2.5 MB, then bump the number in the temporary file's extension so I end up with a sort of list of them. I'm trying to do so via this function:
API_TEMP_FILE_PATH = "/tmp"

def random_filename(length=10):
    s = ""
    acceptable = string.ascii_letters
    for _ in range(length):
        s += random.choice(acceptable)
    return s

def split_file(filename, bytes_limit=2621440):
    split_files = []
    file_ext_number = 1
    tmp_filename = random_filename(length=32)
    do_break = False
    while not do_break:
        file_path = "{}/{}_split_file.part_{}".format(API_TEMP_FILE_PATH, tmp_filename, file_ext_number)
        stream = gen_read(filename)
        for part in next(stream):
            if not part:
                do_break = True
            if os.path.exists(file_path):
                size = os.stat(file_path).st_size
                if size > bytes_limit:
                    file_ext_number += 1
            with open(file_path, 'wb') as dest:
                dest.write(part)
    searcher = re.compile('{}\_split\_file\.part\_\d(\d+)?'.format(tmp_filename))
    for filename in os.listdir(API_TEMP_FILE_PATH):
        if searcher.search(filename) is not None:
            split_files.append("{}/{}".format(API_TEMP_FILE_PATH, filename))
    return split_files
The issue I'm running into is that my generator is only producing 1 "character" at a time (can be seen by adding print(repr(part)) right underneath the for part):
...
'\x10'
'\x00'
'\x00'
'\x00'
'\x00'
'\x05'
'\x00'
'\x00'
'\x10'
...
On top of that, the written file's size never changes from 1. What am I doing wrong that keeps this file split function from working as expected?
I figured it out: for part in next(stream) iterates over the individual bytes of the first chunk rather than over the chunks the generator yields, and reopening the file with 'wb' truncates it on every write. Instead of using the generator, I just read a fixed amount from the file at a time:
def split_file(filename, bytes_limit=2621440):
    split_files = []
    file_ext_number = 1
    tmp_filename = random_filename(length=32)
    file_path = "{}/{}_split_file.part_".format(API_TEMP_FILE_PATH, tmp_filename)
    with open(filename, "rb") as source:
        byte = source.read(bytes_limit)
        while byte:
            open(file_path + "{}".format(file_ext_number), 'wb').write(byte)
            byte = source.read(bytes_limit)
            file_ext_number += 1
    searcher = re.compile('{}\_split\_file\.part\_\d(\d+)?'.format(tmp_filename))
    for filename in os.listdir(API_TEMP_FILE_PATH):
        if searcher.search(filename) is not None:
            split_files.append("{}/{}".format(API_TEMP_FILE_PATH, filename))
    return split_files
It produces all the correct files.
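For completeness, here is a sketch of how the generator approach from the question could also be made to work (my take, not the poster's): iterate over the generator itself rather than over its first chunk, and roll over to a new part file once the byte count crosses the limit. gen_read, random_filename and API_TEMP_FILE_PATH are the names defined in the question.

def split_file(filename, bytes_limit=2621440):
    split_files = []
    file_ext_number = 1
    tmp_filename = random_filename(length=32)
    dest = None
    written = 0
    for part in gen_read(filename):            # iterate the generator chunk by chunk
        if dest is None or written >= bytes_limit:
            if dest is not None:
                dest.close()
                file_ext_number += 1
            path = "{}/{}_split_file.part_{}".format(API_TEMP_FILE_PATH, tmp_filename, file_ext_number)
            dest = open(path, 'wb')            # opened once per part, so nothing gets truncated
            split_files.append(path)
            written = 0
        dest.write(part)
        written += len(part)
    if dest is not None:
        dest.close()
    return split_files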

Reading only x amount of bytes from a binary file

def streamView(request):
    # zip_path = './pages/data.zip'
    zip_path = './pages/data/test.txt'
    start_index = 0
    end_index = 50
    with open(zip_path, 'rb', 50) as fin:
        fin.seek(start_index)
        data = fin.read(end_index - start_index)
        response = HttpResponse(fin, content_type="application/zip")
        response["Content-Disposition"] = f"attachment; filename={zip_path}"
    return response
So I'm using the above code to read just the first 50 bytes of a file and return them. However, this just returns the entire file to the user.
Ideally, I would like to send start and end parameters to this function and then read only between those two points and send that data.
How can I achieve this?
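No answer is reproduced here, but a minimal sketch of the likely fix (assuming Django's HttpResponse, as the question implies) is to return the bytes you already read rather than the file object; passing fin hands the whole file to the response. The start/end query parameters below are illustrative names, not part of the original code.

from django.http import HttpResponse

def streamView(request):
    zip_path = './pages/data/test.txt'
    # hypothetical query parameters, e.g. /stream/?start=0&end=50
    start_index = int(request.GET.get('start', 0))
    end_index = int(request.GET.get('end', 50))
    with open(zip_path, 'rb') as fin:
        fin.seek(start_index)
        data = fin.read(end_index - start_index)   # only the requested slice
    response = HttpResponse(data, content_type="application/octet-stream")
    response["Content-Disposition"] = f"attachment; filename={zip_path}"
    return response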

To compare two pointer locations in Python

Code
import sys
import os

fp = open("/home/masi/r3.raw", "rb")
try:
    events = []
    while aBuf[:4] != b'\xFA\xFA\xFA\xFA':
        aBuf = fp.read(4)
        events.append(aBuf)
        if aBuf == os.SEEK_END:
            # pointer cannot be outside of file so minus 144
            fileEnding = aBuf[os.SEEK_END - 144 : os.SEEK_END]
except:
    print "File end at position : ", fp.tell()
    import traceback
    traceback.print_exc()
finally:
    fp.close()
where I know that the following is never true
if aBuf == os.SEEK_END:
    # pointer cannot be outside of file so minus 144
    fileEnding = aBuf[os.SEEK_END - 144 : os.SEEK_END]
I am comparing the file pointer with the end-of-file pointer, or at least that is what I expect, but it does not seem to be correct.
Improved Code from skrrgwasme and martineau's contributions
import sys
import os
import struct
import binascii

file_name = "/home/masi/r.raw"
file_size = os.path.getsize(file_name)
print "File size is : ", file_size
read_size = 4
read_count = 0
aBuf = b'\x00\x00\x00\x00' # don't forget to create your variables before you try to read from them
fileEnding = ""
fp = open(file_name, "rb")
try:
    aBuf = fp.read(read_size)
    read_count += read_size
    event_starts = []
    event_ends = []
    event_starts.append(read_count)
    while aBuf and read_count < file_size:
        if aBuf[:read_size] == b'\xFA\xFA\xFA\xFA':
            event_ends.append(read_count)
            if read_count + 1 < file_size: event_starts.append(read_count + 1)
        aBuf = fp.read(read_size)
        read_count += read_size
        print "RC ", read_count, ", remaining: ", 1.0-float(read_count)/file_size, "%"
        if read_count >= file_size: break
except:
    print "File end at position : ", fp.tell()
    import traceback
    traceback.print_exc()
finally:
    # store to partial index of postgres database: event pointers
    fp.close()
How can you compare the locations of two file pointers?
If you take a look at the Python source code for the os module, you'll see that os.SEEK_END isn't automatically set to the size of your file. It's just a constant that is set equal to the integer 2. It is intended to be used as a parameter for the lseek() function.
You need to get the file size in bytes first, then compare your file pointer to that. You can use os.path.getsize(path) to get your file size in bytes. Your comparison was never true because you were reading four bytes at a time, so your file pointer skipped from byte 0 to byte 4, passing over 2, which is the value of os.SEEK_END.
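As a small aside (my illustration, not part of the answer), os.SEEK_END is only useful as the whence argument to seek(), which is another way to find the end-of-file position to compare against:

import os

fp = open("/home/masi/r3.raw", "rb")   # path taken from the question
print(os.SEEK_END)                     # 2 -- just a constant "whence" value for seek()
fp.seek(0, os.SEEK_END)                # move the file pointer to the end of the file
end_position = fp.tell()               # equals os.path.getsize("/home/masi/r3.raw")
fp.seek(0)                             # rewind before reading
fp.close()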
Suggested code:
import sys
import os

file_name = "/home/masi/r3.raw"
file_size = os.path.getsize(file_name)
read_size = 4
read_count = 0

# you could use fp.tell() in the loop instead of manually incrementing
# your own count of the file position, but this will avoid a lot of
# extra fp.tell() calls in the loop

aBuf = b'\x00\x00\x00\x00' # don't forget to create your variables before you try to
                           # read from them

fp = open(file_name, "rb")
try:
    events = []
    while aBuf[:read_size] != b'\xFA\xFA\xFA\xFA':
        aBuf = fp.read(read_size)
        events.append(aBuf)
        read_count += read_size
        if read_count >= file_size:
            # pointer cannot be outside of file so minus 144
            fileEnding = aBuf[file_size - 144 : file_size]
            break
except:
    print "File end at position : ", fp.tell()
    import traceback
    traceback.print_exc()
finally:
    fp.close()
Notes:
Instead of comparing for exactly the file size you expect, I suggest using a greater-than-or-equal comparison (>=). Since you're reading four bytes at a time, an equality comparison will never be true if the file size isn't a multiple of four.
After you get this code working, I'd suggest taking it over to Code Review Stack Exchange. As martineau has helpfully pointed out in the comments, there are a number of issues and potential pitfalls in your code that are worth correcting.

Split large text file (around 50 GB) into multiple files

I would like to split a large text file, around 50 GB in size, into multiple files.
Data in the file looks like this [x = any integer between 0 and 9]:
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
...............
...............
There might be a few billion lines in the file, and I would like to write, for example, 30/40 million lines per file.
I guess the steps would be:
I have to open the file,
then using readline() read the file line by line and write it to a new file at the same time,
and as soon as it hits the maximum number of lines, create another file and
start writing again.
I'm wondering how to put all these steps together in a memory-efficient and fast way. I've seen some examples on Stack Overflow, but none of them does exactly what I need. I would really appreciate it if anyone could help me out.
This working solution uses the split command available in the shell. Since the author has already accepted the possibility of a non-Python solution, please do not downvote.
First, I created a test file with 1000M entries (15 GB) with
awk 'BEGIN{for (i = 0; i < 1000000000; i++) {print "123.123.123.123"} }' > t.txt
Then I used split:
split --lines=30000000 --numeric-suffixes --suffix-length=2 t.txt t
It took 5 min to produce a set of 34 small files with names t00-t33. 33 files are 458 MB each and the last t33 is 153 MB.
from itertools import chain, islice

def chunks(iterable, n):
    "chunks(ABCDE,2) => AB CD E"
    iterable = iter(iterable)
    while True:
        try:
            # store one line in memory,
            # chain it to an iterator on the rest of the chunk
            first = next(iterable)
        except StopIteration:
            # on Python 3.7+ a bare next() here would become a RuntimeError (PEP 479)
            return
        yield chain([first], islice(iterable, n-1))

l = 30*10**6
file_large = 'large_file.txt'
with open(file_large) as bigfile:
    for i, lines in enumerate(chunks(bigfile, l)):
        file_split = '{}.{}'.format(file_large, i)
        with open(file_split, 'w') as f:
            f.writelines(lines)
I would use the Unix utility split, if it is available to you and your only task is to split the file. Here is, however, a pure Python solution:
import contextlib

file_large = 'large_file.txt'
l = 30*10**6  # lines per split file

with contextlib.ExitStack() as stack:
    fd_in = stack.enter_context(open(file_large))
    for i, line in enumerate(fd_in):
        if not i % l:
            file_split = '{}.{}'.format(file_large, i//l)
            fd_out = stack.enter_context(open(file_split, 'w'))
        fd_out.write(line)  # lines read from a text file already end with '\n'
If all of your lines have 4 3-digit numbers on them and you have multiple cores available, then you can exploit file seek and run multiple processes.
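A minimal sketch of that seek-based idea (assumptions on my part: every line is exactly 16 bytes, i.e. "xxx.xxx.xxx.xxx" plus a newline, and the input/output names are placeholders, not from the original answer):

import os
from multiprocessing import Pool

LINE_WIDTH = 16               # bytes per fixed-width line, newline included
LINES_PER_PART = 30 * 10**6   # lines per output file
SRC = 'large_file.txt'        # hypothetical input name

def write_part(part_no):
    # each worker seeks straight to its own slice and copies it out
    offset = part_no * LINES_PER_PART * LINE_WIDTH
    with open(SRC, 'rb') as src, open('{}.{}'.format(SRC, part_no), 'wb') as dst:
        src.seek(offset)
        dst.write(src.read(LINES_PER_PART * LINE_WIDTH))

if __name__ == '__main__':
    part_bytes = LINES_PER_PART * LINE_WIDTH
    parts = -(-os.path.getsize(SRC) // part_bytes)   # ceiling division
    with Pool() as pool:
        pool.map(write_part, range(parts))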
This class may solve your problem.
I've tested it on both Linux and Windows, and it worked perfectly on both.
I've also tested it with binary and text files of different sizes each time, and it worked great.
Enjoy :)
import os
import math

class FileSpliter:
    # If file type is text then CHUNK_SIZE is count of chars
    # If file type is binary then CHUNK_SIZE is count of bytes
    def __init__(self, InputFile, FileType="b", CHUNK_SIZE=524288, OutFile="outFile"):
        self.CHUNK_SIZE = CHUNK_SIZE # byte or char
        self.InputFile = InputFile
        self.FileType = FileType # b: binary, t: text
        self.OutFile = OutFile
        self.FileSize = 0
        self.Parts = None
        self.CurrentPartNo = 0
        self.Progress = 0.0

    def Prepare(self):
        if not(os.path.isfile(self.InputFile) and os.path.getsize(self.InputFile) > 0):
            print("ERROR: The file does not exist or is empty!")
            return False
        self.FileSize = os.path.getsize(self.InputFile)
        if self.CHUNK_SIZE >= self.FileSize:
            self.Parts = 1
        else:
            self.Parts = math.ceil(self.FileSize / self.CHUNK_SIZE)
        return True

    def Split(self):
        if self.FileSize == 0 or self.Parts == None:
            print("ERROR: File is not prepared for split!")
            return False
        with open(self.InputFile, "r" + self.FileType) as f:
            while True:
                if self.FileType == "b":
                    buf = bytearray(f.read(self.CHUNK_SIZE))
                elif self.FileType == "t":
                    buf = f.read(self.CHUNK_SIZE)
                else:
                    print("ERROR: File type error!")
                    return False
                if not buf:
                    # we've read the entire file in, so we're done.
                    break
                of = self.OutFile + str(self.CurrentPartNo)
                outFile = open(of, "w" + self.FileType)
                outFile.write(buf)
                outFile.close()
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def Rebuild(self):
        self.CurrentPartNo = 0
        if self.Parts == None:
            return False
        with open(self.OutFile, "w" + self.FileType) as f:
            while self.CurrentPartNo < self.Parts:
                If = self.OutFile + str(self.CurrentPartNo)
                if not(os.path.isfile(If) and os.path.getsize(If) > 0):
                    print("ERROR: The file [" + If + "] does not exist or is empty!")
                    return False
                InputFile = open(If, "r" + self.FileType)
                buf = InputFile.read()
                if not buf:
                    # we've read the entire file in, so we're done.
                    break
                f.write(buf)
                InputFile.close()
                os.remove(If)
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def ProgressBar(self, BarLength=20, ProgressIcon="#", BarIcon="-"):
        try:
            # You can't have a progress bar with zero or negative length.
            if BarLength < 1:
                BarLength = 20
            # Use status variable for going to the next line after progress completion.
            Status = ""
            # Calculating progress between 0 and 1 for percentage.
            self.Progress = float(self.CurrentPartNo) / float(self.Parts)
            # Doing these conditions at final progressing.
            if self.Progress >= 1.:
                self.Progress = 1
                Status = "\r\n" # Going to the next line
            # Calculating how many places should be filled
            Block = int(round(BarLength * self.Progress))
            # Show this
            Bar = "\r[{}] {:.0f}% {}".format(ProgressIcon * Block + BarIcon * (BarLength - Block), round(self.Progress * 100, 0), Status)
            print(Bar, end="")
        except:
            print("\rERROR")

def main():
    fp = FileSpliter(InputFile="inFile", FileType="b") #, CHUNK_SIZE=300000)
    if fp.Prepare():
        # Splitting ...
        print("Splitting ...")
        sr = fp.Split()
        if sr == True:
            print("The file was split successfully.")
            print()
            # Rebuilding ...
            print("Rebuilding ...")
            rr = fp.Rebuild()
            if rr == True:
                print("The file was rebuilt successfully.")

if __name__ == "__main__":
    main()
I am writing a Python 3 solution which I usually use to split files whose size is in the MBs.
However, I have not yet tried it on files whose size is in the GBs.
TextFileSplitter.py
import traceback

#get a file name to be read
fileToRead = input("Enter file name : ")
# max lines you want to write in a single file
fileLineCount = 2000
lineCount = 0
fileCount = 1

try:
    print('Start splitting...')
    #read a file
    fileReader = open(fileToRead)
    line = fileReader.readline()
    fileWriter = open(str(fileCount)+".txt","a")
    while line != '':#empty is EOF
        if lineCount == 0:
            #create a file in append mode
            fileWriter = open(str(fileCount)+".txt","a")
            #increment file count, use it for new file name
            fileCount += 1
        #write a line (readline() already keeps the trailing newline)
        fileWriter.write(line)
        lineCount += 1
        if lineCount == fileLineCount:
            lineCount = 0
            fileWriter.close()
        #read a line
        line = fileReader.readline()
    fileWriter.close()
except Exception as e:
    #print the exception if any
    print(e.__traceback__)
    traceback.print_exc()
finally:
    #close the file reader
    fileReader.close()
The output will be a set of files, each having fileLineCount (i.e. 2000) lines, created in the same directory as:
1.txt
2.txt
3.txt
.
.
.
.
n.txt
