Code
import sys
import os
fp = open("/home/masi/r3.raw", "rb")
try:
    events = []
    while aBuf[:4] != b'\xFA\xFA\xFA\xFA':
        aBuf = fp.read(4)
        events.append(aBuf)

        if aBuf == os.SEEK_END:
            # pointer cannot be outside of file so minus 144
            fileEnding = aBuf[os.SEEK_END - 144 : os.SEEK_END]
except:
    print "File end at position : ", fp.tell()
    import traceback
    traceback.print_exc()
finally:
    fp.close()
where I know that the following is never true
if aBuf == os.SEEK_END:
    # pointer cannot be outside of file so minus 144
    fileEnding = aBuf[os.SEEK_END - 144 : os.SEEK_END]
I am comparing the pointer with the end pointer of the file, or at least that is what I expect, but it does not seem to be correct.
Improved Code from skrrgwasme and martineau's contributions
import sys
import os
import struct
import binascii

file_name = "/home/masi/r.raw"
file_size = os.path.getsize(file_name)
print "File size is : ", file_size

read_size = 4
read_count = 0
aBuf = b'\x00\x00\x00\x00'  # don't forget to create your variables before you try to read from them
fileEnding = ""

fp = open(file_name, "rb")
try:
    aBuf = fp.read(read_size)
    read_count += read_size

    event_starts = []
    event_ends = []
    event_starts.append(read_count)

    while aBuf and read_count < file_size:
        if aBuf[:read_size] == b'\xFA\xFA\xFA\xFA':
            event_ends.append(read_count)
            if read_count + 1 < file_size: event_starts.append(read_count + 1)
        aBuf = fp.read(read_size)
        read_count += read_size
        print "RC ", read_count, ", remaining: ", 1.0 - float(read_count)/file_size, "%"
        if read_count >= file_size: break
except:
    print "File end at position : ", fp.tell()
    import traceback
    traceback.print_exc()
finally:
    # store to partial index of postgres database: event pointers
    fp.close()
How can you compare the locations of two pointers?
If you take a look at the Python source code for the os module, you'll see that os.SEEK_END isn't automatically set to the size of your file. It's just a constant that is set equal to the integer 2. It is intended to be used as a parameter for the lseek() function.
You need to get the file size in bytes first, then compare your file pointer to that. You can use os.path.getsize(path) to get your file size in bytes. Your comparison was never true because you were reading four bytes at a time, so your file pointer skipped from byte 0 to byte 4, passing over 2, which is the value of os.SEEK_END.
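For illustration (this snippet is not part of the original answer), os.SEEK_END is only meaningful as the whence argument of a seek call; seeking to the end and then calling tell() is another way to obtain the file size:

import os

with open("/home/masi/r3.raw", "rb") as fp:   # same path as in the question
    print(os.SEEK_END)        # prints 2 - it is just a constant
    fp.seek(0, os.SEEK_END)   # move the file pointer to the end of the file
    print(fp.tell())          # the current offset, i.e. the file size in bytes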
Suggested code:
import sys
import os

file_name = "/home/masi/r3.raw"
file_size = os.path.getsize(file_name)
read_size = 4
read_count = 0

# you could use fp.tell() in the loop instead of manually incrementing
# your own count of the file position instead, but this will avoid a lot of
# extra fp.tell() calls in the loop

aBuf = b'\x00\x00\x00\x00'  # don't forget to create your variables before you try to
                            # read from them

fp = open(file_name, "rb")
try:
    events = []
    while aBuf[:read_size] != b'\xFA\xFA\xFA\xFA':
        aBuf = fp.read(read_size)
        events.append(aBuf)
        read_count += read_size

        if read_count >= file_size:
            # pointer cannot be outside of file so minus 144
            fileEnding = aBuf[file_size - 144 : file_size]
            break
except:
    print "File end at position : ", fp.tell()
    import traceback
    traceback.print_exc()
finally:
    fp.close()
Notes:
Instead of comparing for exactly the file size you expect, I suggest using a greater than or equal comparison (>=). Since you're reading four bytes at a time, an equality comparison will never be true if your file size isn't a multiple of four.
After you get this code working, I'd suggest taking it over to Code Review Stack Exchange. As martineau has helpfully pointed out in the comments, there are a number of issues and potential pitfalls in your code that are worth correcting.
Related
I'm a beginner in coding and am trying to build a script that takes a txt file as input, hashes it, and writes to another txt file containing "string:hashedstring" on each line. The code is working properly. The problem I am facing now is that if the input file is big, it will consume all the RAM and get killed. I tried to use chunks, but couldn't figure out how to use them with multiline input and output.
Any suggestions regarding parts of the code other than the main subject here are very welcome, since I am just starting out. Thanks.
import argparse
import hashlib
import os
import sys

def sofia_hash(msg):
    h = ""
    m = hashlib.md5()
    m.update(msg.encode('utf-8'))
    msg_md5 = m.digest()
    for i in range(8):
        n = (msg_md5[2*i] + msg_md5[2*i+1]) % 0x3e
        if n > 9:
            if n > 35:
                n += 61
            else:
                n += 55
        else:
            n += 0x30
        h += chr(n)
    return h

top_parser = argparse.ArgumentParser(description='Sofiamass')
top_parser.add_argument('input', action="store", type=argparse.FileType('r', encoding='utf8'), help="Set input file")
top_parser.add_argument('output', action="store", help="Set output file")
args = top_parser.parse_args()

sofiainput = args.input.read().splitlines()
a = 0

try:
    while a < len(sofiainput):
        target_sofiainput = sofiainput[a]
        etarget_sofiainput = (target_sofiainput).encode('utf-8')
        try:
            sofia_pass = sofia_hash(target_sofiainput)
            x = True
        except KeyboardInterrupt:
            print ("\n[---]exiting now[---]")
        if x == True:
            with open(args.output, 'a') as sofiaoutput:
                sofiaoutput.write(str(target_sofiainput) + ":" + str(sofia_pass) + "\n")
        elif x == False:
            print('error')
        a += 1
except KeyboardInterrupt:
    print ("\n[---]exiting now[---]")
except AttributeError:
    pass
When you open the file with the open command, it creates an object called a file handle. So, when you do:
with open('filepath.txt', 'r') as f:
    for line in f:
        print(line)
it only keeps the current line in RAM, thus achieving your objective of using as little RAM as possible.
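Applied to the script in the question, a rough sketch of that idea (reusing sofia_hash and the argparse arguments from the question, but opening the input by path instead of argparse.FileType; untested against the original data) streams the input line by line and writes each result immediately instead of loading everything with read().splitlines():

import argparse

# sofia_hash() is assumed to be the same function defined in the question above
top_parser = argparse.ArgumentParser(description='Sofiamass')
top_parser.add_argument('input', action="store", help="Set input file")
top_parser.add_argument('output', action="store", help="Set output file")
args = top_parser.parse_args()

with open(args.input, encoding='utf8') as infile, open(args.output, 'a') as outfile:
    for line in infile:               # one line at a time, not the whole file
        word = line.rstrip('\n')      # drop the trailing newline
        outfile.write(word + ":" + sofia_hash(word) + "\n")

Because only one line is held in memory at a time, the RAM usage stays flat regardless of the input file size.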
I have this test code which does the following:
Write a test message to a file > Barrier > Read the test message > Assert equal > Repeat.
from __future__ import print_function
import os
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
loop = True

def main():
    global loop
    txt_write = 'buhahaha'

    with open('test', 'w') as f1:
        if rank == 0:
            f1.write(txt_write)
        f1.flush()
        os.fsync(f1.fileno())

    comm.barrier()

    with open('test') as f2:
        txt_read = f2.read()

    try:
        assert txt_read == txt_write
    except:
        print("Assertion error", txt_read, "!=", txt_write, 'rank=', rank)
        loop = False
    finally:
        comm.barrier()
        if rank == 0:
            os.remove('test')

if __name__ == '__main__':
    i = 0
    while loop:
        main()
        if i % 1000 == 0 and rank == 0:
            print("Iterations:", i)
        i += 1
It works for a few hundred or thousand iterations, but at some point it reads an empty file and the assertion fails. Other answers had recommended using flush and os.fsync, but that does not seem to help; it just makes the execution slower. Any idea how to fix this?
Maybe you can try something like this, instead:
if rank == 0:
    with open('test', 'w') as f1:
        f1.write(txt_write)
    # as #jschultz410 correctly pointed out,
    # we remove f1.flush() and f1.close()

comm.barrier()

with open('test') as f2:
    txt_read = f2.read()
The code resulted in a race condition where all processes were opening the same file simultaneously. Thanks to #jschultz410 and #mko for identifying this logical error.
My solution for the code was to use a memory stream instead of a real file. Now the open, write and read parts of the code become:
from io import StringIO

f1 = StringIO()

if rank == 0:
    f1.write(txt_write)
    f1.flush()

comm.barrier()

txt_read = f1.getvalue()
My error is that unpack requires a string argument of length 1, but the script does not produce such an argument once it reaches the end of the file. How do I read all the way to the end of the file, converting the binary data to int data, without that error popping up?
ecgSS = []
ecgFB = []

try:
    print("Beginning snipping of ECG data from the holter file...")

    #Get size of file in bytes
    file_size = os.path.getsize(args.filename)

    #Read holter file into memory
    holter = open(args.filename, 'rb')

    ecgCount = 0
    while ecgCount <= file_size:
        packetID = struct.unpack('B', holter.read(1))[0]
        packetSS = struct.unpack('H', holter.read(2))[0]
        packetFB = struct.unpack('H', holter.read(2))[0]
        if(packetID == 0):
            ecgCount += 1
            ecgSS.append(packetSS)
            ecgFB.append(packetFB)

    #Close the file stream
    holter.close()
You have to make sure that the file has enough data before reading. On each iteration of the while loop you read 5 bytes, so ecgCount should count 5-byte records and be advanced after every read, not only when packetID == 0; otherwise you keep reading past the end of the file.
A simple fix is to change the loop condition to
while ecgCount < file_size/5:
With that fix you also need two counters: one for the number of records read from the file and one for the valid records. As far as I can see, you only keep data with packetID == 0, which is a kind of validation, so that needs its own counter. Let's call it validCount; your program will then look like:
ecgSS = []
ecgFB = []

try:
    print("Beginning snipping of ECG data from the holter file...")

    #Get size of file in bytes
    file_size = os.path.getsize(args.filename)

    #Read holter file into memory
    holter = open(args.filename, 'rb')

    ecgCount = 0
    validCount = 0
    while ecgCount < file_size/5:
        packetID = struct.unpack('B', holter.read(1))[0]
        packetSS = struct.unpack('H', holter.read(2))[0]
        packetFB = struct.unpack('H', holter.read(2))[0]
        ecgCount += 1
        if(packetID == 0):
            validCount += 1
            ecgSS.append(packetSS)
            ecgFB.append(packetFB)

    #Close the file stream
    holter.close()
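As a further illustration (not part of the original answer), the whole 5-byte packet can be read in one call and unpacked with a single format string; checking the length of what was read also guards against the original "unpack requires a string argument" error on a truncated file. The field layout, the little-endian byte order, and the file path are assumptions here:

import struct

RECORD = struct.Struct('<BHH')   # packetID (1 byte), packetSS and packetFB (2 bytes each); byte order assumed

file_name = "holter.dat"         # hypothetical path; the question uses args.filename
ecgSS = []
ecgFB = []
with open(file_name, 'rb') as holter:
    while True:
        data = holter.read(RECORD.size)    # read one whole 5-byte packet
        if len(data) < RECORD.size:        # end of file or truncated record
            break
        packetID, packetSS, packetFB = RECORD.unpack(data)
        if packetID == 0:
            ecgSS.append(packetSS)
            ecgFB.append(packetFB)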
The error occurs at line 49 "fileSizeRemainingInBytes = os.path.getsize(inFile)"
inFile is the file whose size I want to get. From what I understood from the Python documentation, this should be correct. Can someone tell me what the problem is?
import sys, os

buffer = 1000

try:
    #open file in binary mode for reading
    inFile = open(sys.argv[1],"rb")
    print "file name is: ", inFile.name
except IOError:
    #check for IOExceptions
    print "Eror opening file"
    sys.exit()
else:
    #create new directory for copying, create out file in new directory
    if (os.path.isdir("recv")):
        os.chdir("recv")
        try:
            outFile = open(inFile.name,"wb")
        except IOError:
            print "something went wrong creating the out file"
            sys.exit()
    else :
        os.mkdir("recv")
        os.chdir("recv")
        try:
            outFile = open(inFile.name,"wb")
        except IOError:
            print "something went wrong creating the out file"
            sys.exit()

#loop to copy bytes to new directory
fileSizeRemainingInBytes = os.path.getsize(inFile)
print "Initial size: ", fileSizeRemainingInBytes

while fileSizeRemainingInBytes > 0 :
    print fileSizeRemainingInBytes
    bytesToCopy = inFile.read(buffer);
    outFile.write(bytesToCopy);

inFile.close()
os.path.getsize takes a file path as an argument, not a file object. So you actually want to call os.path.getsize(inFile.name). Note that this won't give you the number of bytes remaining to copy; it'll just give you the size of the whole file every time it's evaluated. To get the number of bytes remaining, you'll have to keep track of the total number of bytes read and subtract this total from the file size.
Something like this should work:
import sys
import os

buffer = 1000

with open(sys.argv[1], "rb") as in_file:

    # Make your `recv` directory as a sub-directory
    # of your current directory if it doesn't already exist
    if not os.path.isdir("recv"):
        os.mkdir("recv")

    # Create the path to the file to which you
    # want to copy. When opened, you'll have a file
    # with the same file name as your input file,
    # but it will be in your `recv` subdirectory
    out_file_path = os.path.join("recv", in_file.name)

    # Read the bytes
    with open(out_file_path, "wb") as out_file:
        bytes_read = 0
        bytes_to_read = os.path.getsize(in_file.name)
        while bytes_read < bytes_to_read:
            out_file.write(in_file.read(buffer))
            bytes_read += min(buffer, bytes_to_read - bytes_read)
            print "{} / {} bytes copied".format(bytes_read, bytes_to_read)
I would like to split a large text file (around 50 GB) into multiple files.
Data in the file looks like this [x = any integer between 0-9]:
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
...............
...............
There might be a few billion lines in the file, and I would like to write, for example, 30-40 million lines per file.
I guess the steps would be:
I have to open the file,
then, using readline(), read the file line by line and write to a new file at the same time,
and as soon as it hits the maximum number of lines, create another file and start writing again.
I'm wondering how to put all these steps together in a memory-efficient and fast way. I've seen some examples on Stack Overflow, but none of them does exactly what I need. I would really appreciate it if anyone could help me out.
This working solution uses the split command available in the shell. Since the author has already accepted the possibility of a non-Python solution, please do not downvote.
First, I created a test file with 1000M entries (15 GB) with
awk 'BEGIN{for (i = 0; i < 1000000000; i++) {print "123.123.123.123"} }' > t.txt
Then I used split:
split --lines=30000000 --numeric-suffixes --suffix-length=2 t.txt t
It took 5 min to produce a set of 34 small files with names t00-t33. 33 files are 458 MB each and the last t33 is 153 MB.
from itertools import chain, islice

def chunks(iterable, n):
    "chunks(ABCDE,2) => AB CD E"
    iterable = iter(iterable)
    while True:
        # store one line in memory,
        # chain it to an iterator on the rest of the chunk
        yield chain([next(iterable)], islice(iterable, n-1))

l = 30*10**6
file_large = 'large_file.txt'
with open(file_large) as bigfile:
    for i, lines in enumerate(chunks(bigfile, l)):
        file_split = '{}.{}'.format(file_large, i)
        with open(file_split, 'w') as f:
            f.writelines(lines)
I would use the Unix utility split, if it is available to you and your only task is to split the file. Here, however, is a pure Python solution:
import contextlib

file_large = 'large_file.txt'
l = 30*10**6  # lines per split file
with contextlib.ExitStack() as stack:
    fd_in = stack.enter_context(open(file_large))
    for i, line in enumerate(fd_in):
        if not i % l:
            file_split = '{}.{}'.format(file_large, i//l)
            fd_out = stack.enter_context(open(file_split, 'w'))
        fd_out.write(line)  # line already ends with a newline
If all of your lines have 4 3-digit numbers on them and you have multiple cores available, then you can exploit file seek and run multiple processes.
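A minimal sketch of that idea (not from the original answer; it assumes every line is exactly 16 bytes, i.e. "xxx.xxx.xxx.xxx" plus a newline, and the file name is hypothetical):

import os
from multiprocessing import Pool

FILE_NAME = 'large_file.txt'      # hypothetical input path
RECORD_SIZE = 16                  # 15 characters per address plus the newline
LINES_PER_PART = 30 * 10**6
PART_BYTES = LINES_PER_PART * RECORD_SIZE

def write_part(part_no):
    # Each worker seeks directly to its slice of the input and copies it out.
    with open(FILE_NAME, 'rb') as src, open('{}.{}'.format(FILE_NAME, part_no), 'wb') as dst:
        src.seek(part_no * PART_BYTES)
        remaining = PART_BYTES
        while remaining > 0:
            chunk = src.read(min(remaining, 1 << 20))  # copy in 1 MiB pieces
            if not chunk:
                break  # the last part may be shorter than PART_BYTES
            dst.write(chunk)
            remaining -= len(chunk)

if __name__ == '__main__':
    parts = -(-os.path.getsize(FILE_NAME) // PART_BYTES)  # ceiling division
    with Pool() as pool:
        pool.map(write_part, range(parts))

Whether this is actually faster than a single sequential pass depends heavily on the storage; on a spinning disk the extra seeks may hurt more than the parallelism helps.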
This class may solve your problem.
I've tested it on both the Linux and Windows operating systems, and it worked perfectly on both of them.
I've also tested binary and text files of different sizes each time, and it worked great.
Enjoy :)
import os
import math

class FileSpliter:
    # If file type is text then CHUNK_SIZE is count of chars
    # If file type is binary then CHUNK_SIZE is count of bytes
    def __init__(self, InputFile, FileType="b", CHUNK_SIZE=524288, OutFile="outFile"):
        self.CHUNK_SIZE = CHUNK_SIZE  # byte or char
        self.InputFile = InputFile
        self.FileType = FileType  # b: binary, t: text
        self.OutFile = OutFile
        self.FileSize = 0
        self.Parts = None
        self.CurrentPartNo = 0
        self.Progress = 0.0

    def Prepare(self):
        if not(os.path.isfile(self.InputFile) and os.path.getsize(self.InputFile) > 0):
            print("ERROR: The file does not exist or is empty!")
            return False
        self.FileSize = os.path.getsize(self.InputFile)
        if self.CHUNK_SIZE >= self.FileSize:
            self.Parts = 1
        else:
            self.Parts = math.ceil(self.FileSize / self.CHUNK_SIZE)
        return True

    def Split(self):
        if self.FileSize == 0 or self.Parts == None:
            print("ERROR: File is not prepared for split!")
            return False
        with open(self.InputFile, "r" + self.FileType) as f:
            while True:
                if self.FileType == "b":
                    buf = bytearray(f.read(self.CHUNK_SIZE))
                elif self.FileType == "t":
                    buf = f.read(self.CHUNK_SIZE)
                else:
                    print("ERROR: File type error!")
                    return False
                if not buf:
                    # we've read the entire file in, so we're done.
                    break
                of = self.OutFile + str(self.CurrentPartNo)
                outFile = open(of, "w" + self.FileType)
                outFile.write(buf)
                outFile.close()
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def Rebuild(self):
        self.CurrentPartNo = 0
        if self.Parts == None:
            return False
        with open(self.OutFile, "w" + self.FileType) as f:
            while self.CurrentPartNo < self.Parts:
                If = self.OutFile + str(self.CurrentPartNo)
                if not(os.path.isfile(If) and os.path.getsize(If) > 0):
                    print("ERROR: The file [" + If + "] does not exist or is empty!")
                    return False
                InputFile = open(If, "r" + self.FileType)
                buf = InputFile.read()
                if not buf:
                    # we've read the entire file in, so we're done.
                    break
                f.write(buf)
                InputFile.close()
                os.remove(If)
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def ProgressBar(self, BarLength=20, ProgressIcon="#", BarIcon="-"):
        try:
            # You can't have a progress bar with zero or negative length.
            if BarLength < 1:
                BarLength = 20
            # Use status variable for going to the next line after progress completion.
            Status = ""
            # Calculating progress between 0 and 1 for percentage.
            self.Progress = float(self.CurrentPartNo) / float(self.Parts)
            # Handle the final progress update.
            if self.Progress >= 1.:
                self.Progress = 1
                Status = "\r\n"  # Going to the next line
            # Calculating how many places should be filled
            Block = int(round(BarLength * self.Progress))
            # Show this
            Bar = "\r[{}] {:.0f}% {}".format(ProgressIcon * Block + BarIcon * (BarLength - Block), round(self.Progress * 100, 0), Status)
            print(Bar, end="")
        except:
            print("\rERROR")

def main():
    fp = FileSpliter(InputFile="inFile", FileType="b")  # , CHUNK_SIZE=300000)
    if fp.Prepare():
        # Splitting ...
        print("Splitting ...")
        sr = fp.Split()
        if sr == True:
            print("The file was split successfully.")
        print()
        # Rebuilding ...
        print("Rebuilding ...")
        rr = fp.Rebuild()
        if rr == True:
            print("The file was rebuilt successfully.")

if __name__ == "__main__":
    main()
Here is a Python 3 solution which I usually use to split files whose size is in the MB range.
However, I have not yet tried it on files whose size is in the GB range.
TextFileSplitter.py
import traceback

#get a file name to be read
fileToRead = input("Enter file name : ")

# max lines you want to write in a single file
fileLineCount = 2000
lineCount = 0
fileCount = 1

try:
    print('Start splitting...')
    #read a file
    fileReader = open(fileToRead)
    line = fileReader.readline()
    fileWriter = open(str(fileCount)+".txt","a")
    while line != '':  #empty is EOF
        if lineCount == 0:
            #create a file in append mode
            fileWriter = open(str(fileCount)+".txt","a")
            #increment file count, use it for new file name
            fileCount += 1
        #write a line (readline() keeps the trailing newline)
        fileWriter.write(line)
        lineCount += 1
        if lineCount == fileLineCount:
            lineCount = 0
            fileWriter.close()
        #read a line
        line = fileReader.readline()
    fileWriter.close()
except Exception as e:
    #print the exception if any
    print(e.__traceback__)
    traceback.print_exc()
finally:
    #close the file reader
    fileReader.close()
The output will look like the following: files, each having fileLineCount (i.e. 2000) lines, created in the same directory:
1.txt
2.txt
3.txt
.
.
.
.
n.txt