MPI barrier not blocking file write, flush and os.fsync - python

I have this test code which does the following:
Write a test message to a file > Barrier > Read the test message > Assert equal > Repeat.
from __future__ import print_function
import os
from mpi4py import MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
loop = True
def main():
global loop
txt_write = 'buhahaha'
with open('test', 'w') as f1:
if rank == 0:
f1.write(txt_write)
f1.flush()
os.fsync(f1.fileno())
comm.barrier()
with open('test') as f2:
txt_read = f2.read()
try:
assert txt_read == txt_write
except:
print("Assertion error", txt_read, "!=", txt_write, 'rank=', rank)
loop = False
finally:
comm.barrier()
if rank == 0:
os.remove('test')
if __name__ == '__main__':
i = 0
while loop:
main()
if i % 1000 == 0 and rank == 0:
print("Iterations:", i)
i += 1
It works for a few 100 or 1000 iterations, but then at one point it reads an empty file and the assertion fails. Other answers had recommended use of flush and os.fsync, but that does not seem to help - it just makes the execution slower. Any idea how to fix this?

Maybe you can try something like this, instead:
if rank == 0:
with open('test', 'w') as f1:
f1.write(txt_write)
# as #jschultz410 correctly pointed out,
# we remove f1.flush() and f1.close()
comm.barrier()
with open('test') as f2:
txt_read = f2.read()

The code resulted in a race condition where all processes were opening the same file simultaneously. Thanks to #jschultz410 and #mko for identifying this logical error.
My solution for the code was to use a memory stream instead of a real file. Now, the open, write and read parts of the code becomes:
from io import StringIO
f1 = StringIO()
if rank == 0:
f1.write(txt_write)
f1.flush()
comm.barrier()
txt_read = f1.getvalue()

Related

Hashing wordlist with big input output files in Python 3.8

I'm a beginner in coding and am trying to build a script that takes a txt file as an input, hash it and output to another txt file containing "string:hashedstring" in each line of it. The code is working properly. The problem I am facing now is that if the input file is big, it will consume all RAM and kill it. I tried to use chunks, but couldn't figure out how to use it with multiline input and output.
Any suggestions regarding other parts of the code other than the main subject here is very welcome, since I am just starting on this. Thanks.
import argparse
import hashlib
import os
import sys
def sofia_hash(msg):
h = ""
m = hashlib.md5()
m.update(msg.encode('utf-8'))
msg_md5 = m.digest()
for i in range(8):
n = (msg_md5[2*i] + msg_md5[2*i+1]) % 0x3e
if n > 9:
if n > 35:
n += 61
else:
n += 55
else:
n += 0x30
h += chr(n)
return h
top_parser = argparse.ArgumentParser(description='Sofiamass')
top_parser.add_argument('input', action="store", type=argparse.FileType('r', encoding='utf8'), help="Set input file")
top_parser.add_argument('output', action="store", help="Set output file")
args = top_parser.parse_args()
sofiainput = args.input.read().splitlines()
a = 0
try:
while a < len(sofiainput):
target_sofiainput = sofiainput[a]
etarget_sofiainput = (target_sofiainput).encode('utf-8')
try:
sofia_pass = sofia_hash(target_sofiainput)
x = True
except KeyboardInterrupt:
print ("\n[---]exiting now[---]")
if x == True:
with open(args.output, 'a') as sofiaoutput:
sofiaoutput.write(str(target_sofiainput) + ":" + str(sofia_pass) + "\n")
elif x == False:
print('error')
a += 1
except KeyboardInterrupt:
print ("\n[---]exiting now[---]")
except AttributeError:
pass
When you open the file with the open command, it creates a object called file handler. So, when you do:
with open('filepath.txt', 'r') as f:
for line in f:
print(line)
it only keeps the current line you are using in the RAM, thus achieving your objective to use as little as RAM as possible.

Python: Writing to file using for loop

Using this Python code I get printed lines of file in UPPERCASE but file remains unchanged (lowercase.)
def open_f():
while True:
fname=raw_input("Enter filename:")
if fname != "done":
try:
fhand=open(fname, "r+")
break
except:
print "WRONG!!!"
continue
else: exit()
return fhand
fhand=open_f()
for line in fhand:
ss=line.upper().strip()
print ss
fhand.write(ss)
fhand.close()
Can you suggest please why files remain unaffected?
Code:
def file_reader(read_from_file):
with open(read_from_file, 'r') as f:
return f.read()
def file_writer(read_from_file, write_to_file):
with open(write_to_file, 'w') as f:
f.write(file_reader(read_from_file))
Usage:
Create a file named example.txt with the following content:
Hi my name is Dmitrii Gangan.
Create an empty file called file_to_be_written_to.txt
Add this as the last line file_writer("example.txt", "file_to_be_written_to.txt") of your .py python file.
python <your_python_script.py> from the terminal.
NOTE: They all must be in the same folder.
Result:
file_to_be_written_to.txt:
Hi my name is Dmitrii Gangan.
This program should do as you requested and allows for modifying the file as it is being read. Each line is read, converted to uppercase, and then written back to the source file. Since it runs on a line-by-line basis, the most extra memory it should need would be related to the length of the longest line.
Example 1
def main():
with get_file('Enter filename: ') as file:
while True:
position = file.tell() # remember beginning of line
line = file.readline() # get the next available line
if not line: # check if at end of the file
break # program is finished at EOF
file.seek(position) # go back to the line's start
file.write(line.upper()) # write the line in uppercase
def get_file(prompt):
while True:
try: # run and catch any error
return open(input(prompt), 'r+t') # r+t = read, write, text
except EOFError: # see if user if finished
raise SystemExit() # exit the program if so
except OSError as error: # check for file problems
print(error) # report operation errors
if __name__ == '__main__':
main()
The following is similar to what you see up above but works in binary mode instead of text mode. Instead of operating on lines, it processes the file in chunks based on the given BUFFER_SIZE and can operate more efficiently. The code under the main loop may replace the code in the loop if you wish for the program to check that it is operating correctly. The assert statements check some assumptions.
Example 2
BUFFER_SIZE = 1 << 20
def main():
with get_file('Enter filename: ') as file:
while True:
position = file.tell()
buffer = file.read(BUFFER_SIZE)
if not buffer:
return
file.seek(position)
file.write(buffer.upper())
# The following code will not run but can replace the code in the loop.
start = file.tell()
buffer = file.read(BUFFER_SIZE)
if not buffer:
return
stop = file.tell()
assert file.seek(start) == start
assert file.write(buffer.upper()) == len(buffer)
assert file.tell() == stop
def get_file(prompt):
while True:
try:
return open(input(prompt), 'r+b')
except EOFError:
raise SystemExit()
except OSError as error:
print(error)
if __name__ == '__main__':
main()
I suggest the following approach:
1) Read/close the file, return the filename and content
2) Create a new file with above filename, and content with UPPERCASE
def open_f():
while True:
fname=raw_input("Enter filename:")
if fname != "done":
try:
with open(fname, "r+") as fhand:
ss = fhand.read()
break
except:
print "WRONG!!!"
continue
else: exit()
return fname, ss
fname, ss =open_f()
with open(fname, "w+") as fhand:
fhand.write(ss.upper())
Like already alluded to in comments, you cannot successively read from and write to the same file -- the first write will truncate the file, so you cannot read anything more from the handle at that point.
Fortunately, the fileinput module offers a convenient inplace mode which works exactly like you want.
import fileinput
for line in fileinput.input(somefilename, inplace=True):
print(line.upper().strip())

To compare two pointer locations in Python

Code
import sys
import os
fp = open("/home/masi/r3.raw", "rb")
try:
events = []
while aBuf[:4] != b'\xFA\xFA\xFA\xFA':
aBuf = fp.read(4)
events.append(aBuf)
if aBuf == os.SEEK_END:
# pointer cannot be outside of file so minus 144
fileEnding = aBuf[os.SEEK_END - 144 : os.SEEK_END]
except:
print "File end at position : ", fp.tell()
import traceback
traceback.print_exc()
finally:
fp.close()
where I know that the following is never true
if aBuf == os.SEEK_END:
# pointer cannot be outside of file so minus 144
fileEnding = aBuf[os.SEEK_END - 144 : os.SEEK_END]
I am comparing the pointer with the end pointer of the file, at least I am expecting so but it does not seem to correct.
Improved Code from skrrgwasme and martineau's contributions
import sys
import os
import struct
import binascii
file_name = "/home/masi/r.raw"
file_size = os.path.getsize(file_name)
print "File size is : ", file_size
read_size = 4
read_count = 0
aBuf = b'\x00\x00\x00\x00' # don't forget to create your variables before you try to read from them
fileEnding = ""
fp = open(file_name, "rb")
try:
aBuf = fp.read(read_size)
read_count += read_size
event_starts = []
event_ends = []
event_starts.append(read_count)
while aBuf and read_count < file_size:
if aBuf[:read_size] == b'\xFA\xFA\xFA\xFA':
event_ends.append(read_count)
if read_count + 1 < file_size: event_starts.append(read_count + 1)
aBuf = fp.read(read_size)
read_count += read_size
print "RC ", read_count, ", remaining: ", 1.0-float(read_count)/file_size, "%"
if read_count >= file_size: break
except:
print "File end at position : ", fp.tell()
import traceback
traceback.print_exc()
finally:
# store to partial index of postgres database: event pointers
fp.close()
How can you compare location of two pointers?
If you take a look at the Python source code for the os module, you'll see that os.SEEK_END isn't automatically set to the size of your file. It's just a constant that is set equal to the integer 2. It is intended to be used as a parameter for the lseek() function.
You need to get the file size in bytes first, then compare your file pointer to that. You can use os.path.getsize(path) to get your file size in bytes. Your comparison was never true because you were reading four bytes at a time, so your file pointer skipped from byte 0 to byte 4, passing over 2, which is the value of os.SEEK_END.
Suggested code:
import sys
import os
file_name = "/home/masi/r3.raw"
file_size = os.path.getsize(file_name)
read_size = 4
read_count = 0
# you could use fp.tell() in the loop instead of manually incrementing
# your own count of the file position instead, but this will avoid a lot of
# extra fp.tell() calls in the loop
aBuf = b'\x00\x00\x00\x00' # don't forget to create your variables before you try to
# read from them
fp = open(file_name, "rb")
try:
events = []
while aBuf[:read_size] != b'\xFA\xFA\xFA\xFA':
aBuf = fp.read(read_size)
events.append(aBuf)
read_count += read_size
if read_count >= file_size:
# pointer cannot be outside of file so minus 144
fileEnding = aBuf[file_size - 144 : file_size]
break
except:
print "File end at position : ", fp.tell()
import traceback
traceback.print_exc()
finally:
fp.close()
Notes:
Instead of comparing for exactly the file size you expect, I suggest using a greater than or equal comparison (>=). Since you're reading four bytes at a time, if you have an odd file size, your comparison will never be true.
After you get this code working, I'd suggest taking it over to Code Review Stack Exchange. As martineau has helpfully pointed out in the comments, there are a number of issues and potential pitfalls in your code that are worth correcting.

Split large text file(around 50GB) into multiple files

I would like to split a large text file around size of 50GB into multiple files.
Data in the files are like this-[x= any integer between 0-9]
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
...............
...............
There might be few billions of lines in the file and i would like write for example 30/40 millions per file.
I guess the steps would be-
I've to open the file
then using readline() have to read the file line by line and write at the same time to a new file
and as soon as it hits the maximum number of lines it will create another file and
starts writing again.
I'm wondering, how to put all these steps together in a memory efficient and faster way. I've seen some examples in stack but none of them totally helping what i exactly need. I would really appreciate if anyone could help me out.
This working solution uses split command available in shell. Since the author has already accepted a possibility of a non-python solution, please do not downvote.
First, I created a test file with 1000M entries (15 GB) with
awk 'BEGIN{for (i = 0; i < 1000000000; i++) {print "123.123.123.123"} }' > t.txt
Then I used split:
split --lines=30000000 --numeric-suffixes --suffix-length=2 t.txt t
It took 5 min to produce a set of 34 small files with names t00-t33. 33 files are 458 MB each and the last t33 is 153 MB.
from itertools import chain, islice
def chunks(iterable, n):
"chunks(ABCDE,2) => AB CD E"
iterable = iter(iterable)
while True:
# store one line in memory,
# chain it to an iterator on the rest of the chunk
yield chain([next(iterable)], islice(iterable, n-1))
l = 30*10**6
file_large = 'large_file.txt'
with open(file_large) as bigfile:
for i, lines in enumerate(chunks(bigfile, l)):
file_split = '{}.{}'.format(file_large, i)
with open(file_split, 'w') as f:
f.writelines(lines)
I would use the Unix utility split, if it is available to you and your only task is to split the file. Here is however a pure Python solution:
import contextlib
file_large = 'large_file.txt'
l = 30*10**6 # lines per split file
with contextlib.ExitStack() as stack:
fd_in = stack.enter_context(open(file_large))
for i, line in enumerate(fd_in):
if not i % l:
file_split = '{}.{}'.format(file_large, i//l)
fd_out = stack.enter_context(open(file_split, 'w'))
fd_out.write('{}\n'.format(line))
If all of your lines have 4 3-digit numbers on them and you have multiple cores available, then you can exploit file seek and run multiple processes.
This class may solve your problem.
I've tested it on Linux and Windows operating system, and it's worked perfectly on both of them.
Also, I've tested binary and text file with different sizes each time and it was great.
Enjoy :)
import os
import math
class FileSpliter:
# If file type is text then CHUNK_SIZE is count of chars
# If file type is binary then CHUNK_SIZE is count of bytes
def __init__(self, InputFile, FileType="b", CHUNK_SIZE=524288, OutFile="outFile"):
self.CHUNK_SIZE = CHUNK_SIZE # byte or char
self.InputFile = InputFile
self.FileType = FileType # b: binary, t: text
self.OutFile = OutFile
self.FileSize = 0
self.Parts = None
self.CurrentPartNo = 0
self.Progress = 0.0
def Prepare(self):
if not(os.path.isfile(self.InputFile) and os.path.getsize(self.InputFile) > 0):
print("ERROR: The file is not exists or empty!")
return False
self.FileSize = os.path.getsize(self.InputFile)
if self.CHUNK_SIZE >= self.FileSize:
self.Parts = 1
else:
self.Parts = math.ceil(self.FileSize / self.CHUNK_SIZE)
return True
def Split(self):
if self.FileSize == 0 or self.Parts == None:
print("ERROR: File is not prepared for split!")
return False
with open(self.InputFile, "r" + self.FileType) as f:
while True:
if self.FileType == "b":
buf = bytearray(f.read(self.CHUNK_SIZE))
elif self.FileType == "t":
buf = f.read(self.CHUNK_SIZE)
else:
print("ERROR: File type error!")
if not buf:
# we've read the entire file in, so we're done.
break
of = self.OutFile + str(self.CurrentPartNo)
outFile = open(of, "w" + self.FileType)
outFile.write(buf)
outFile.close()
self.CurrentPartNo += 1
self.ProgressBar()
return True
def Rebuild(self):
self.CurrentPartNo = 0
if self.Parts == None:
return False
with open(self.OutFile, "w" + self.FileType) as f:
while self.CurrentPartNo < self.Parts:
If = self.OutFile + str(self.CurrentPartNo)
if not(os.path.isfile(If) and os.path.getsize(If) > 0):
print("ERROR: The file [" + If + "] is not exists or empty!")
return False
InputFile = open(If, "r" + self.FileType)
buf = InputFile.read()
if not buf:
# we've read the entire file in, so we're done.
break
f.write(buf)
InputFile.close()
os.remove(If)
self.CurrentPartNo += 1
self.ProgressBar()
return True
def ProgressBar(self, BarLength=20, ProgressIcon="#", BarIcon="-"):
try:
# You can't have a progress bar with zero or negative length.
if BarLength <1:
BarLength = 20
# Use status variable for going to the next line after progress completion.
Status = ""
# Calcuting progress between 0 and 1 for percentage.
self.Progress = float(self.CurrentPartNo) / float(self.Parts)
# Doing this conditions at final progressing.
if self.Progress >= 1.:
self.Progress = 1
Status = "\r\n" # Going to the next line
# Calculating how many places should be filled
Block = int(round(BarLength * self.Progress))
# Show this
Bar = "\r[{}] {:.0f}% {}".format(ProgressIcon * Block + BarIcon * (BarLength - Block), round(self.Progress * 100, 0), Status)
print(Bar, end="")
except:
print("\rERROR")
def main():
fp = FileSpliter(InputFile="inFile", FileType="b") #, CHUNK_SIZE=300000)
if fp.Prepare():
# Spliting ...
print("Spliting ...")
sr = fp.Split()
if sr == True:
print("The file splited successfully.")
print()
# Rebuilding ...
print("Rebuilding ...")
rr = fp.Rebuild()
if rr == True:
print("The file rebuilded successfully.")
if __name__ == "__main__":
main()
I am writing a Python3 code solution which I usually use to split files having size in MBs.
However, I have not yet tried for files having size in GBs.
TextFileSplitter.py
import traceback
#get a file name to be read
fileToRead = input("Enter file name : ")
# max lines you want to write in a single file
fileLineCount = 2000
lineCount = 0
fileCount = 1
try:
print('Start splitting...')
#read a file
fileReader = open(fileToRead)
line = fileReader.readline()
fileWriter = open(str(fileCount)+".txt","a")
while line != '':#empty is EOF
if lineCount == 0:
#create a file in append mode
fileWriter = open(str(fileCount)+".txt","a")
#increment file count, use it for new file name
fileCount += 1
#write a line
fileWriter.write(line+"\n")
lineCount += 1
if lineCount == fileLineCount:
lineCount = 0
fileWriter.close()
#read a line
line = fileReader.readline()
fileWriter.close()
except Exception as e:
#print the exception if any
print(e.__traceback__)
traceback.print_exc()
finally:
#close the file reader
fileReader.close()
o/p will look like, files, each having fileLineCount(i.e. 2000) lines, created in a same directory as :
1.txt
2.txt
3.txt
.
.
.
.
n.txt

Process not ending in python

I have a script to read a file that can be 10s of gigs big and i want to use multiprocessing to process it.
This is a compression algorithm where i want the user to define a buffer, then 3 processes will start, one to read the buffer amount of lines from the file, pass the lines to the processing process, then pass the processed lines to a process that writes the lines to a new file. I want all this to happen simultaneously, and for each process to wait for the next bundle of lines.
I already have the script, but when i run it, it doesn't end. I think something is wrong with the processes. I think it has to do with the islice in my read function, but i don't know how to write it better.
import multiprocessing as mp
import time
from itertools import islice
def read(from_filename, buffer, process_queue):
file = open(from_filename, 'r')
slice = islice(file, buffer)
while slice:
to_process = []
for line in slice:
to_process.append(line)
process_queue.put(to_process)
process_queue.put('kill')
def write(to_filename, write_queue):
to_file = open(to_filename, 'a+')
while 1:
to_write = write_queue.get()
if to_write == 'kill':
break
to_file.write(to_write + '\n')
def compress(process_queue, write_queue):
while 1:
to_process = process_queue.get()
if to_process == 'kill':
write_queue.put('kill')
break
# process, put output in to_write
write_queue.put(to_write)
def decompress(process_queue, write_queue):
while 1:
to_process = process_queue.get()
if to_process == 'kill':
write_queue.put('kill')
break
# process, put output in to_write
write_queue.put(to_write)
def main():
option = raw_input("C for Compress OR D for Decompress: ")
from_file = raw_input("Enter input filename: ")
buf = int(raw_input("Enter line buffer: "))
to_file = raw_input("Enter output filename: ")
start = time.time()
write_queue = mp.Queue()
process_queue = mp.Queue()
reader = mp.Process(target=read, args=(from_file, buf, process_queue))
writer = mp.Process(target=write, args=(to_file, write_queue))
if option == 'c' or option == 'C':
processor = mp.Process(target=compress, args=(process_queue, write_queue))
elif option == 'd' or option == 'D':
processor = mp.Process(target=decompress, args=(process_queue, write_queue))
else:
print "Invalid Options..."
writer.start()
processor.start()
reader.start()
reader.join()
processor.join()
writer.join()
end = time.time()
elapsed = (end - start)
print "\n\nTotal Time Elapsed: " + str(elapsed) + " secs"
if __name__=='__main__':
main()
This is my first attempt at multiprocessing.
When i run it, it doesn't end. I think a process is stuck somewhere.
This piece of code is wrong:
def read(from_filename, buffer, process_queue):
file = open(from_filename, 'r')
slice = islice(file, buffer)
while slice:
to_process = []
for line in slice:
to_process.append(line)
process_queue.put(to_process)
process_queue.put('kill')
Since slice is a islice object the condition while slice will always be true, hence it's like having a while True there. You should re-create the slice object every time.
def read(from_filename, buffer, process_queue):
file = open(from_filename, 'r')
while True:
slice = islice(file, buffer)
to_process = []
for line in slice:
to_process.append(line)
process_queue.put(to_process)
if not to_process:
# input ended
break
process_queue.put('kill')
Alternatively you could do:
def read_chunk(file, buffer):
return [file.readline() for _ in xrange(buffer)]
# or, "more" equivalent to using islice
#return [line for i,line in itertools.izip(xrange(buffer), file)]
def read(from_filename, buffer, process_queue):
file = open(from_filename, 'r')
for to_process in iter(lambda: read_chunk(file, buffer), []):
process_queue.put(to_process)
process_queue.put('kill')
Note that it doesn't make sense to use itertools.islice if you have to build a list anyway.

Categories

Resources