Process not ending in Python

I have a script to read a file that can be tens of gigabytes in size, and I want to use multiprocessing to process it.
This is a compression algorithm where I want the user to define a buffer size; then three processes start: one reads that many lines from the file and passes them to a processing process, which passes the processed lines to a third process that writes them to a new file. I want all of this to happen simultaneously, and for each process to wait for the next bundle of lines.
I already have the script, but when I run it, it doesn't end. I think something is wrong with the processes, probably with the islice in my read function, but I don't know how to write it better.
import multiprocessing as mp
import time
from itertools import islice

def read(from_filename, buffer, process_queue):
    file = open(from_filename, 'r')
    slice = islice(file, buffer)
    while slice:
        to_process = []
        for line in slice:
            to_process.append(line)
        process_queue.put(to_process)
    process_queue.put('kill')

def write(to_filename, write_queue):
    to_file = open(to_filename, 'a+')
    while 1:
        to_write = write_queue.get()
        if to_write == 'kill':
            break
        to_file.write(to_write + '\n')

def compress(process_queue, write_queue):
    while 1:
        to_process = process_queue.get()
        if to_process == 'kill':
            write_queue.put('kill')
            break
        # process, put output in to_write
        write_queue.put(to_write)

def decompress(process_queue, write_queue):
    while 1:
        to_process = process_queue.get()
        if to_process == 'kill':
            write_queue.put('kill')
            break
        # process, put output in to_write
        write_queue.put(to_write)

def main():
    option = raw_input("C for Compress OR D for Decompress: ")
    from_file = raw_input("Enter input filename: ")
    buf = int(raw_input("Enter line buffer: "))
    to_file = raw_input("Enter output filename: ")
    start = time.time()
    write_queue = mp.Queue()
    process_queue = mp.Queue()
    reader = mp.Process(target=read, args=(from_file, buf, process_queue))
    writer = mp.Process(target=write, args=(to_file, write_queue))
    if option == 'c' or option == 'C':
        processor = mp.Process(target=compress, args=(process_queue, write_queue))
    elif option == 'd' or option == 'D':
        processor = mp.Process(target=decompress, args=(process_queue, write_queue))
    else:
        print "Invalid Options..."
    writer.start()
    processor.start()
    reader.start()
    reader.join()
    processor.join()
    writer.join()
    end = time.time()
    elapsed = (end - start)
    print "\n\nTotal Time Elapsed: " + str(elapsed) + " secs"

if __name__=='__main__':
    main()
This is my first attempt at multiprocessing.
When I run it, it doesn't end; I think a process is stuck somewhere.

This piece of code is wrong:
def read(from_filename, buffer, process_queue):
    file = open(from_filename, 'r')
    slice = islice(file, buffer)
    while slice:
        to_process = []
        for line in slice:
            to_process.append(line)
        process_queue.put(to_process)
    process_queue.put('kill')
Since slice is an islice object, the condition while slice will always be true, so it's like having while True there. You should re-create the islice object for every chunk.
def read(from_filename, buffer, process_queue):
    file = open(from_filename, 'r')
    while True:
        slice = islice(file, buffer)
        to_process = []
        for line in slice:
            to_process.append(line)
        process_queue.put(to_process)
        if not to_process:
            # input ended
            break
    process_queue.put('kill')
Alternatively you could do:
def read_chunk(file, buffer):
    # readline() returns '' at EOF; drop the empty strings so that the
    # [] sentinel used by iter() below is actually reached at end of file
    return [line for line in (file.readline() for _ in xrange(buffer)) if line]
    # or, "more" equivalent to using islice
    #return [line for i,line in itertools.izip(xrange(buffer), file)]

def read(from_filename, buffer, process_queue):
    file = open(from_filename, 'r')
    for to_process in iter(lambda: read_chunk(file, buffer), []):
        process_queue.put(to_process)
    process_queue.put('kill')
Note that it doesn't make sense to use itertools.islice if you have to build a list anyway.
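For reference, here is a minimal snippet (using throwaway example data, not taken from the question) showing why the original loop never terminates: an islice object defines neither __bool__ nor __len__, so it stays truthy even after it has been exhausted.

from itertools import islice

it = islice(iter([1, 2, 3]), 2)
print(bool(it))   # True - islice objects are always truthy
print(list(it))   # [1, 2] - this consumes the slice
print(bool(it))   # still True, even though nothing is left
print(list(it))   # [] - so "while it:" would spin forever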

Related

python3: Read huge file (~ 800 Gb), split the lines by condition, and append them to the end of new files in a memory-efficient way

I'm learning Python 3, and I'm dealing with a huge txt file (~800 GB).
The enclosed function 'kmers_dic' reads the main file and, whenever the condition in the if statement is satisfied, appends the line to one of the previously created files (there are 1024 of them, named after the contents of the kmers variable). The function works fine on a subset of the main file, but when I run the code on the full file, my job is killed because I hit a memory usage limit.
import gzip
import sys

def OpenFiles(i):
    '''
    A switch to handle file opening and reduce duplicated code
    '''
    open_method = {
        "gz": gzip.open,
        "norm": open
    }
    return open_method[i]

def rows(f, chunksize=102400, sep='\n'):
    """
    Read a file where the row separator is '\n' lazily.
    Default chunk size: 102400kB 100Mb.
    Usage:
    >>> with open('big.csv') as f:
    >>>     for r in rows(f):
    >>>         process(r)
    """
    curr_row = ''
    while True:
        chunk = f.read(chunksize)
        if chunk == '': # End of file
            break
        while True:
            i = chunk.find(sep)
            if i == -1:
                break
            yield curr_row + chunk[:i]
            curr_row = ''
            chunk = chunk[i+1:]
        curr_row += chunk

def kmers_dic(input_file, kmers, out_dir):
    '''
    file writing by kmers
    '''
    #kmers_dic = set()
    count_line = 0
    count_line_1 = 0
    if input_file.endswith('.gz'):
        nano_read = OpenFiles('gz')
    else:
        nano_read = OpenFiles('norm')
    with nano_read(input_file, 'rt') as nano_f:
        chunk = rows(nano_f, chunksize=2024, sep='\n')
        for line in chunk:
            count_line += 1
            count_line_1 += 1
            sys.stdout.write('%s\r' % count_line)
            sys.stdout.flush()
            line = line.strip('\n')
            line = line.split()
            if line[2] in kmers:
                kmer = line[2]
                Out_f_name = out_dir + line[2] + '.lib'
                file1 = open(Out_f_name, 'a')
                ##file1.write('\t'.join(line) + '\n') # print entire line
                file1.write('\t'.join(line[1:4:] + line[6:9:] + line[9:13:] + line[15:]) + '\n')
                file1.close()
    print("lines: ", count_line_1)
I don't understand where the issue is. Can you help me?
Thanks in advance!
Best.
curr_row += chunk causes you to keep all chunks in memory until you run out of free memory.
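Beyond that one-liner, here is a minimal sketch of a leaner approach (assuming the 1024 output names are known up front via the kmers and out_dir placeholders from the question, and that the open-file limit allows ~1024 simultaneous handles): iterate the file object directly, which is already lazy line by line, and keep one output handle per kmer open instead of reopening a file for every matching line.

import gzip

def kmers_dic_streaming(input_file, kmers, out_dir):
    # A file object is already a lazy line iterator, so no custom
    # chunking generator is needed.
    opener = gzip.open if input_file.endswith('.gz') else open
    # One persistent handle per kmer avoids an open()/close() per matching line.
    out_handles = {k: open(out_dir + k + '.lib', 'a') for k in kmers}
    try:
        with opener(input_file, 'rt') as nano_f:
            for line in nano_f:
                fields = line.split()
                if len(fields) > 2 and fields[2] in kmers:
                    out_handles[fields[2]].write(
                        '\t'.join(fields[1:4] + fields[6:9] + fields[9:13] + fields[15:]) + '\n')
    finally:
        for fh in out_handles.values():
            fh.close()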

multiprocessing always returns an empty file when doing apply_async

I have a file (input.txt) containing half a million lines, and I want to encrypt these lines with my encrypt function and save them to a single file called output.txt. For example, the input.txt is
aab
abb
abc
Then I want to have my output.txt to be
001
011
012
Simple for loop version
I have a working for loop; however, it takes nearly 9 hours to encrypt all the lines:
encryption_map = {}
encryption_map['a']=0
encryption_map['b']=1
encryption_map['c']=2

def encrypt(input_str):
    output_int = ''
    for i in input_str:
        for ch in i.split('\n')[0]: # remove line break symbol \n
            output_int += str(encryption_map[ch])
    return output_int

text_path = 'input.txt'
with open(text_path, 'r') as input_file:
    lines = input_file.readlines()
with open('output.txt', 'w') as output_file:
    for l in lines:
        output_int = encrypt(l)
        output_file.write(output_int + '\n')
apply_async version
Since I want to keep the same ordering in the output.txt, it seems I have to use apply_async. Then my code becomes:
import multiprocessing as mp

encryption_map = {}
encryption_map['a']=0
encryption_map['b']=1
encryption_map['c']=2

def encrypt(input_str):
    output_int = ''
    for i in input_str:
        for ch in i.split('\n')[0]: # remove line break symbol \n
            output_int += str(encryption_map[ch])
    return output_int

def write_result(output):
    output_file.write(ipa_output + '\n')
    # output_file.flush() # This line is suggested by another stack question

pool = mp.Pool(20)
text_path = 'input.txt'
with open(text_path, 'r') as input_file:
    lines = input_file.readlines()
with open('output.txt', 'w') as output_file:
    for l in lines:
        pool.apply_async(encrypt, args=l, callback=write_result)
pool.close()
pool.join()
It runs much faster; however, the output.txt is always empty. What's wrong with my code? I found one post that also has difficulty writing out the file, and it suggests putting f.flush() inside the write function, but that doesn't work either.
You need to write args=(line,) like this:
import multiprocessing as mp

encryption_map = {}
encryption_map['a'] = 0
encryption_map['b'] = 1
encryption_map['c'] = 2

output_file = open('output.txt', 'w')

def encrypt(input_str):
    output_int = ''
    for i in input_str:
        for ch in i.split('\n')[0]:
            output_int += str(encryption_map[ch])
    return output_int

def write_result(output):
    output_file.write(output + '\n')

def main():
    #mp.set_start_method('spawn') # Only needed on OSX
    pool = mp.Pool(2)
    with open('input.txt') as input_file:
        lines = input_file.readlines()
    for line in lines:
        pool.apply_async(encrypt, args=(line,), callback=write_result)
    pool.close()
    pool.join()
    output_file.close()

if __name__ == '__main__':
    main()
EDIT:
In the above code, since we are using apply_async, the order of lines in the output might not be the same as that of the input.
If we want to preserve order, we can use map, map_async, or imap instead.
In this case, imap might be the best option since the callback operation (IO bound) is much slower than the worker operation (CPU bound):
import multiprocessing as mp

encryption_map = {}
encryption_map['a'] = 0
encryption_map['b'] = 1
encryption_map['c'] = 2

output_file = open('output.txt', 'w')

def encrypt(input_str):
    output_int = ''
    for i in input_str:
        for ch in i.split('\n')[0]:
            output_int += str(encryption_map[ch])
    return output_int

def main():
    mp.set_start_method('spawn') # Only needed on OSX
    pool = mp.Pool(2)
    with open('input.txt') as input_file:
        lines = input_file.readlines()
    for output in pool.imap(encrypt, lines):
        output_file.write(output + '\n')
    pool.close()
    pool.join()

if __name__ == '__main__':
    main()
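As a further tweak that is not part of the original answer: imap accepts a chunksize argument that batches many lines into a single inter-process message, which usually matters more than the worker count when each work item is this tiny. A minimal sketch, assuming the same input.txt/output.txt layout:

import multiprocessing as mp

encryption_map = {'a': 0, 'b': 1, 'c': 2}

def encrypt(line):
    # map each character of the line (minus the trailing newline) to its digit
    return ''.join(str(encryption_map[ch]) for ch in line.rstrip('\n'))

def main():
    with mp.Pool(4) as pool, \
         open('input.txt') as input_file, \
         open('output.txt', 'w') as output_file:
        # chunksize batches lines per IPC round trip; imap still returns results in order
        for output in pool.imap(encrypt, input_file, chunksize=10000):
            output_file.write(output + '\n')

if __name__ == '__main__':
    main()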

Processing a huge file with multithreading and writing it back to another file

I have a huge XML file (almost 5 GB). I am trying to search throughout the file, find some tags, and rename them. I used the same idea as in here to chunk the file into 10-megabyte chunks and search through each chunk; if a chunk contains the search item, the chunk is sent to another helper that reads it line by line and replaces the tag. It does not work: when it tries to merge the results and write the file back, the output file starts from somewhere arbitrary.
import re, threading, Queue

FILE_R = "C:\\Users\\USOMZIA\Desktop\\ABB_Work\\ERCOT\\Modifying_cim_model\\omid2.xml"
FILE_WR = "C:\\Users\\USOMZIA\Desktop\\ABB_Work\\ERCOT\\Modifying_cim_model\\x3.xml"

def get_chunks(file_r, size = 1024 * 1024):
    with open(file_r, 'rb') as f:
        while 1:
            start = f.tell()
            f.seek(size, 1)
            s = f.readline()
            yield start, f.tell() - start
            if not s:
                break

def process_line_by_line(file_r, chunk):
    with open(file_r, "rb") as f:
        f.seek(chunk[0])
        read_line_list = []
        for line_f in f.read(chunk[1]).splitlines():
            find_match = False
            for match_str in mapp:
                if match_str in str(line_f):
                    find_match = True
                    new_line = str(line_f).replace(match_str, mapp[match_str])
                    read_line_list.append(new_line)
                    break
            if not find_match:
                read_line_list.append(str(line_f))
    return read_line_list

def process(file_r, chunk):
    read_group_list = []
    with open(file_r, "r") as f:
        f.seek(chunk[0])
        s = f.read(chunk[1])
        if len(pattern.findall(s)) > 0:
            read_group_list = process_line_by_line(file_r, chunk)
        else:
            read_group_list = f.read(chunk[1]).splitlines()
    return read_group_list

class Worker(threading.Thread):
    def run(self):
        while 1:
            chunk = queue.get()
            if chunk is None:
                break
            result.append(process(*chunk))
            queue.task_done()

import time, sys
start_time = time.time()

pattern_list = []
mapp = {"cim:ConformLoad rdf:ID": "cim:CustomerLoad rdf:ID", "cim:Load rdf:ID": "cim:CustomerLoad rdf:ID", "cim:NonConformLoad rdf:ID": "cim:CustomerLoad rdf:ID",
        "cim:InductionMotorLoad rdf:ID": "cim:CustomerLoad rdf:ID", "cim:NonConformLoadGroup rdf:ID": "cim:ConformLoadGroup rdf:ID",
        "cim:NonConformLoad.LoadGroup": "cim:ConformLoad.LoadGroup",
        "/cim:ConformLoad>": "/cim:CustomerLoad>", "/cim:Load>": "/cim:CustomerLoad>", "/cim:NonConformLoad>": "/cim:CustomerLoad>",
        "/cim:InductionMotorLoad>": "/cim:CustomerLoad>", "/cim:NonConformLoadGroup>": "/cim:ConformLoadGroup>"}

reg_string = ""
for key in mapp:
    reg_string = reg_string + key + "|"
# to delete the last |
reg_string = list(reg_string)[:-1]
reg_string = ''.join(reg_string)
pattern = re.compile(r"cim:%s.*" %reg_string)
# This makes it faster than write an mo = pattern.search(line) in the loop
search = pattern.search

queue = Queue.Queue()
result = []
# Start the multithread
for i in range(1):
    w = Worker()
    w.setDaemon(1)
    w.start()

chunks = get_chunks(FILE_R, 10 * 1024 * 1024)
for chunk in chunks:
    print chunk
    queue.put((FILE_R, chunk))
queue.join()

with open(FILE_WR, "w") as f:
    for file_chunk in range(len(result)):
        for line in result[file_chunk]:
            f.write("%s\n" % line)

print time.time() - start_time
So I think the problem is that when the jobs in the queue get done, they are somehow not in order, and as a result the output is not synchronized. Is there any way I can synchronize them?
Thank you for the help!
I think I found what the problem is:
read_group_list = f.read(chunk[1]).splitlines()
This line in the process function creates the problem. After I replaced it with:
read_group_list = s.splitlines()
It gives me the correct file now.
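A side note on the original ordering worry: with range(1) there is only one worker thread, so results happen to be appended in submission order; with more workers that guarantee disappears. One possible way to keep the order independent of the worker count (a sketch only, reusing the queue, result, and process names from the question) is to tag each chunk with its index and write the results out sorted by that index:

import threading, Queue

queue = Queue.Queue()
result = {}            # chunk index -> list of processed lines

class Worker(threading.Thread):
    def run(self):
        while 1:
            idx, file_r, chunk = queue.get()
            result[idx] = process(file_r, chunk)   # process() as defined in the question
            queue.task_done()

# submission side (replacing the original loop):
#   for idx, chunk in enumerate(get_chunks(FILE_R, 10 * 1024 * 1024)):
#       queue.put((idx, FILE_R, chunk))
# write side, after queue.join():
#   for idx in sorted(result):
#       for line in result[idx]:
#           f.write("%s\n" % line)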

Split large text file (around 50 GB) into multiple files

I would like to split a large text file, around 50 GB in size, into multiple files.
Data in the file looks like this [x = any integer between 0-9]:
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
...............
...............
There might be a few billion lines in the file, and I would like to write, for example, 30/40 million lines per file.
I guess the steps would be:
I have to open the file,
then using readline() read the file line by line and write it at the same time to a new file,
and as soon as it hits the maximum number of lines, create another file and
start writing again.
I'm wondering how to put all these steps together in a memory-efficient and fast way. I've seen some examples on Stack Overflow, but none of them does exactly what I need. I would really appreciate it if anyone could help me out.
This working solution uses the split command available in the shell. Since the author has already accepted the possibility of a non-Python solution, please do not downvote.
First, I created a test file with 1000M entries (15 GB) with
awk 'BEGIN{for (i = 0; i < 1000000000; i++) {print "123.123.123.123"} }' > t.txt
Then I used split:
split --lines=30000000 --numeric-suffixes --suffix-length=2 t.txt t
It took 5 min to produce a set of 34 small files with names t00-t33. 33 files are 458 MB each and the last t33 is 153 MB.
from itertools import chain, islice

def chunks(iterable, n):
    "chunks(ABCDE,2) => AB CD E"
    iterable = iter(iterable)
    while True:
        # store one line in memory,
        # chain it to an iterator on the rest of the chunk
        try:
            first = next(iterable)
        except StopIteration:
            # input exhausted; return cleanly instead of leaking StopIteration
            # (which becomes a RuntimeError inside generators on Python 3.7+)
            return
        yield chain([first], islice(iterable, n-1))

l = 30*10**6
file_large = 'large_file.txt'
with open(file_large) as bigfile:
    for i, lines in enumerate(chunks(bigfile, l)):
        file_split = '{}.{}'.format(file_large, i)
        with open(file_split, 'w') as f:
            f.writelines(lines)
I would use the Unix utility split, if it is available to you and your only task is to split the file. Here is however a pure Python solution:
import contextlib

file_large = 'large_file.txt'
l = 30*10**6  # lines per split file
with contextlib.ExitStack() as stack:
    fd_in = stack.enter_context(open(file_large))
    for i, line in enumerate(fd_in):
        if not i % l:
            file_split = '{}.{}'.format(file_large, i//l)
            fd_out = stack.enter_context(open(file_split, 'w'))
        fd_out.write(line)  # line already ends with '\n'
If all of your lines have 4 3-digit numbers on them and you have multiple cores available, then you can exploit file seek and run multiple processes.
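That suggestion stops at a sentence, so here is a hedged illustration of what it could look like, assuming every line is exactly 16 bytes ("xxx.xxx.xxx.xxx" plus a newline) so that byte offsets can be computed without scanning, and using placeholder file names:

import os
from multiprocessing import Pool

LINE_BYTES = 16                 # 15 characters plus the newline (assumption)
LINES_PER_FILE = 30 * 10**6
SRC = 'large_file.txt'          # placeholder input name

def write_part(part_no):
    # Each worker seeks straight to its own byte range and copies it out in blocks.
    start = part_no * LINES_PER_FILE * LINE_BYTES
    remaining = LINES_PER_FILE * LINE_BYTES
    with open(SRC, 'rb') as src, open('{}.{}'.format(SRC, part_no), 'wb') as dst:
        src.seek(start)
        while remaining > 0:
            block = src.read(min(remaining, 8 * 1024 * 1024))
            if not block:       # the last part may be shorter than the others
                break
            dst.write(block)
            remaining -= len(block)
    return part_no

if __name__ == '__main__':
    part_bytes = LINES_PER_FILE * LINE_BYTES
    total_parts = -(-os.path.getsize(SRC) // part_bytes)   # ceiling division
    with Pool() as pool:
        pool.map(write_part, range(total_parts))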
This class may solve your problem.
I've tested it on the Linux and Windows operating systems, and it worked perfectly on both of them.
I've also tested binary and text files of different sizes each time, and it worked great.
Enjoy :)
import os
import math

class FileSpliter:
    # If file type is text then CHUNK_SIZE is count of chars
    # If file type is binary then CHUNK_SIZE is count of bytes
    def __init__(self, InputFile, FileType="b", CHUNK_SIZE=524288, OutFile="outFile"):
        self.CHUNK_SIZE = CHUNK_SIZE # byte or char
        self.InputFile = InputFile
        self.FileType = FileType # b: binary, t: text
        self.OutFile = OutFile
        self.FileSize = 0
        self.Parts = None
        self.CurrentPartNo = 0
        self.Progress = 0.0

    def Prepare(self):
        if not(os.path.isfile(self.InputFile) and os.path.getsize(self.InputFile) > 0):
            print("ERROR: The file is not exists or empty!")
            return False
        self.FileSize = os.path.getsize(self.InputFile)
        if self.CHUNK_SIZE >= self.FileSize:
            self.Parts = 1
        else:
            self.Parts = math.ceil(self.FileSize / self.CHUNK_SIZE)
        return True

    def Split(self):
        if self.FileSize == 0 or self.Parts == None:
            print("ERROR: File is not prepared for split!")
            return False
        with open(self.InputFile, "r" + self.FileType) as f:
            while True:
                if self.FileType == "b":
                    buf = bytearray(f.read(self.CHUNK_SIZE))
                elif self.FileType == "t":
                    buf = f.read(self.CHUNK_SIZE)
                else:
                    print("ERROR: File type error!")
                if not buf:
                    # we've read the entire file in, so we're done.
                    break
                of = self.OutFile + str(self.CurrentPartNo)
                outFile = open(of, "w" + self.FileType)
                outFile.write(buf)
                outFile.close()
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def Rebuild(self):
        self.CurrentPartNo = 0
        if self.Parts == None:
            return False
        with open(self.OutFile, "w" + self.FileType) as f:
            while self.CurrentPartNo < self.Parts:
                If = self.OutFile + str(self.CurrentPartNo)
                if not(os.path.isfile(If) and os.path.getsize(If) > 0):
                    print("ERROR: The file [" + If + "] is not exists or empty!")
                    return False
                InputFile = open(If, "r" + self.FileType)
                buf = InputFile.read()
                if not buf:
                    # we've read the entire file in, so we're done.
                    break
                f.write(buf)
                InputFile.close()
                os.remove(If)
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def ProgressBar(self, BarLength=20, ProgressIcon="#", BarIcon="-"):
        try:
            # You can't have a progress bar with zero or negative length.
            if BarLength <1:
                BarLength = 20
            # Use status variable for going to the next line after progress completion.
            Status = ""
            # Calcuting progress between 0 and 1 for percentage.
            self.Progress = float(self.CurrentPartNo) / float(self.Parts)
            # Doing this conditions at final progressing.
            if self.Progress >= 1.:
                self.Progress = 1
                Status = "\r\n" # Going to the next line
            # Calculating how many places should be filled
            Block = int(round(BarLength * self.Progress))
            # Show this
            Bar = "\r[{}] {:.0f}% {}".format(ProgressIcon * Block + BarIcon * (BarLength - Block), round(self.Progress * 100, 0), Status)
            print(Bar, end="")
        except:
            print("\rERROR")

def main():
    fp = FileSpliter(InputFile="inFile", FileType="b") #, CHUNK_SIZE=300000)
    if fp.Prepare():
        # Spliting ...
        print("Spliting ...")
        sr = fp.Split()
        if sr == True:
            print("The file splited successfully.")
            print()
            # Rebuilding ...
            print("Rebuilding ...")
            rr = fp.Rebuild()
            if rr == True:
                print("The file rebuilded successfully.")

if __name__ == "__main__":
    main()
I am sharing a Python 3 solution which I usually use to split files whose sizes are in MBs.
However, I have not yet tried it on files whose sizes are in GBs.
TextFileSplitter.py
import traceback

#get a file name to be read
fileToRead = input("Enter file name : ")
# max lines you want to write in a single file
fileLineCount = 2000
lineCount = 0
fileCount = 1

try:
    print('Start splitting...')
    #read a file
    fileReader = open(fileToRead)
    line = fileReader.readline()
    fileWriter = open(str(fileCount)+".txt","a")
    while line != '':#empty is EOF
        if lineCount == 0:
            #create a file in append mode
            fileWriter = open(str(fileCount)+".txt","a")
            #increment file count, use it for new file name
            fileCount += 1
        #write a line
        fileWriter.write(line+"\n")
        lineCount += 1
        if lineCount == fileLineCount:
            lineCount = 0
            fileWriter.close()
        #read a line
        line = fileReader.readline()
    fileWriter.close()
except Exception as e:
    #print the exception if any
    print(e.__traceback__)
    traceback.print_exc()
finally:
    #close the file reader
    fileReader.close()
The output will be files, each having fileLineCount (i.e. 2000) lines, created in the same directory:
1.txt
2.txt
3.txt
.
.
.
.
n.txt

Problems with islice to read N number of lines at a time

I am trying to use "from itertools import islice" in order to read a number of lines at a time from a *.las file using the liblas module (my goal is reading chunk by chunk), following the question: Python how to read N number of lines at a time.
islice() can be used to get the next n items of an iterator. Thus,
list(islice(f, n)) will return a list of the next n lines of the file
f. Using this inside a loop will give you the file in chunks of n
lines. At the end of the file, the list might be shorter, and finally
the call will return an empty list.
I used the following code:
from numpy import nonzero
from liblas import file as lasfile
from itertools import islice

chunkSize = 1000000
f = lasfile.File(inFile,None,'r') # open LAS
while True:
    chunk = list(islice(f,chunkSize))
    if not chunk:
        break
    # do other stuff
but I have this problem:
>>> len(f)
2866390
>>> chunk = list(islice(f, 1000000))
>>> len(chunk)
1000000
>>> chunk = list(islice(f, 1000000))
>>> len(chunk)
1000000
>>> chunk = list(islice(f, 1000000))
>>> len(chunk)
866390
>>> chunk = list(islice(f, 1000000))
>>> len(chunk)
1000000
When the file f reaches the end, islice restarts reading the file from the beginning.
Thanks for any suggestions and help. It's very much appreciated.
It seems like it would be easy enough to write a generator to yield n lines at a time:
def n_line_iterator(fobj,n):
    if n < 1:
        raise ValueError("Must supply a positive number of lines to read")
    out = []
    num = 0
    for line in fobj:
        if num == n:
            yield out #yield 1 chunk
            num = 0
            out = []
        out.append(line)
        num += 1
    yield out #need to yield the rest of the lines
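A usage sketch, with lasfile, inFile, and chunkSize as in the question, and do_stuff_with standing in for whatever per-chunk processing you need:

f = lasfile.File(inFile, None, 'r')   # open LAS, as in the question
for chunk in n_line_iterator(f, chunkSize):
    # chunk is a list of up to chunkSize points
    do_stuff_with(chunk)              # placeholder for the real processing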
Change the source code of file.py that belongs to the liblas package. Currently __iter__ is defined as (src on github):
def __iter__(self):
    """Iterator support (read mode only)

    >>> points = []
    >>> for i in f:
    ...     points.append(i)
    ...     print i # doctest: +ELLIPSIS
    <liblas.point.Point object at ...>
    """
    if self.mode == 0:
        self.at_end = False
        p = core.las.LASReader_GetNextPoint(self.handle)
        while p and not self.at_end:
            yield point.Point(handle=p, copy=True)
            p = core.las.LASReader_GetNextPoint(self.handle)
        if not p:
            self.at_end = True
        else:
            self.close()
            self.open()
You can see that when the file is at the end, it is closed and opened again, so iteration starts over at the beginning of the file.
Try removing the last else block after the while loop, so the correct code for the method should be:
def __iter__(self):
    """Iterator support (read mode only)

    >>> points = []
    >>> for i in f:
    ...     points.append(i)
    ...     print i # doctest: +ELLIPSIS
    <liblas.point.Point object at ...>
    """
    if self.mode == 0:
        self.at_end = False
        p = core.las.LASReader_GetNextPoint(self.handle)
        while p and not self.at_end:
            yield point.Point(handle=p, copy=True)
            p = core.las.LASReader_GetNextPoint(self.handle)
        if not p:
            self.at_end = True
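If editing the installed package is not an option, a workaround (my own sketch, not part of the original answer) is to call iter() on the file exactly once and feed that single iterator to islice. Each chunk then continues where the previous one stopped, and the troublesome __iter__ method is only ever entered once, so the close/reopen branch is never reached:

from itertools import islice
from liblas import file as lasfile

chunkSize = 1000000
f = lasfile.File(inFile, None, 'r')   # inFile as in the question
points = iter(f)                      # create the underlying generator once
while True:
    chunk = list(islice(points, chunkSize))
    if not chunk:
        break
    # do other stuff with chunk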
