Problems with islice to read N number of lines at a time - python

I am trying to use "from itertools import islice" in order to read a number of lines at a time from a *.las file using the liblas module. (my goal is reading chunk-bychunk)
following the question: Python how to read N number of lines at a time
islice() can be used to get the next n items of an iterator. Thus,
list(islice(f, n)) will return a list of the next n lines of the file
f. Using this inside a loop will give you the file in chunks of n
lines. At the end of the file, the list might be shorter, and finally
the call will return an empty list.
I used the following code:
from numpy import nonzero
from liblas import file as lasfile
from itertools import islice

chunkSize = 1000000
f = lasfile.File(inFile, None, 'r')  # open LAS
while True:
    chunk = list(islice(f, chunkSize))
    if not chunk:
        break
    # do other stuff
But I have this problem:
>>> len(f)
2866390
>>> chunk = list(islice(f, 1000000))
>>> len(chunk)
1000000
>>> chunk = list(islice(f, 1000000))
>>> len(chunk)
1000000
>>> chunk = list(islice(f, 1000000))
>>> len(chunk)
866390
>>> chunk = list(islice(f, 1000000))
>>> len(chunk)
1000000
When the file f reaches its end, islice starts reading the file again from the beginning.
Thanks for any suggestions and help. It's very appreciated.

It seems like it would be easy enough to write a generator to yield n lines at a time:
def n_line_iterator(fobj, n):
    if n < 1:
        raise ValueError("Must supply a positive number of lines to read")
    out = []
    num = 0
    for line in fobj:
        if num == n:
            yield out  # yield 1 chunk
            num = 0
            out = []
        out.append(line)
        num += 1
    yield out  # need to yield the rest of the lines
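As a rough usage sketch (assuming inFile and chunkSize from the question are in scope), the generator is driven by a single for loop, so iter() is only called once on the liblas file object:

from liblas import file as lasfile

f = lasfile.File(inFile, None, 'r')   # inFile as defined in the question
for chunk in n_line_iterator(f, chunkSize):
    # chunk is a list of up to chunkSize points
    print(len(chunk))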

Change the source code of file.py that belongs to the liblas package. Currently __iter__ is defined as (source on GitHub):
def __iter__(self):
    """Iterator support (read mode only)

    >>> points = []
    >>> for i in f:
    ...     points.append(i)
    ...     print i # doctest: +ELLIPSIS
    <liblas.point.Point object at ...>
    """
    if self.mode == 0:
        self.at_end = False
        p = core.las.LASReader_GetNextPoint(self.handle)
        while p and not self.at_end:
            yield point.Point(handle=p, copy=True)
            p = core.las.LASReader_GetNextPoint(self.handle)
            if not p:
                self.at_end = True
        else:
            self.close()
            self.open()
You can see that when the file reaches its end, it is closed and opened again, so iteration starts over at the beginning of the file.
Try removing the last else block after the while, so the method becomes:
def __iter__(self):
    """Iterator support (read mode only)

    >>> points = []
    >>> for i in f:
    ...     points.append(i)
    ...     print i # doctest: +ELLIPSIS
    <liblas.point.Point object at ...>
    """
    if self.mode == 0:
        self.at_end = False
        p = core.las.LASReader_GetNextPoint(self.handle)
        while p and not self.at_end:
            yield point.Point(handle=p, copy=True)
            p = core.las.LASReader_GetNextPoint(self.handle)
            if not p:
                self.at_end = True
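With that change, the chunked loop from the question should terminate, because exhausting the reader no longer rewinds it (a sketch, using the imports and the inFile/chunkSize names from the question):

f = lasfile.File(inFile, None, 'r')
while True:
    chunk = list(islice(f, chunkSize))
    if not chunk:  # an empty list now really means end of file
        break
    # do other stuff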

Related

python3: Read huge file (~ 800 Gb), split the lines by condition, and append them to the end of new files in a memory-efficient way

I’m learning python 3, and I’m dealing with a huge txt file (~800Gb).
The enclosed function 'kmers_dic' reads the main file and, if the condition in the if statement is satisfied, appends the line to one of the previously created files (there are 1024 of them, named after the contents of the kmers variable). The function works fine with a subset of the main file, but when I run the code on the full file, my job is killed because I hit a memory usage limit.
def OpenFiles(i):
    '''
    A switch to handle file opening and reduce duplicated code
    '''
    open_method = {
        "gz": gzip.open,
        "norm": open
    }
    return open_method[i]

def rows(f, chunksize=102400, sep='\n'):
    """
    Read a file where the row separator is '\n' lazily.
    Default chunk size: 102400kB 100Mb.
    Usage:
    >>> with open('big.csv') as f:
    >>>     for r in rows(f):
    >>>         process(r)
    """
    curr_row = ''
    while True:
        chunk = f.read(chunksize)
        if chunk == '':  # End of file
            break
        while True:
            i = chunk.find(sep)
            if i == -1:
                break
            yield curr_row + chunk[:i]
            curr_row = ''
            chunk = chunk[i+1:]
        curr_row += chunk
def kmers_dic(input_file, kmers, out_dir):
    '''
    file writing by kmers
    '''
    #kmers_dic = set()
    count_line = 0
    count_line_1 = 0
    if input_file.endswith('.gz'):
        nano_read = OpenFiles('gz')
    else:
        nano_read = OpenFiles('norm')
    with nano_read(input_file, 'rt') as nano_f:
        chunk = rows(nano_f, chunksize=2024, sep='\n')
        for line in chunk:
            count_line += 1
            count_line_1 += 1
            sys.stdout.write('%s\r' % count_line)
            sys.stdout.flush()
            line = line.strip('\n')
            line = line.split()
            if line[2] in kmers:
                kmer = line[2]
                Out_f_name = out_dir + line[2] + '.lib'
                file1 = open(Out_f_name, 'a')
                ##file1.write('\t'.join(line) + '\n') # print entire line
                file1.write('\t'.join(line[1:4:] + line[6:9:] + line[9:13:] + line[15:]) + '\n')
                file1.close()
    print("lines: ", count_line_1)
I don't understand where the issue is.
Can you help me?
Thanks in advance!
Best.
curr_row += chunk causes you to keep accumulating chunks in memory until you run out of free memory.
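One low-risk way around that accumulation, sketched here under the assumption that the input really is plain newline-separated text, is to drop the custom rows() buffering entirely and let the file handle iterate lazily, one line at a time:

with nano_read(input_file, 'rt') as nano_f:
    for line in nano_f:          # constant memory: one line at a time
        fields = line.split()
        # ... same per-line filtering and writing as in kmers_dic ...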

opening specific lines inside multiple files

I am trying to open specific lines of multiple files and return the lines of each file. My solution is quite time-consuming. Do you have any suggestions?
func.filename: the name of the given file
func.start_line: the starting point in the given file
func.end_line: the finishing point in the given file
def method_open(func):
    try:
        body = open(func.filename).readlines()[func.start_line:func.end_line]
    except IOError:
        body = []
        stderr.write("\nCouldn't open the referenced method inside {0}".
                     format(func.filename))
        stderr.flush()
    return body
Keep in mind that sometimes the file func.filename being opened can be the same one, but unfortunately this is not the case most of the time.
The problem with readlines is that it reads the whole file into memory and linecache does the same.
You can save some time by reading one line at a time and breaking the loop as soon as you reach func.end_line.
But the best method I found is to use itertools.islice.
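For instance, a minimal sketch of how method_open from the question could use islice instead of readlines (same names and error handling as above; not the original code):

from itertools import islice
from sys import stderr

def method_open(func):
    try:
        with open(func.filename) as f:
            # islice skips lazily to start_line and stops at end_line,
            # so only the requested lines are materialised
            body = list(islice(f, func.start_line, func.end_line))
    except IOError:
        body = []
        stderr.write("\nCouldn't open the referenced method inside {0}".format(func.filename))
        stderr.flush()
    return body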
Here are the results of some tests I did on a 130 MB file of ~9701k lines:
--- 1.43700003624 seconds --- f_readlines
--- 1.00099992752 seconds --- f_enumerate
--- 1.1400001049 seconds --- f_linecache
--- 0.0 seconds --- f_itertools_islice
Here is the script I used:
import time
import linecache
import itertools

def f_readlines(filename, start_line, endline):
    with open(filename) as f:
        f.readlines()[5000:10000]

def f_enumerate(filename, start_line, endline):
    result = []
    with open(filename) as f:
        for i, line in enumerate(f):
            if i in range(start_line, endline):
                result.append(line)
            if i > endline:
                break

def f_linecache(filename, start_line, endline):
    result = []
    for n in range(start_line, endline):
        result.append(linecache.getline(filename, n))

def f_itertools_islice(filename, start_line, endline):
    result = []
    with open(filename) as f:
        resultt = itertools.islice(f, start_line, endline)
        for i in resultt:
            result.append(i)

def runtest(func_to_test):
    filename = "testlongfile.txt"
    start_line = 5000
    endline = 10000
    start_time = time.time()
    func_to_test(filename, start_line, endline)
    print("--- %s seconds --- %s" % ((time.time() - start_time), func_to_test.__name__))

runtest(f_readlines)
runtest(f_enumerate)
runtest(f_linecache)
runtest(f_itertools_islice)

Need to group every five or so lines of a text file into a single, concatenated line

By way of an example, I have a utf-8 dictionary text file like so:
iguanodont
primer
blindfolder
pseudosperm
chanciest
givers
renascent
lecanine
struth
unionizers
autoriser
interpunctuation
monophylies
approximativeness
I need to iterate through, group every five lines together (separated by a space), and spit out a new text file, like this:
iguanodont primer blindfolder pseudosperm chanciest
givers renascent lecanine struth unionizers
autoriser interpunctuation monophylies approximativeness
So far, I've got this. I'm very new, so I apologize this is so banal. Thank you in advance.
import io
dictionary = io.open("shuffled.txt", 'r')
Unless your input file is so huge it won't fit in memory, reading it into a list and slicing that list is simplest -- a 3-liner or so:
allrows = [line.rstrip('\n') for line in io.open("shuffled.txt", 'r')]
byfive = [allrows[i:i+5] for i in range(0, len(allrows), 5)]
io.open('out.txt', 'w').writelines(' '.join(x) + '\n' for x in byfive)
Of course you can get much fancier to deal with unbounded files, assured closure in the case of exceptions, and so forth, but it may be best to keep it simple while that's feasible, and add complication only if warranted.
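If the file might not fit in memory after all, a lazier variant of the same idea, pulling five lines at a time with islice, could look roughly like this (a sketch, with the filenames taken from the question):

import io
from itertools import islice

with io.open("shuffled.txt", encoding="utf-8") as fin, \
     io.open("out.txt", "w", encoding="utf-8") as fout:
    while True:
        group = [line.strip() for line in islice(fin, 5)]
        if not group:          # islice returned nothing: end of file
            break
        fout.write(u" ".join(group) + u"\n")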
# assumes Python 3.x
from itertools import zip_longest

INPUT = "shuffled.txt"
OUTPUT = "by_fives.txt"

# from itertools documentation,
# https://docs.python.org/3.4/library/itertools.html
def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

def main():
    with open(INPUT) as wordfile, open(OUTPUT, "w") as result:
        wordlist = (line.strip() for line in wordfile)
        for fivewords in grouper(wordlist, 5, ""):
            result.write(" ".join(fivewords) + "\n")

if __name__ == "__main__":
    main()
read_file_name = 'words.txt'
write_file_name = 'words_grouped.txt'

def chunks(l, n):
    """ Yield successive n-sized chunks from l.
    Thanks Ned Batchelder
    """
    for i in xrange(0, len(l), n):
        yield l[i:i+n]

f = open(read_file_name)
words = f.read()
f.close()
words = words.split("\n")
grouped = list(chunks(words, 5))
f2 = open(write_file_name, 'w+')
f2.write(str(grouped))
f2.close()
Not exactly what you were asking for but similar. This will generate a list of lists of the grouped data which is then cast to a string and saved to a file.
OUTPUT:
[['iguanodont', 'primer', 'blindfolder', 'pseudosperm', 'chanciest'], ['givers', 'renascent', 'lecanine', 'struth', 'unionizers'], ['autoriser', 'interpunctuation', 'monophylies', 'approximativeness', '']]
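If the space-joined lines from the question are wanted instead of the stringified list, the same grouped data can be written out one line per group; a small sketch building on the variables above:

f2 = open(write_file_name, 'w+')
for group in grouped:
    # drop the empty string that split("\n") can leave at the end of the last group
    f2.write(" ".join(w for w in group if w) + "\n")
f2.close()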
Here is my solution:
fin = open('input.txt', 'r')
fout = open('output.txt', 'w')
lineCount = 0
toAdd = ""
for line in fin:
    if(lineCount == 5):
        fout.write(toAdd + "\n")
        lineCount = 1
        toAdd = ""
        toAdd += line.strip()
    else:
        if(lineCount == 0):
            toAdd += line.strip()
        else:
            toAdd += " " + line.strip()
        lineCount += 1
if(lineCount != 0):
    fout.write(toAdd)
fin.close()
fout.close()
with the input.txt as follows:
iguanodont
primer
blindfolder
pseudosperm
chanciest
givers
renascent
lecanine
struth
unionizers
autoriser
interpunctuation
monophylies
approximativeness
and the output.txt as follows:
iguanodont primer blindfolder pseudosperm chanciest
givers renascent lecanine struth unionizers
autoriser interpunctuation monophylies approximativeness

Split large text file (around 50GB) into multiple files

I would like to split a large text file, around 50GB in size, into multiple files.
Data in the file looks like this [x = any integer between 0 and 9]:
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
...............
...............
There might be a few billion lines in the file, and I would like to write, for example, 30/40 million lines per file.
I guess the steps would be:
I have to open the file,
then using readline() read the file line by line and write it at the same time to a new file,
and as soon as it hits the maximum number of lines, create another file and start writing again.
I'm wondering how to put all these steps together in a memory-efficient and fast way. I've seen some examples on Stack Overflow, but none of them does exactly what I need. I would really appreciate it if anyone could help me out.
This working solution uses the split command available in the shell. Since the author has already accepted the possibility of a non-Python solution, please do not downvote.
First, I created a test file with 1000M entries (15 GB) with
awk 'BEGIN{for (i = 0; i < 1000000000; i++) {print "123.123.123.123"} }' > t.txt
Then I used split:
split --lines=30000000 --numeric-suffixes --suffix-length=2 t.txt t
It took 5 min to produce a set of 34 small files with names t00-t33. 33 files are 458 MB each and the last t33 is 153 MB.
from itertools import chain, islice

def chunks(iterable, n):
    "chunks(ABCDE,2) => AB CD E"
    iterable = iter(iterable)
    while True:
        try:
            first = next(iterable)
        except StopIteration:
            return  # end the generator explicitly (required on Python 3.7+, PEP 479)
        # store one line in memory,
        # chain it to an iterator on the rest of the chunk
        yield chain([first], islice(iterable, n-1))

l = 30*10**6
file_large = 'large_file.txt'
with open(file_large) as bigfile:
    for i, lines in enumerate(chunks(bigfile, l)):
        file_split = '{}.{}'.format(file_large, i)
        with open(file_split, 'w') as f:
            f.writelines(lines)
I would use the Unix utility split, if it is available to you and your only task is to split the file. Here is however a pure Python solution:
import contextlib

file_large = 'large_file.txt'
l = 30*10**6  # lines per split file
with contextlib.ExitStack() as stack:
    fd_in = stack.enter_context(open(file_large))
    for i, line in enumerate(fd_in):
        if not i % l:
            file_split = '{}.{}'.format(file_large, i//l)
            fd_out = stack.enter_context(open(file_split, 'w'))
        fd_out.write(line)  # line already ends with '\n'
If all of your lines have 4 3-digit numbers on them and you have multiple cores available, then you can exploit file seek and run multiple processes.
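A rough sketch of that idea, under the assumption that every line is exactly 16 bytes ("xxx.xxx.xxx.xxx" plus a newline) and with hypothetical names for the input file and chunk size:

import os
from multiprocessing import Pool

LINE_WIDTH = 16                     # bytes per fixed-width line (assumed)
LINES_PER_FILE = 30 * 10**6
SRC = 'large_file.txt'              # hypothetical input name

def write_part(part_no):
    # each worker seeks straight to its own byte range and copies it out
    start = part_no * LINES_PER_FILE * LINE_WIDTH
    with open(SRC, 'rb') as src, open('{}.{}'.format(SRC, part_no), 'wb') as dst:
        src.seek(start)
        dst.write(src.read(LINES_PER_FILE * LINE_WIDTH))

if __name__ == '__main__':
    part_bytes = LINES_PER_FILE * LINE_WIDTH
    n_parts = -(-os.path.getsize(SRC) // part_bytes)   # ceiling division
    with Pool() as pool:
        pool.map(write_part, range(n_parts))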
This class may solve your problem.
I've tested it on the Linux and Windows operating systems, and it worked perfectly on both of them.
Also, I've tested binary and text files of different sizes each time, and it was great.
Enjoy :)
import os
import math

class FileSpliter:
    # If file type is text then CHUNK_SIZE is count of chars
    # If file type is binary then CHUNK_SIZE is count of bytes
    def __init__(self, InputFile, FileType="b", CHUNK_SIZE=524288, OutFile="outFile"):
        self.CHUNK_SIZE = CHUNK_SIZE  # byte or char
        self.InputFile = InputFile
        self.FileType = FileType  # b: binary, t: text
        self.OutFile = OutFile
        self.FileSize = 0
        self.Parts = None
        self.CurrentPartNo = 0
        self.Progress = 0.0

    def Prepare(self):
        if not(os.path.isfile(self.InputFile) and os.path.getsize(self.InputFile) > 0):
            print("ERROR: The file does not exist or is empty!")
            return False
        self.FileSize = os.path.getsize(self.InputFile)
        if self.CHUNK_SIZE >= self.FileSize:
            self.Parts = 1
        else:
            self.Parts = math.ceil(self.FileSize / self.CHUNK_SIZE)
        return True

    def Split(self):
        if self.FileSize == 0 or self.Parts == None:
            print("ERROR: File is not prepared for split!")
            return False
        with open(self.InputFile, "r" + self.FileType) as f:
            while True:
                if self.FileType == "b":
                    buf = bytearray(f.read(self.CHUNK_SIZE))
                elif self.FileType == "t":
                    buf = f.read(self.CHUNK_SIZE)
                else:
                    print("ERROR: File type error!")
                    return False
                if not buf:
                    # we've read the entire file in, so we're done.
                    break
                of = self.OutFile + str(self.CurrentPartNo)
                outFile = open(of, "w" + self.FileType)
                outFile.write(buf)
                outFile.close()
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def Rebuild(self):
        self.CurrentPartNo = 0
        if self.Parts == None:
            return False
        with open(self.OutFile, "w" + self.FileType) as f:
            while self.CurrentPartNo < self.Parts:
                If = self.OutFile + str(self.CurrentPartNo)
                if not(os.path.isfile(If) and os.path.getsize(If) > 0):
                    print("ERROR: The file [" + If + "] does not exist or is empty!")
                    return False
                InputFile = open(If, "r" + self.FileType)
                buf = InputFile.read()
                if not buf:
                    # we've read the entire file in, so we're done.
                    break
                f.write(buf)
                InputFile.close()
                os.remove(If)
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def ProgressBar(self, BarLength=20, ProgressIcon="#", BarIcon="-"):
        try:
            # You can't have a progress bar with zero or negative length.
            if BarLength < 1:
                BarLength = 20
            # Use the status variable for going to the next line after progress completion.
            Status = ""
            # Calculating progress between 0 and 1 for percentage.
            self.Progress = float(self.CurrentPartNo) / float(self.Parts)
            # Handle the final progress state.
            if self.Progress >= 1.:
                self.Progress = 1
                Status = "\r\n"  # Going to the next line
            # Calculating how many places should be filled
            Block = int(round(BarLength * self.Progress))
            # Show this
            Bar = "\r[{}] {:.0f}% {}".format(ProgressIcon * Block + BarIcon * (BarLength - Block), round(self.Progress * 100, 0), Status)
            print(Bar, end="")
        except:
            print("\rERROR")

def main():
    fp = FileSpliter(InputFile="inFile", FileType="b")  # , CHUNK_SIZE=300000)
    if fp.Prepare():
        # Splitting ...
        print("Splitting ...")
        sr = fp.Split()
        if sr == True:
            print("The file was split successfully.")
            print()
            # Rebuilding ...
            print("Rebuilding ...")
            rr = fp.Rebuild()
            if rr == True:
                print("The file was rebuilt successfully.")

if __name__ == "__main__":
    main()
I am writing a Python 3 solution which I usually use to split files whose size is in MBs.
However, I have not yet tried it on files whose size is in GBs.
TextFileSplitter.py
import traceback

# get a file name to be read
fileToRead = input("Enter file name : ")
# max lines you want to write in a single file
fileLineCount = 2000
lineCount = 0
fileCount = 1
try:
    print('Start splitting...')
    # read a file
    fileReader = open(fileToRead)
    line = fileReader.readline()
    fileWriter = open(str(fileCount)+".txt", "a")
    while line != '':  # empty is EOF
        if lineCount == 0:
            # create a file in append mode
            fileWriter = open(str(fileCount)+".txt", "a")
            # increment file count, use it for new file name
            fileCount += 1
        # write a line (readline() keeps the trailing newline)
        fileWriter.write(line)
        lineCount += 1
        if lineCount == fileLineCount:
            lineCount = 0
            fileWriter.close()
        # read a line
        line = fileReader.readline()
    fileWriter.close()
except Exception as e:
    # print the exception if any
    print(e.__traceback__)
    traceback.print_exc()
finally:
    # close the file reader
    fileReader.close()
The output will look like files, each having fileLineCount (i.e. 2000) lines, created in the same directory as:
1.txt
2.txt
3.txt
.
.
.
.
n.txt

Process not ending in python

I have a script to read a file that can be tens of gigabytes big, and I want to use multiprocessing to process it.
This is a compression algorithm where I want the user to define a buffer; then 3 processes will start: one to read the buffer amount of lines from the file and pass the lines to the processing process, which passes the processed lines to a process that writes the lines to a new file. I want all this to happen simultaneously, with each process waiting for the next bundle of lines.
I already have the script, but when I run it, it doesn't end. I think something is wrong with the processes. I think it has to do with the islice in my read function, but I don't know how to write it better.
import multiprocessing as mp
import time
from itertools import islice

def read(from_filename, buffer, process_queue):
    file = open(from_filename, 'r')
    slice = islice(file, buffer)
    while slice:
        to_process = []
        for line in slice:
            to_process.append(line)
        process_queue.put(to_process)
    process_queue.put('kill')

def write(to_filename, write_queue):
    to_file = open(to_filename, 'a+')
    while 1:
        to_write = write_queue.get()
        if to_write == 'kill':
            break
        to_file.write(to_write + '\n')

def compress(process_queue, write_queue):
    while 1:
        to_process = process_queue.get()
        if to_process == 'kill':
            write_queue.put('kill')
            break
        # process, put output in to_write
        write_queue.put(to_write)

def decompress(process_queue, write_queue):
    while 1:
        to_process = process_queue.get()
        if to_process == 'kill':
            write_queue.put('kill')
            break
        # process, put output in to_write
        write_queue.put(to_write)

def main():
    option = raw_input("C for Compress OR D for Decompress: ")
    from_file = raw_input("Enter input filename: ")
    buf = int(raw_input("Enter line buffer: "))
    to_file = raw_input("Enter output filename: ")
    start = time.time()
    write_queue = mp.Queue()
    process_queue = mp.Queue()
    reader = mp.Process(target=read, args=(from_file, buf, process_queue))
    writer = mp.Process(target=write, args=(to_file, write_queue))
    if option == 'c' or option == 'C':
        processor = mp.Process(target=compress, args=(process_queue, write_queue))
    elif option == 'd' or option == 'D':
        processor = mp.Process(target=decompress, args=(process_queue, write_queue))
    else:
        print "Invalid Options..."
    writer.start()
    processor.start()
    reader.start()
    reader.join()
    processor.join()
    writer.join()
    end = time.time()
    elapsed = (end - start)
    print "\n\nTotal Time Elapsed: " + str(elapsed) + " secs"

if __name__ == '__main__':
    main()
This is my first attempt at multiprocessing.
When I run it, it doesn't end. I think a process is stuck somewhere.
This piece of code is wrong:
def read(from_filename, buffer, process_queue):
    file = open(from_filename, 'r')
    slice = islice(file, buffer)
    while slice:
        to_process = []
        for line in slice:
            to_process.append(line)
        process_queue.put(to_process)
    process_queue.put('kill')
Since slice is an islice object, the condition while slice will always be true, so it's like having a while True there. You should re-create the islice object every time.
def read(from_filename, buffer, process_queue):
    file = open(from_filename, 'r')
    while True:
        slice = islice(file, buffer)
        to_process = []
        for line in slice:
            to_process.append(line)
        process_queue.put(to_process)
        if not to_process:
            # input ended
            break
    process_queue.put('kill')
Alternatively you could do:
def read_chunk(file, buffer):
    return [file.readline() for _ in xrange(buffer)]
    # or, "more" equivalent to using islice
    #return [line for i,line in itertools.izip(xrange(buffer), file)]

def read(from_filename, buffer, process_queue):
    file = open(from_filename, 'r')
    for to_process in iter(lambda: read_chunk(file, buffer), []):
        process_queue.put(to_process)
    process_queue.put('kill')
Note that it doesn't make sense to use itertools.islice if you have to build a list anyway.
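To make the same point concrete, here is a sketch of the reader without islice at all, in the Python 2 style of the question, building the per-chunk list directly:

def read(from_filename, buffer, process_queue):
    file = open(from_filename, 'r')
    while True:
        to_process = []
        for _ in xrange(buffer):
            line = file.readline()
            if not line:          # '' means end of file
                break
            to_process.append(line)
        if not to_process:
            break
        process_queue.put(to_process)
    process_queue.put('kill')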
