I am trying to open specific lines of multiple files and return those lines for each file. My solution is quite time-consuming. Do you have any suggestions?
func.filename: the name of the given file
func.start_line: the starting line in the given file
func.end_line: the finishing line in the given file
from sys import stderr

def method_open(func):
    try:
        body = open(func.filename).readlines()[func.start_line:func.end_line]
    except IOError:
        body = []
        stderr.write("\nCouldn't open the referenced method inside {0}".format(func.filename))
        stderr.flush()
    return body
Keep in mind that sometimes func.filename is the same file across calls, but unfortunately that is not the case most of the time.
The problem with readlines is that it reads the whole file into memory, and linecache does the same.
You can save some time by reading one line at a time and breaking the loop as soon as you reach func.end_line, but the best method I found is to use itertools.islice.
Here are the results of some tests I ran on a 130 MB file of ~9701k lines:
--- 1.43700003624 seconds --- f_readlines
--- 1.00099992752 seconds --- f_enumerate
--- 1.1400001049 seconds --- f_linecache
--- 0.0 seconds --- f_itertools_islice
Here is the script I used:
import time
import linecache
import itertools

def f_readlines(filename, start_line, endline):
    with open(filename) as f:
        f.readlines()[start_line:endline]

def f_enumerate(filename, start_line, endline):
    result = []
    with open(filename) as f:
        for i, line in enumerate(f):
            if i in range(start_line, endline):
                result.append(line)
            if i > endline:
                break

def f_linecache(filename, start_line, endline):
    result = []
    for n in range(start_line, endline):
        result.append(linecache.getline(filename, n))

def f_itertools_islice(filename, start_line, endline):
    result = []
    with open(filename) as f:
        selected = itertools.islice(f, start_line, endline)
        for line in selected:
            result.append(line)

def runtest(func_to_test):
    filename = "testlongfile.txt"
    start_line = 5000
    endline = 10000
    start_time = time.time()
    func_to_test(filename, start_line, endline)
    print("--- %s seconds --- %s" % ((time.time() - start_time), func_to_test.__name__))

runtest(f_readlines)
runtest(f_enumerate)
runtest(f_linecache)
runtest(f_itertools_islice)
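Applied to the helper from the question, an islice-based version could look like this (a sketch, assuming func still exposes the filename, start_line and end_line attributes described above):

from sys import stderr
from itertools import islice

def method_open(func):
    # sketch of an islice-based variant of the question's helper
    try:
        with open(func.filename) as f:
            # islice reads lazily and stops at end_line instead of
            # loading the whole file into memory
            body = list(islice(f, func.start_line, func.end_line))
    except IOError:
        body = []
        stderr.write("\nCouldn't open the referenced method inside {0}".format(func.filename))
        stderr.flush()
    return body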
Related
I'm learning Python 3, and I'm dealing with a huge txt file (~800 GB).
The enclosed function 'kmers_dic' reads the main file and, whenever the condition in the if statement is satisfied, appends the line to one of the previously created files (there are 1024 of them, named after the contents of the kmers variable). The function works fine with a subset of the principal file, but when I run the code on the main file my job is killed because I hit a memory usage limit.
import gzip
import sys

def OpenFiles(i):
    '''
    A switch to handle file opening and reduce duplicated code
    '''
    open_method = {
        "gz": gzip.open,
        "norm": open
    }
    return open_method[i]

def rows(f, chunksize=102400, sep='\n'):
    """
    Read a file where the row separator is '\n' lazily.
    Default chunk size: 102400 characters (~100 kB).
    Usage:
    >>> with open('big.csv') as f:
    >>>     for r in rows(f):
    >>>         process(r)
    """
    curr_row = ''
    while True:
        chunk = f.read(chunksize)
        if chunk == '':  # End of file
            break
        while True:
            i = chunk.find(sep)
            if i == -1:
                break
            yield curr_row + chunk[:i]
            curr_row = ''
            chunk = chunk[i+1:]
        curr_row += chunk
def kmers_dic(input_file, kmers, out_dir):
    '''
    file writing by kmers
    '''
    #kmers_dic = set()
    count_line = 0
    count_line_1 = 0
    if input_file.endswith('.gz'):
        nano_read = OpenFiles('gz')
    else:
        nano_read = OpenFiles('norm')
    with nano_read(input_file, 'rt') as nano_f:
        chunk = rows(nano_f, chunksize=2024, sep='\n')
        for line in chunk:
            count_line += 1
            count_line_1 += 1
            sys.stdout.write('%s\r' % count_line)
            sys.stdout.flush()
            line = line.strip('\n')
            line = line.split()
            if line[2] in kmers:
                kmer = line[2]
                Out_f_name = out_dir + line[2] + '.lib'
                file1 = open(Out_f_name, 'a')
                ##file1.write('\t'.join(line) + '\n')  # print entire line
                file1.write('\t'.join(line[1:4:] + line[6:9:] + line[9:13:] + line[15:]) + '\n')
                file1.close()
    print("lines: ", count_line_1)
I don't understand where the issue is. Can you help me?
Thanks in advance!
Best.
curr_row += chunk causes you to keep all chunks in memory until you run out of free memory.
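One way to avoid that (a sketch only, not the poster's code: kmers_dic_streaming is a hypothetical replacement that reuses OpenFiles and the same arguments as kmers_dic) is to drop the custom rows() helper and iterate the open file object directly, since a text-mode file handle already yields one line at a time without holding previous chunks:

import sys

def kmers_dic_streaming(input_file, kmers, out_dir):
    # hypothetical streaming variant of the question's kmers_dic
    nano_read = OpenFiles('gz') if input_file.endswith('.gz') else OpenFiles('norm')
    count_line = 0
    with nano_read(input_file, 'rt') as nano_f:
        for line in nano_f:  # lazy: one line at a time
            count_line += 1
            fields = line.strip('\n').split()
            if fields[2] in kmers:
                out_f_name = out_dir + fields[2] + '.lib'
                with open(out_f_name, 'a') as out_f:
                    out_f.write('\t'.join(fields[1:4] + fields[6:9] + fields[9:13] + fields[15:]) + '\n')
    print("lines:", count_line)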
I am fairly new to Python and I am trying to capture the last line of a syslog file, but have been unable to do so. This is a huge log file, so I want to avoid loading the complete file into memory. I just want to read the last line of the file and capture the timestamp for further analysis.
I have the code below, which captures all the timestamps into a Python dict and takes a really long time to reach the last timestamp. Once it completed, my plan was to reverse the list and capture the first object at index [0].
The lastFile function uses the glob module and gives me the most recent log file name, which is fed into recentEdit in the main function.
Is there a better way of doing this?
Script1:
#!/usr/bin/python
import glob
import os
import re

def main():
    syslogDir = (r'Location/*')
    listOfFiles = glob.glob(syslogDir)
    recentEdit = lastFile(syslogDir)
    print(recentEdit)

    astack = []
    with open(recentEdit, "r") as f:
        for line in f:
            result = [re.findall(r'\d{4}.\d{2}.\d{2}T\d{2}.\d{2}.\d{2}.\d+.\d{2}.\d{2}', line)]
            print(result)

def lastFile(i):
    listOfFiles = glob.glob(i)
    latestFile = max(listOfFiles, key=os.path.getctime)
    return(latestFile)

if __name__ == '__main__': main()
Script2:
###############################################################################
###############################################################################
# The readline() gives me the first line of the log file, which is also not what I am looking for:

#!/usr/bin/python
import glob
import os
import re

def main():
    syslogDir = (r'Location/*')
    listOfFiles = glob.glob(syslogDir)
    recentEdit = lastFile(syslogDir)
    print(recentEdit)

    with open(recentEdit, "r") as f:
        fLastLine = f.readline()
        print(fLastLine)

    # astack = []
    # with open(recentEdit, "r") as f:
    #     for line in f:
    #         result = [re.findall(r'\d{4}.\d{2}.\d{2}T\d{2}.\d{2}.\d{2}.\d+.\d{2}.\d{2}', line)]
    #         print(result)

def lastFile(i):
    listOfFiles = glob.glob(i)
    latestFile = max(listOfFiles, key=os.path.getctime)
    return(latestFile)

if __name__ == '__main__': main()
I really appreciate your help!!
Sincerely.
If you want to go directly to the end of the file, follow these steps:
1. Every time your program runs, persist (store) the index of the last '\n'.
2. If you have persisted the index of the last '\n', you can seek directly to that index using file.seek(yourpersistedindex).
3. After this, when you call file.readline(), you will get the line starting from yourpersistedindex.
4. Store this index every time you run your script.
For example, your file log.txt has content like:
timestamp1 \n
timestamp2 \n
timestamp3 \n
import pickle

lastNewLineIndex = None
# here trying to read the lastNewLineIndex
try:
    rfile = open('pickledfile', 'rb')
    lastNewLineIndex = pickle.load(rfile)
    rfile.close()
except:
    pass

logfile = open('log.txt', 'r')
newLastNewLineIndex = None

if lastNewLineIndex:
    # seek(index) will take the file pointer to the index
    logfile.seek(lastNewLineIndex)
    # will read the line starting from the index we provided in the seek function
    lastLine = logfile.readline()
    print(lastLine)
    # tell() gives you the current index
    newLastNewLineIndex = logfile.tell()
    logfile.close()
else:
    counter = 0
    text = logfile.read()
    for c in text:
        if c == '\n':
            newLastNewLineIndex = counter
        counter += 1

# here saving the new LastNewLineIndex
wfile = open('pickledfile', 'wb')
pickle.dump(newLastNewLineIndex, wfile)
wfile.close()
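A different approach, not part of the answer above, is to seek backwards from the end of the file until a newline is found; this avoids both persisting an index and reading the whole file. A minimal sketch ('log.txt' is just a placeholder path):

import os

def read_last_line(path):
    # walk backwards from the end of the file until the previous '\n'
    # (ignoring a trailing newline), then read the final line
    with open(path, 'rb') as f:
        f.seek(0, os.SEEK_END)
        pos = f.tell()
        if pos == 0:
            return ''  # empty file
        f.seek(-1, os.SEEK_END)
        if f.read(1) == b'\n':  # skip a trailing newline, if any
            pos -= 1
        while pos > 0:
            f.seek(pos - 1)
            if f.read(1) == b'\n':
                break
            pos -= 1
        f.seek(pos)
        return f.read().decode('utf-8', 'replace').rstrip('\n')

print(read_last_line('log.txt'))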
I have a huge XML file (almost 5 GB). I try to search throughout the file, find some tags and rename them. I used the same idea as here to chunk the file into 10-megabyte chunks, search through each chunk, and if that chunk contains the search item, send the chunk to another helper that reads the chunk line by line and replaces the tag. It does not work! It seems that when it tries to merge the queues and write the file back, it does not work and the result file starts from somewhere arbitrary.
import re, threading, Queue

FILE_R = "C:\\Users\\USOMZIA\Desktop\\ABB_Work\\ERCOT\\Modifying_cim_model\\omid2.xml"
FILE_WR = "C:\\Users\\USOMZIA\Desktop\\ABB_Work\\ERCOT\\Modifying_cim_model\\x3.xml"

def get_chunks(file_r, size=1024 * 1024):
    with open(file_r, 'rb') as f:
        while 1:
            start = f.tell()
            f.seek(size, 1)
            s = f.readline()
            yield start, f.tell() - start
            if not s:
                break

def process_line_by_line(file_r, chunk):
    with open(file_r, "rb") as f:
        f.seek(chunk[0])
        read_line_list = []
        for line_f in f.read(chunk[1]).splitlines():
            find_match = False
            for match_str in mapp:
                if match_str in str(line_f):
                    find_match = True
                    new_line = str(line_f).replace(match_str, mapp[match_str])
                    read_line_list.append(new_line)
                    break
            if not find_match:
                read_line_list.append(str(line_f))
    return read_line_list

def process(file_r, chunk):
    read_group_list = []
    with open(file_r, "r") as f:
        f.seek(chunk[0])
        s = f.read(chunk[1])
        if len(pattern.findall(s)) > 0:
            read_group_list = process_line_by_line(file_r, chunk)
        else:
            read_group_list = f.read(chunk[1]).splitlines()
    return read_group_list

class Worker(threading.Thread):
    def run(self):
        while 1:
            chunk = queue.get()
            if chunk is None:
                break
            result.append(process(*chunk))
            queue.task_done()
import time, sys

start_time = time.time()
pattern_list = []
mapp = {"cim:ConformLoad rdf:ID": "cim:CustomerLoad rdf:ID", "cim:Load rdf:ID": "cim:CustomerLoad rdf:ID", "cim:NonConformLoad rdf:ID": "cim:CustomerLoad rdf:ID",
        "cim:InductionMotorLoad rdf:ID": "cim:CustomerLoad rdf:ID", "cim:NonConformLoadGroup rdf:ID": "cim:ConformLoadGroup rdf:ID",
        "cim:NonConformLoad.LoadGroup": "cim:ConformLoad.LoadGroup",
        "/cim:ConformLoad>": "/cim:CustomerLoad>", "/cim:Load>": "/cim:CustomerLoad>", "/cim:NonConformLoad>": "/cim:CustomerLoad>",
        "/cim:InductionMotorLoad>": "/cim:CustomerLoad>", "/cim:NonConformLoadGroup>": "/cim:ConformLoadGroup>"}

reg_string = ""
for key in mapp:
    reg_string = reg_string + key + "|"
# to delete the last |
reg_string = list(reg_string)[:-1]
reg_string = ''.join(reg_string)
pattern = re.compile(r"cim:%s.*" % reg_string)
# This makes it faster than writing an mo = pattern.search(line) in the loop
search = pattern.search

queue = Queue.Queue()
result = []
# Start the multithread
for i in range(1):
    w = Worker()
    w.setDaemon(1)
    w.start()

chunks = get_chunks(FILE_R, 10 * 1024 * 1024)
for chunk in chunks:
    print chunk
    queue.put((FILE_R, chunk))
queue.join()

with open(FILE_WR, "w") as f:
    for file_chunk in range(len(result)):
        for line in result[file_chunk]:
            f.write("%s\n" % line)
print time.time() - start_time
So, I think the problem is that when the jobs in the queue get done they are somehow not in order, and as a result the output is not synchronized. Is there any way I can synchronize them?
Thank you for the help!
I think I found what the problem is:
read_group_list = f.read(chunk[1]).splitlines()
This line in the process function creates the problem: the chunk has already been read into s, so the file pointer has moved past it, and this second read returns the following chunk instead. After I replaced it with:
read_group_list = s.splitlines()
it now gives me the correct file.
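For reference, here is the process function with that one-line change applied (everything else exactly as in the question):

def process(file_r, chunk):
    read_group_list = []
    with open(file_r, "r") as f:
        f.seek(chunk[0])
        s = f.read(chunk[1])  # the chunk is read once, here
        if len(pattern.findall(s)) > 0:
            read_group_list = process_line_by_line(file_r, chunk)
        else:
            # reuse s; a second f.read(chunk[1]) would return the *next*
            # chunk, because the file pointer has already advanced
            read_group_list = s.splitlines()
    return read_group_list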
I have a script to read a file that can be tens of gigs big, and I want to use multiprocessing to process it.
This is a compression algorithm where I want the user to define a buffer; then 3 processes will start: one reads the buffer amount of lines from the file and passes them to the processing process, which passes the processed lines to a process that writes them to a new file. I want all this to happen simultaneously, and for each process to wait for the next bundle of lines.
I already have the script, but when I run it, it doesn't end. I think something is wrong with the processes. I think it has to do with the islice in my read function, but I don't know how to write it better.
import multiprocessing as mp
import time
from itertools import islice

def read(from_filename, buffer, process_queue):
    file = open(from_filename, 'r')
    slice = islice(file, buffer)
    while slice:
        to_process = []
        for line in slice:
            to_process.append(line)
        process_queue.put(to_process)
    process_queue.put('kill')

def write(to_filename, write_queue):
    to_file = open(to_filename, 'a+')
    while 1:
        to_write = write_queue.get()
        if to_write == 'kill':
            break
        to_file.write(to_write + '\n')

def compress(process_queue, write_queue):
    while 1:
        to_process = process_queue.get()
        if to_process == 'kill':
            write_queue.put('kill')
            break
        # process, put output in to_write
        write_queue.put(to_write)

def decompress(process_queue, write_queue):
    while 1:
        to_process = process_queue.get()
        if to_process == 'kill':
            write_queue.put('kill')
            break
        # process, put output in to_write
        write_queue.put(to_write)

def main():
    option = raw_input("C for Compress OR D for Decompress: ")
    from_file = raw_input("Enter input filename: ")
    buf = int(raw_input("Enter line buffer: "))
    to_file = raw_input("Enter output filename: ")

    start = time.time()

    write_queue = mp.Queue()
    process_queue = mp.Queue()

    reader = mp.Process(target=read, args=(from_file, buf, process_queue))
    writer = mp.Process(target=write, args=(to_file, write_queue))
    if option == 'c' or option == 'C':
        processor = mp.Process(target=compress, args=(process_queue, write_queue))
    elif option == 'd' or option == 'D':
        processor = mp.Process(target=decompress, args=(process_queue, write_queue))
    else:
        print "Invalid Options..."

    writer.start()
    processor.start()
    reader.start()

    reader.join()
    processor.join()
    writer.join()

    end = time.time()
    elapsed = (end - start)
    print "\n\nTotal Time Elapsed: " + str(elapsed) + " secs"

if __name__ == '__main__':
    main()
This is my first attempt at multiprocessing.
When I run it, it doesn't end. I think a process is stuck somewhere.
This piece of code is wrong:
def read(from_filename, buffer, process_queue):
    file = open(from_filename, 'r')
    slice = islice(file, buffer)
    while slice:
        to_process = []
        for line in slice:
            to_process.append(line)
        process_queue.put(to_process)
    process_queue.put('kill')
Since slice is an islice object, the condition while slice will always be true, so it's like having a while True there. You should re-create the islice object every time:
def read(from_filename, buffer, process_queue):
    file = open(from_filename, 'r')
    while True:
        slice = islice(file, buffer)
        to_process = []
        for line in slice:
            to_process.append(line)
        process_queue.put(to_process)
        if not to_process:
            # input ended
            break
    process_queue.put('kill')
Alternatively you could do:
def read_chunk(file, buffer):
    # readline() returns '' at EOF, so drop empty strings to ensure the
    # chunk eventually equals [] and the iter() sentinel below is reached
    lines = [file.readline() for _ in xrange(buffer)]
    return [line for line in lines if line]
    # or, "more" equivalent to using islice:
    #return [line for i, line in itertools.izip(xrange(buffer), file)]

def read(from_filename, buffer, process_queue):
    file = open(from_filename, 'r')
    for to_process in iter(lambda: read_chunk(file, buffer), []):
        process_queue.put(to_process)
    process_queue.put('kill')
Note that it doesn't make sense to use itertools.islice if you have to build a list anyway.
I am trying to use "from itertools import islice" in order to read a number of lines at a time from a *.las file using the liblas module. (my goal is reading chunk-bychunk)
following the question: Python how to read N number of lines at a time
islice() can be used to get the next n items of an iterator. Thus, list(islice(f, n)) will return a list of the next n lines of the file f. Using this inside a loop will give you the file in chunks of n lines. At the end of the file, the list might be shorter, and finally the call will return an empty list.
I used the following code:

from numpy import nonzero
from liblas import file as lasfile
from itertools import islice

chunkSize = 1000000
f = lasfile.File(inFile, None, 'r')  # open LAS
while True:
    chunk = list(islice(f, chunkSize))
    if not chunk:
        break
    # do other stuff
but I have this problem:

>>> len(f)
2866390
>>> chunk = list(islice(f, 1000000))
>>> len(chunk)
1000000
>>> chunk = list(islice(f, 1000000))
>>> len(chunk)
1000000
>>> chunk = list(islice(f, 1000000))
>>> len(chunk)
866390
>>> chunk = list(islice(f, 1000000))
>>> len(chunk)
1000000
When the file f reaches the end, islice restarts reading the file from the beginning.
Thanks for any suggestions and help. It's very much appreciated.
It seems like it would be easy enough to write a generator to yield n lines at a time:
def n_line_iterator(fobj, n):
    if n < 1:
        raise ValueError("Must supply a positive number of lines to read")
    out = []
    num = 0
    for line in fobj:
        if num == n:
            yield out  # yield 1 chunk
            num = 0
            out = []
        out.append(line)
        num += 1
    yield out  # need to yield the rest of the lines
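A possible usage sketch with the liblas file object from the question (inFile is as in the question, and process_chunk is a hypothetical placeholder for whatever per-chunk work you do):

from liblas import file as lasfile

f = lasfile.File(inFile, None, 'r')
for chunk in n_line_iterator(f, 1000000):
    # chunk is a list of up to 1,000,000 points
    process_chunk(chunk)  # hypothetical placeholder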
Change the source code of file.py that belongs to the liblas package. Currently __iter__ is defined as (src on github):
def __iter__(self):
    """Iterator support (read mode only)

    >>> points = []
    >>> for i in f:
    ...     points.append(i)
    ...     print i  # doctest: +ELLIPSIS
    <liblas.point.Point object at ...>
    """
    if self.mode == 0:
        self.at_end = False
        p = core.las.LASReader_GetNextPoint(self.handle)
        while p and not self.at_end:
            yield point.Point(handle=p, copy=True)
            p = core.las.LASReader_GetNextPoint(self.handle)
        if not p:
            self.at_end = True
        else:
            self.close()
            self.open()
You can see that when the file is at its end it is closed and opened again, so iteration starts over at the beginning of the file.
Try removing the last else block after the while, so the right code for the method should be:
def __iter__(self):
    """Iterator support (read mode only)

    >>> points = []
    >>> for i in f:
    ...     points.append(i)
    ...     print i  # doctest: +ELLIPSIS
    <liblas.point.Point object at ...>
    """
    if self.mode == 0:
        self.at_end = False
        p = core.las.LASReader_GetNextPoint(self.handle)
        while p and not self.at_end:
            yield point.Point(handle=p, copy=True)
            p = core.las.LASReader_GetNextPoint(self.handle)
        if not p:
            self.at_end = True