read file block by block in python - python

I have file that has this format
MACRO L20_FDPQ_TV1T8_1
FIXEDMASK ;
CLASS CORE ;
...
...
END L20_FDPQ_TV1T8_1
MACRO INV_20
...
...
END INV_20
I want to read the file as blocks such that each MACRO to end of its name form a block in python. I tried to use this
# Read the file and walk it line by line; each "MACRO <name>" opens a
# block that runs until the matching "END <name>" line.
with open(file_name, "r") as f_read:
    lines = f_read.readlines()

num = 0
while num < len(lines):
    line = lines[num]
    if re.search(r"MACRO\s+", line, re.IGNORECASE):
        macro_name = line.split()[1]
        # re.escape guards against regex metacharacters in the macro name,
        # and \b (instead of \s+) also matches an END at the start of a line.
        end_re = re.compile(r"\bEND\s+%s\b" % re.escape(macro_name), re.IGNORECASE)
        # The bounds check prevents an IndexError when the END line is missing.
        while end_re.search(line) is None and num + 1 < len(lines):
            line = lines[num + 1]
            # ...do something...
            num = num + 1
    num += 1
how this can be done in an efficient way?

Assuming that you cannot nest macros, and that macros always start with "MACRO [name]" and end with "END [name]":
# read the contents of the file
with open(file_name, "r") as f_read:
    lines = f_read.readlines()

macro_name = None  # name of the macro currently being accumulated, if any
current_macro_lines = []
for line in lines:
    if line.startswith("MACRO"):
        macro_name = line.split()[1]
    # if line starts with END and refers to the current macro name
    # (macro_name starts as None, so an END seen before any MACRO is ignored
    # instead of raising NameError)
    elif line.startswith("END") and line.split()[1] == macro_name:
        # here the macro is complete,
        # put the code you want to execute when you find the end of the macro
        # then when you finish processing the lines,
        # empty them so you can accumulate the next macro
        current_macro_lines = []
        macro_name = None  # reset so stray END lines don't match a stale name
    else:
        # here you can accumulate the macro lines
        current_macro_lines.append(line)

Related

python3: Read huge file (~ 800 Gb), split the lines by condition, and append them to the end of new files in a memory-efficient way

I’m learning python 3, and I’m dealing with a huge txt file (~800Gb).
The enclosed function 'kmers_dic' reads the main file and, whenever the condition in its if statement is satisfied, appends the line to one of the previously created files (there are 1024 of these files, named after the contents of the kmers variable). The function works fine with a subset of the principal file, but when I run the code on the main file, my job is killed because I reach the memory usage limit.
def OpenFiles(i):
    '''
    A switch to handle file opening and reduce duplicated code.

    Returns the opener callable for the given key: gzip.open for "gz",
    the builtin open for "norm".
    '''
    # Inline dispatch table: key -> opener callable.
    return {"gz": gzip.open, "norm": open}[i]
def rows(f, chunksize=102400, sep='\n'):
    """
    Lazily yield rows of a file delimited by *sep* (default newline).

    The file is pulled in fixed-size chunks so only one chunk plus any
    partial row is held in memory at a time.
    Default chunk size: 102400 characters (~100 KB for ASCII text).
    Usage:
    >>> with open('big.csv') as f:
    >>>     for r in rows(f):
    >>>         process(r)
    """
    curr_row = ''
    while True:
        chunk = f.read(chunksize)
        if chunk == '':  # End of file
            # BUG FIX: a final row with no trailing separator was
            # previously discarded; emit it before finishing.
            if curr_row:
                yield curr_row
            break
        while True:
            i = chunk.find(sep)
            if i == -1:
                break
            yield curr_row + chunk[:i]
            curr_row = ''
            chunk = chunk[i + 1:]
        # keep the partial row left over after the last separator
        curr_row += chunk
def kmers_dic(input_file, kmers, out_dir):
    '''
    Split *input_file* into per-kmer library files.

    Each line whose third whitespace-separated field is in *kmers* has
    selected columns appended to ``<out_dir><kmer>.lib``.  The input is
    streamed through rows() so only one chunk is held in memory at once.
    Progress (line count) is written to stdout on a single line.
    '''
    count_line = 0
    count_line_1 = 0
    # pick a gzip-aware opener based on the file extension
    if input_file.endswith('.gz'):
        nano_read = OpenFiles('gz')
    else:
        nano_read = OpenFiles('norm')
    with nano_read(input_file, 'rt') as nano_f:
        chunk = rows(nano_f, chunksize=2024, sep='\n')
        for line in chunk:
            count_line += 1
            count_line_1 += 1
            sys.stdout.write('%s\r' % count_line)
            sys.stdout.flush()
            line = line.strip('\n').split()
            # guard against short lines instead of raising IndexError
            if len(line) > 2 and line[2] in kmers:
                out_f_name = out_dir + line[2] + '.lib'
                # 'with' guarantees the handle is closed even if write fails
                with open(out_f_name, 'a') as file1:
                    file1.write('\t'.join(line[1:4] + line[6:9] + line[9:13] + line[15:]) + '\n')
    print("lines: ", count_line_1)
I'm not understanding where is the issue.
Can you help me ?
Thanks in advance!
Best.
curr_row += chunk causes you to keep accumulating chunks in memory until you run out of free memory.

Extract blocks of text that starts with "Start Text" until it encounters another "Start Text"

Here is the code I have to extract blocks of text of a file that starts with "Start Text" until it encounters another "Start Text".
# NOTE(review): as posted, the buffer is flushed immediately AFTER the new
# header line is appended, so each output file gets the PREVIOUS block's
# body lines plus the NEXT block's header, and the lines after the last
# header are never written at all.
with open('temp.txt', "r") as f:
buff = []
i = 1
for line in f:
if line.strip(): # skips the empty lines
buff.append(line)
if line.startswith("Start Text"):
output = open('file' + '%d.txt' % i, 'w')
output.write(''.join(buff))
output.close()
i += 1
buff = [] # buffer reset
INPUT: "temp.txt" has the following structure:
Start Text - ABCD
line1
line2
line3
Start Text - EFG
line4
Start Text - P3456
line5
line6
DESIRED OUTPUT: I am trying to create multiple text files below with extracted blocks of texts.
file1.txt
Start Text - ABCD
line1
line2
line3
file2.txt
Start Text - EFG
line4
file3.txt
Start Text - P3456
line5
line6
UNDESIRED OUTPUT (What the code produces)
file1.txt
Start Text - ABCD
file2.txt
Start Text - EFG
line1
line2
line3
file3.txt
line4
Start Text - P3456
Here is the issue I am facing. The code creates three files but does not write “Start Text” lines into their respective text blocks. I am not sure what I am missing. I will appreciate any pointers.
When the code sees "Start Text" in a line, it writes that line and all the previous lines to the output file.
This explains why the first output file contains only the header -- that is the first line in the input file, so obviously there aren't any previous lines.
It seems like what you really want is for the header and the following lines to be written.
I've updated your code to not write a file after seeing the very first header, and also to write a file after the input file is exhausted.
# Split temp.txt into numbered files, one per "Start Text" section.
buff = []
i = 1
with open('temp.txt', "r") as f:
    for line in f:
        if line.startswith("Start Text"):
            # write a file only if buff isn't empty. (if it is
            # empty, this must be the very first header, so we
            # don't need to write an output file yet)
            if buff:
                # 'with' closes the output file even if write() raises
                with open('file' + '%d.txt' % i, 'w') as output:
                    output.write(''.join(buff))
                i += 1
            buff = []  # buffer reset
        if line.strip():
            buff.append(line)

# write the final section
if buff:
    with open('file' + '%d.txt' % i, 'w') as output:
        output.write(''.join(buff))
You're almost there. Notice that when the startswith() check succeeds, you write out the buffer and then clear it. But the header line that triggered the check was never stored — as the loop moves on, that line is lost. Try adding it to the new buffer for the next round of lines.
...
buff = [] # buffer reset
buff.append(line) # add 'Start Text' line to next buffer
Note that your code currently will never write out the final block of text. Consider to write out the last buffer as well (i.e., when no line is left).
[EDIT after question edit]
As the other answer replies, the check for startswith() causes a write to file after the line is found. However, the line has already been added to the buffer. Try switching the statements, to first detect the startswith, then write everything out if it was the case (if the buffer is not empty!), then continue by adding the line to the buffer.
(the note still stands)
def parse_file(fname):
    """Yield each "Start Text ..." block of *fname* as one string.

    A block is a header line starting with "Start Text" plus every
    following non-empty line up to (not including) the next header.
    Empty lines are skipped.
    """
    with open(fname, "r") as f:
        buff = []
        for line in f:
            if line.strip():  # skips the empty lines
                if line.startswith("Start Text") and buff:
                    yield ''.join(buff)
                    buff = []
                buff.append(line)
        # BUG FIX: flush the last block, which the loop above never yields
        if buff:
            yield ''.join(buff)
# Write every block parsed from sample.txt to its own numbered file.
blocks = parse_file('sample.txt')
for number, text in enumerate(blocks, start=1):
    out_path = f'file{number}.txt'
    with open(out_path, 'w') as out_f:
        out_f.write(text)
I don't think you need to build a buffer. You can just process line by line as you iterate over the input file.
class MyTempFile():
    """Context manager that streams writes into rotating numbered files.

    next_file() switches output to a fresh 'temp<i>.txt'; write() drops
    blank-only data and anything written before the first rotation.
    """

    def __init__(self):
        self.fd = None        # currently open output file, if any
        self.newfile = None   # lazily created rotation generator

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.closefd()
        self.newfile = None

    def closefd(self):
        # Close and forget the current output file, if one is open.
        if self.fd is None:
            return
        self.fd.close()
        self.fd = None

    def newfile_impl(self):
        # Infinite generator: each next() closes the previous file and
        # opens temp1.txt, temp2.txt, ... in turn.
        counter = 0
        while True:
            self.closefd()
            counter += 1
            self.fd = open(f'temp{counter}.txt', 'w')
            yield

    def write(self, data):
        # Ignore writes before the first rotation and blank-only data.
        if self.fd is None:
            return
        if data.strip():
            self.fd.write(data)

    def next_file(self):
        # Create the rotation generator on first use, then advance it.
        if self.newfile is None:
            self.newfile = self.newfile_impl()
        next(self.newfile)
# Split temp.txt into numbered files, rotating at every 'Start Text' header.
with MyTempFile() as mtf, open('temp.txt') as infile:
    for current in infile:
        if current.startswith('Start Text'):
            mtf.next_file()
        mtf.write(current)

What is wrong with this code? I'm trying to insert this file

I am trying to insert a file and I keep getting a syntax error on the line line = infile.redline()
def main():
# Declare variables
line = ''
counter = 0
# Prompt for file name
fileName = input('Enter the name of the file: ')
# Open the specified file for reading
# NOTE(review): fileName is ignored here; 'test.txt' is hard-coded.
infile = open('test.txt', 'r')
# Priming read
# BUG: 'redline' is a typo for 'readline' (the reported syntax error)
line = infile.redline()
counter = 1
# Read in and display first five lines
while line != '' and counter <= 5:
# Strip '\n'
# BUG: 'rtrip' is a typo for 'rstrip'
line = line.rtrip('\n')
print(line)
# BUG: the first character of the name below is the digit one, not 'l'
1ine = infile.readline()
# Update counter when line is read
counter +=1
# Close file
infile.close()
# Call the main function.
main()
rtrip should be rstrip. redline should be readline. infile.close() should be indented, and main() should not be.
However, the most serious problem is here:
1ine = infile.readline()
That first character is a one, not an L.
Knowing the standard libraries can make your life much simpler!
from itertools import islice

def main():
    """Prompt for a file name and print its first five lines."""
    fname = input('Enter the name of the file: ')
    with open(fname) as inf:
        # islice stops after five lines without reading the rest of the file
        first_five = islice(inf, 5)
        for line in first_five:
            print(line.rstrip())

if __name__ == "__main__":
    main()
It is not redline but readline:
line = infile.redline()

Optimizing python file search?

I'm having some trouble optimizing this part of code.
It works, but seems unnecessary slow.
The function searches after a searchString in a file starting on line line_nr and returns the line number for first hit.
import linecache
# NOTE(review): a parameter without a default (linesInFile) may not follow
# one with a default (line_nr) -- as posted this def line is a SyntaxError.
def searchStr(fileName, searchString, line_nr = 1, linesInFile):
# The above string is the input to this function
# line_nr is needed to search after certain lines.
# linesInFile is total number of lines in the file.
while line_nr < linesInFile + 1:
line = linecache.getline(fileName, line_nr)
has_match = line.find(searchString)
if has_match >= 0:
return line_nr
# NOTE(review): this break is unreachable -- the return above exits first.
break
line_nr += 1
I've tried something along these lines, but never managed to implement the "start on a certain line number"-input.
Edit: The usecase. I'm post processing analysis files containing text and numbers that are split into different sections with headers. The headers on line_nr are used to break out chunks of the data for further processing.
Example of call:
startOnLine = searchStr(fileName, 'Header 1', 1, 10000000):
endOnLine = searchStr(fileName, 'Header 2', startOnLine, 10000000):
Why don't you start with simplest possible implementation ?
def search_file(filename, target, start_at = 0):
    """Return the 0-based number of the first line at or after *start_at*
    that contains *target*, or None when there is no match."""
    with open(filename) as infile:
        for line_no, line in enumerate(infile):
            # ignore everything before the requested starting line
            if line_no < start_at:
                continue
            if target in line:
                return line_no
    return None
I guess your file is like:
Header1 data11 data12 data13..
name1 value1 value2 value3...
...
...
Header2 data21 data22 data23..
nameN valueN1 valueN2 valueN3..
...
Does the 'Header' string have any constant format (e.g., do all headers start with '#')? If so, you can read each line directly, check whether it matches that format (e.g., if line[0] == '#'), and run different code for the different kinds of lines (definition lines and data lines in your example).
Record class:
# One parsed section: a '#' header line plus its following data lines.
class Record:
def __init__(self):
self.data={}
self.header={}
def set_header(self, line):
# Parse the header line into self.header (body left as an exercise).
...
def add_data(self, line):
# Parse one data line into self.data (body left as an exercise).
...
iterate part:
def parse(p_file):
    """Yield one Record per '#'-headed section of the line iterable *p_file*.

    Assumes the first non-empty line is a header; a data line before any
    header would call add_data on None (same as the original).
    """
    record = None
    for line in p_file:
        if line[0] == "#":
            # New header: emit the finished record (if any), then start a
            # fresh one.  BUG FIX: the original only created a Record for
            # the very first header -- after yielding, set_header was
            # skipped and no new Record was ever started.
            if record:
                yield record
            record = Record()
            record.set_header(line)
        else:
            record.add_data(line)
    # flush the last section; guard avoids yielding None for empty input
    if record:
        yield record
main func:
# Example driver: iterate over the parsed records of a data file.
# NOTE(review): '...' is a placeholder for the real path / per-record work.
data_file = open(...)
for rec in parse(data_file):
...

file programming in python

# NOTE(review): Python 2-style snippet with two errors: str has no
# .contains() method (use: '...' in line), and 'print found' prints a
# name 'found' that is never defined.  The '\U' in the path is also an
# invalid escape under Python 3.
read_file = open ('C:\Users\Mahya\Desktop\\automate\Autosupports\\at1.txt','r')
content = read_file.readlines()
for line in content:
if line.contains('===== BOOT TIME STATS ====='):
print found
I want to read '===== BOOT TIME STATS =====' this line and print the lines that are below till next line
please help
I'm guessing you want to print the lines between the first and second occurrences of the given string:
# Print the lines between the first and second occurrences of the marker
# (the first marker line itself is printed; the second is not).
# Fixes vs. the original: str has no .contains() (use 'in'); the handle is
# closed via 'with'; doubled backslashes keep the path valid in Python 3.
with open('C:\\Users\\Mahya\\Desktop\\automate\\Autosupports\\at1.txt', 'r') as read_file:
    content = read_file.readlines()

found = False
for line in content:
    if '===== BOOT TIME STATS =====' in line:
        if found:
            break  # Exit the for loop when finding the string for the second time
        found = True
    if found:
        print(line)
Without testing:
# Without testing:
# Print "found" at the marker and every line from the marker onward.
# Fixes vs. the original: Python 3 print calls, 'with' closes the handle,
# doubled backslashes keep the path literal valid in Python 3.
with open('C:\\Users\\Mahya\\Desktop\\automate\\Autosupports\\at1.txt', 'r') as read_file:
    content = read_file.readlines()

i_found_the_block = False
for line in content:
    if "===== BOOT TIME STATS =====" in line:
        print("found")
        i_found_the_block = True
    if i_found_the_block:
        print(line)
# Stream the file: skip everything up to the marker line, then print until
# the stop phrase.  Iterating the same handle twice resumes where the first
# loop stopped.  Fixes vs. the original: Python 3 print, 'with' for cleanup,
# no shadowing of the builtin name 'file', valid path escapes.
with open('C:\\Users\\Mahya\\Desktop\\automate\\Autosupports\\at1.txt', 'r') as fh:
    for line in fh:
        if '===== BOOT TIME STATS =====' in line:
            break
    for line in fh:
        if 'i wanna stop here' in line:
            break
        print(line)
def lineRange(lines, start, end):
    """Return all lines from the first line containing start
    up to (but not including) the first ensuing line containing end
    """
    it = iter(lines)
    started = False
    # skip ahead to the first line mentioning `start`; yield it as well
    for line in it:
        if start in line:
            yield line
            started = True
            break
    if not started:
        return
    # emit everything up to (excluding) the first line mentioning `end`
    for line in it:
        if end in line:
            return
        yield line
def main():
    """Print the '===== BOOT TIME STATS =====' section of the support file.

    Fixes vs. the original: Python 3 syntax -- generator .next() became
    next(), and print is a function.
    """
    fname = 'C:/Users/Mahya/Desktop/automate/Autosupports/at1.txt'
    start = '===== BOOT TIME STATS ====='
    end = start  # next section header?
    with open(fname) as inf:
        lr = lineRange(inf, start, end)
        try:
            next(lr)  # skip header
            print(''.join(lr))
        except StopIteration:
            print('Start string not found')

if __name__ == "__main__":
    main()

Categories

Resources