Optimizing Python file search?

I'm having some trouble optimizing this piece of code.
It works, but it seems unnecessarily slow.
The function searches for searchString in a file, starting on line line_nr, and returns the line number of the first hit.
import linecache

def searchStr(fileName, searchString, line_nr, linesInFile):
    # searchString is the string to look for
    # line_nr is the line to start searching from (needed to search after certain lines)
    # linesInFile is the total number of lines in the file
    while line_nr < linesInFile + 1:
        line = linecache.getline(fileName, line_nr)
        has_match = line.find(searchString)
        if has_match >= 0:
            return line_nr
        line_nr += 1
I've tried something along these lines, but never managed to implement the "start on a certain line number" input.
Edit: the use case. I'm post-processing analysis files containing text and numbers that are split into different sections with headers. The header line numbers are used to break out chunks of the data for further processing.
Example calls:
startOnLine = searchStr(fileName, 'Header 1', 1, 10000000)
endOnLine = searchStr(fileName, 'Header 2', startOnLine, 10000000)

Why don't you start with the simplest possible implementation?
def search_file(filename, target, start_at=0):
    with open(filename) as infile:
        for line_no, line in enumerate(infile):
            if line_no < start_at:
                continue
            if line.find(target) >= 0:
                return line_no
    return None
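One small note: enumerate counts lines from 0 here, while the searchStr calls in the question count from 1. A minimal sketch of the same approach with 1-based line numbers (the function name, file name, and header strings below are placeholders, not from the question):
def search_file_1based(filename, target, start_at=1):
    with open(filename) as infile:
        # enumerate(..., start=1) makes line numbers 1-based, like linecache
        for line_no, line in enumerate(infile, start=1):
            if line_no >= start_at and target in line:
                return line_no
    return None

start_on_line = search_file_1based('analysis.txt', 'Header 1')
end_on_line = search_file_1based('analysis.txt', 'Header 2', start_on_line)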

I guess your file is like:
Header1 data11 data12 data13..
name1 value1 value2 value3...
...
...
Header2 data21 data22 data23..
nameN valueN1 valueN2 valueN3..
...
Does the 'Header' string contain any constant format (e.g. all headers start with '#' or something similar)? If so, you can read the lines directly, check whether a line matches this format (e.g. if line[0] == '#'), and write different code for the different kinds of lines (definition lines and data lines in your example).
Record class:
class Record:
    def __init__(self):
        self.data = {}
        self.header = {}
    def set_header(self, line):
        ...
    def add_data(self, line):
        ...
iteration part:
def parse(p_file):
    record = None
    for line in p_file:
        if line[0] == "#":
            if record:
                yield record
            record = Record()
            record.set_header(line)
        else:
            record.add_data(line)
    yield record
main function:
data_file = open(...)
for rec in parse(data_file):
    ...
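For concreteness, here is a minimal sketch of what set_header and add_data might do, assuming header lines start with '#' followed by a section name and column names, and data lines start with a row name; the exact format is an assumption, not taken from the question:
class Record:
    def __init__(self):
        self.data = {}
        self.header = {}

    def set_header(self, line):
        # e.g. "# Header1 data11 data12 data13"
        fields = line.lstrip('#').split()
        self.header = {'name': fields[0], 'columns': fields[1:]}

    def add_data(self, line):
        # e.g. "name1 value1 value2 value3"
        fields = line.split()
        self.data[fields[0]] = fields[1:]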

Related

How to delete from specific line until specific line in a file on python

I have a file like this:
a
D
c
a
T
c
a
R
c
I want to delete from a specific line (in this case 3) until another specific line (in this case 5), so the file would look like:
a
D
c
a
R
c
I think this should work:
def delete_line_range(filename, start_line, end_line):
    # read all lines
    with open(filename, 'r') as f:
        lines = f.readlines()
    with open(filename, 'w') as f:
        # iterate through lines
        for i in range(len(lines)):
            # check if line is out of range
            if i < start_line or i > end_line:
                f.write(lines[i])
    return
Or, if you want to delete multiple ranges:
def delete_line_ranges(filename, line_ranges):
    # read all lines
    with open(filename, 'r') as f:
        lines = f.readlines()
    with open(filename, 'w') as f:
        # iterate through lines
        for i in range(len(lines)):
            # check if line is in any of the ranges
            in_range = False
            for line_range in line_ranges:
                if i >= line_range[0] and i <= line_range[1]:
                    in_range = True
                    break
            # if not in range, write line
            if not in_range:
                f.write(lines[i])
    return
Where line_ranges is a list of tuples containing start and end line numbers.
Be aware that in both of these functions the line numbers start at 0.
This means that if you want to delete lines 3 to 5 as in your example, you need to subtract 1 from both the start and end line numbers.
delete_line_range('test.txt', 2, 4) # deletes line 3 to 5 if you start counting from 1
Edit
Deleting contacts from a vcf file.
Get the range of a specific contact:
def get_contact_range(filename, name):
    # read all lines
    with open(filename, 'r') as f:
        lines = f.readlines()
    for i in range(len(lines)):
        # check if line is start of contact
        if lines[i].startswith('BEGIN:VCARD'):
            start_line = i
            continue
        if name in lines[i]:
            for j in range(i, len(lines)):
                # check if line is end of contact
                if lines[j].startswith('END:VCARD'):
                    end_line = j
                    return (start_line, end_line)
    return None
print(get_contact_range('test.vcf', 'John Doe'))
test.vcf
BEGIN:VCARD
VERSION:3.0
PRODID:-//Apple Inc.//macOS 11.5.2//EN
N:Doe;John;;;
FN:John Doe
ORG:Sharpened Productions;
EMAIL;type=INTERNET;type=HOME;type=pref:johndoe@email.com
EMAIL;type=INTERNET;type=WORK:johndoe@workemail.com
TEL;type=CELL;type=VOICE;type=pref:123-456-7890
ADR;type=HOME;type=pref:;;12345 First Avenue;Hometown;NY;12345;United States
ADR;type=WORK:;;67890 Second Avenue;Businesstown;NY;67890;United States
NOTE:The man I met at the company networking event. He mentioned that he had some potential leads.
item1.URL;type=pref:https://fileinfo.com/
item1.X-ABLabel:_$!!$_
BDAY:2000-01-01
END:VCARD
Output:
(0, 15)
Combine the above functions:
def delete_contact(filename, name):
    # get start and end line of contact
    contact_range = get_contact_range(filename, name)
    # delete contact
    delete_line_range(filename, contact_range[0], contact_range[1])
    return
Multiple Contacts at once:
def delete_contacts(filename, names):
    # get start and end lines of each contact
    line_ranges = []
    for name in names:
        contact_range = get_contact_range(filename, name)
        line_ranges.append((contact_range[0], contact_range[1]))
    # delete contacts
    delete_line_ranges(filename, line_ranges)
    return

python3: Read huge file (~ 800 Gb), split the lines by condition, and append them to the end of new files in a memory-efficient way

I'm learning Python 3, and I'm dealing with a huge txt file (~800 GB).
The enclosed function 'kmers_dic' reads the main file and, when the condition in the if statement is satisfied, appends the line to one of the previously created files (there are 1024 of these files, named after the contents of the kmers variable). The function works fine with a subset of the main file, but when I run the code on the full file, my job is killed because it hits the memory usage limit.
import sys
import gzip

def OpenFiles(i):
    '''
    A switch to handle file opening and reduce duplicated code
    '''
    open_method = {
        "gz": gzip.open,
        "norm": open
    }
    return open_method[i]
def rows(f, chunksize=102400, sep='\n'):
    """
    Read a file where the row separator is '\n' lazily.
    Default chunk size: 102400 bytes (100 kB).
    Usage:
    >>> with open('big.csv') as f:
    >>>     for r in rows(f):
    >>>         process(r)
    """
    curr_row = ''
    while True:
        chunk = f.read(chunksize)
        if chunk == '':  # End of file
            break
        while True:
            i = chunk.find(sep)
            if i == -1:
                break
            yield curr_row + chunk[:i]
            curr_row = ''
            chunk = chunk[i+1:]
        curr_row += chunk
def kmers_dic(input_file, kmers, out_dir):
    '''
    file writing by kmers
    '''
    #kmers_dic = set()
    count_line = 0
    count_line_1 = 0
    if input_file.endswith('.gz'):
        nano_read = OpenFiles('gz')
    else:
        nano_read = OpenFiles('norm')
    with nano_read(input_file, 'rt') as nano_f:
        chunk = rows(nano_f, chunksize=2024, sep='\n')
        for line in chunk:
            count_line += 1
            count_line_1 += 1
            sys.stdout.write('%s\r' % count_line)
            sys.stdout.flush()
            line = line.strip('\n')
            line = line.split()
            if line[2] in kmers:
                kmer = line[2]
                Out_f_name = out_dir + line[2] + '.lib'
                file1 = open(Out_f_name, 'a')
                ##file1.write('\t'.join(line) + '\n') # print entire line
                file1.write('\t'.join(line[1:4:] + line[6:9:] + line[9:13:] + line[15:]) + '\n')
                file1.close()
    print("lines: ", count_line_1)
I don't understand where the issue is.
Can you help me?
Thanks in advance!
Best.
curr_row += chunk causes you to keep all chunks in memory until you run out of free memory.
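A file object is already a lazy iterator over lines, so one way around the accumulation is to drop the custom rows() chunker and read the file line by line. A minimal sketch of that idea, keeping the original column slicing (the function name is mine; the slicing indices are copied from the question as-is):
import gzip

def kmers_dic_lazy(input_file, kmers, out_dir):
    '''Write matching lines out per kmer, reading the input lazily line by line.'''
    opener = gzip.open if input_file.endswith('.gz') else open
    count_line = 0
    with opener(input_file, 'rt') as nano_f:
        for line in nano_f:  # the file object yields one line at a time
            count_line += 1
            fields = line.strip('\n').split()
            if fields[2] in kmers:
                # opening/appending per matching line is slow; keeping the 1024
                # output files open in a dict would be faster, but this mirrors
                # the original per-line append
                with open(out_dir + fields[2] + '.lib', 'a') as out_f:
                    out_f.write('\t'.join(fields[1:4] + fields[6:9] + fields[9:13] + fields[15:]) + '\n')
    print("lines:", count_line)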

read file block by block in python

I have a file that has this format:
MACRO L20_FDPQ_TV1T8_1
FIXEDMASK ;
CLASS CORE ;
...
...
END L20_FDPQ_TV1T8_1
MACRO INV_20
...
...
END INV_20
I want to read the file in blocks, such that everything from a MACRO line to the END line with the same name forms one block. I tried to use this:
import re

with open(file_name, "r") as f_read:
    lines = f_read.readlines()

num = 0
while num < len(lines):
    line = lines[num]
    if re.search(r"MACRO\s+", line, re.IGNORECASE):
        macro_name = line.split()[1]
        while re.search(r"\s+END\s+%s" % macro_name, line, re.IGNORECASE) is None:
            line = lines[num + 1]
            #...do something...
            num = num + 1
    num += 1
How can this be done in an efficient way?
Assuming that you cannot nest macros, and that macros always start with "MACRO [name]" and end with "END [name]":
# read the contents of the file
with open(file_name, "r") as f_read:
    lines = f_read.readlines()

current_macro_lines = []
for line in lines:
    if line.startswith("MACRO"):
        macro_name = line.split()[1]
    # if line starts with END and refers to the current macro name
    elif line.startswith("END") and line.split()[1] == macro_name:
        # here the macro is complete,
        # put the code you want to execute when you find the end of the macro
        # then, when you finish processing the lines,
        # empty them so you can accumulate the next macro
        current_macro_lines = []
    else:
        # here you can accumulate the macro lines
        current_macro_lines.append(line)
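If you prefer not to read the whole file with readlines(), the same idea can be wrapped in a generator that streams the file and yields one (name, lines) block at a time. A sketch under the same no-nesting assumption; the file name in the usage line is a placeholder:
def iter_macros(file_name):
    """Yield (macro_name, list_of_lines) for each MACRO ... END block."""
    macro_name = None
    block_lines = []
    with open(file_name, "r") as f:
        for line in f:
            if line.startswith("MACRO"):
                macro_name = line.split()[1]
                block_lines = []
            elif line.startswith("END") and macro_name is not None:
                yield macro_name, block_lines
                macro_name = None
            elif macro_name is not None:
                block_lines.append(line)

for name, body in iter_macros("cells.lef"):
    print(name, len(body))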

Split large text file (around 50GB) into multiple files

I would like to split a large text file (around 50 GB) into multiple files.
The data in the file looks like this, where x is any digit between 0 and 9:
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
...............
...............
There might be a few billion lines in the file, and I would like to write, for example, 30/40 million lines per file.
I guess the steps would be:
I have to open the file,
then, using readline(), read the file line by line and write it at the same time to a new file,
and as soon as it hits the maximum number of lines, it will create another file and
start writing again.
I'm wondering how to put all these steps together in a memory-efficient and fast way. I've seen some examples on Stack Overflow, but none of them do quite what I need. I would really appreciate it if anyone could help me out.
This working solution uses the split command available in the shell. Since the author has already accepted the possibility of a non-Python solution, please do not downvote.
First, I created a test file with 1000M entries (15 GB) with
awk 'BEGIN{for (i = 0; i < 1000000000; i++) {print "123.123.123.123"} }' > t.txt
Then I used split:
split --lines=30000000 --numeric-suffixes --suffix-length=2 t.txt t
It took 5 min to produce a set of 34 small files with names t00-t33. 33 files are 458 MB each and the last t33 is 153 MB.
from itertools import chain, islice

def chunks(iterable, n):
    "chunks(ABCDE, 2) => AB CD E"
    iterable = iter(iterable)
    while True:
        # store one line in memory,
        # chain it to an iterator on the rest of the chunk
        try:
            first = next(iterable)
        except StopIteration:
            return  # on Python 3.7+ a bare next() here would raise RuntimeError
        yield chain([first], islice(iterable, n - 1))

l = 30 * 10**6
file_large = 'large_file.txt'
with open(file_large) as bigfile:
    for i, lines in enumerate(chunks(bigfile, l)):
        file_split = '{}.{}'.format(file_large, i)
        with open(file_split, 'w') as f:
            f.writelines(lines)
I would use the Unix utility split if it is available to you and your only task is to split the file. Here is, however, a pure Python solution:
import contextlib

file_large = 'large_file.txt'
l = 30 * 10**6  # lines per split file
with contextlib.ExitStack() as stack:
    fd_in = stack.enter_context(open(file_large))
    for i, line in enumerate(fd_in):
        if not i % l:
            file_split = '{}.{}'.format(file_large, i // l)
            fd_out = stack.enter_context(open(file_split, 'w'))
        fd_out.write(line)  # line already ends with '\n'
If all of your lines have four 3-digit numbers on them and you have multiple cores available, then you can exploit file seek and run multiple processes.
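A minimal sketch of that idea, assuming every line is exactly "xxx.xxx.xxx.xxx" plus a newline (16 bytes), so byte offsets can be computed without scanning; the file name and lines-per-chunk value are placeholders:
import multiprocessing
import os

LINE_BYTES = 16                 # "xxx.xxx.xxx.xxx" plus the newline
LINES_PER_CHUNK = 30_000_000
SRC = 'large_file.txt'          # placeholder file name

def write_chunk(args):
    chunk_no, start_line, n_lines = args
    with open(SRC, 'rb') as src, open('{}.{}'.format(SRC, chunk_no), 'wb') as dst:
        src.seek(start_line * LINE_BYTES)          # jump straight to this chunk
        # one read of ~460 MB per chunk; could be copied in smaller pieces
        dst.write(src.read(n_lines * LINE_BYTES))

if __name__ == '__main__':
    total = os.path.getsize(SRC) // LINE_BYTES
    jobs = [(i, i * LINES_PER_CHUNK, min(LINES_PER_CHUNK, total - i * LINES_PER_CHUNK))
            for i in range((total + LINES_PER_CHUNK - 1) // LINES_PER_CHUNK)]
    with multiprocessing.Pool() as pool:
        pool.map(write_chunk, jobs)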
This class may solve your problem.
I've tested it on Linux and Windows operating systems, and it worked perfectly on both of them.
Also, I've tested binary and text files of different sizes each time and it worked great.
Enjoy :)
import os
import math

class FileSpliter:
    # If file type is text then CHUNK_SIZE is count of chars
    # If file type is binary then CHUNK_SIZE is count of bytes
    def __init__(self, InputFile, FileType="b", CHUNK_SIZE=524288, OutFile="outFile"):
        self.CHUNK_SIZE = CHUNK_SIZE  # byte or char
        self.InputFile = InputFile
        self.FileType = FileType      # b: binary, t: text
        self.OutFile = OutFile
        self.FileSize = 0
        self.Parts = None
        self.CurrentPartNo = 0
        self.Progress = 0.0

    def Prepare(self):
        if not (os.path.isfile(self.InputFile) and os.path.getsize(self.InputFile) > 0):
            print("ERROR: The file does not exist or is empty!")
            return False
        self.FileSize = os.path.getsize(self.InputFile)
        if self.CHUNK_SIZE >= self.FileSize:
            self.Parts = 1
        else:
            self.Parts = math.ceil(self.FileSize / self.CHUNK_SIZE)
        return True

    def Split(self):
        if self.FileSize == 0 or self.Parts is None:
            print("ERROR: File is not prepared for split!")
            return False
        with open(self.InputFile, "r" + self.FileType) as f:
            while True:
                if self.FileType == "b":
                    buf = bytearray(f.read(self.CHUNK_SIZE))
                elif self.FileType == "t":
                    buf = f.read(self.CHUNK_SIZE)
                else:
                    print("ERROR: File type error!")
                    return False
                if not buf:
                    # we've read the entire file in, so we're done.
                    break
                of = self.OutFile + str(self.CurrentPartNo)
                outFile = open(of, "w" + self.FileType)
                outFile.write(buf)
                outFile.close()
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def Rebuild(self):
        self.CurrentPartNo = 0
        if self.Parts is None:
            return False
        with open(self.OutFile, "w" + self.FileType) as f:
            while self.CurrentPartNo < self.Parts:
                If = self.OutFile + str(self.CurrentPartNo)
                if not (os.path.isfile(If) and os.path.getsize(If) > 0):
                    print("ERROR: The file [" + If + "] does not exist or is empty!")
                    return False
                InputFile = open(If, "r" + self.FileType)
                buf = InputFile.read()
                if not buf:
                    # we've read the entire file in, so we're done.
                    break
                f.write(buf)
                InputFile.close()
                os.remove(If)
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def ProgressBar(self, BarLength=20, ProgressIcon="#", BarIcon="-"):
        try:
            # You can't have a progress bar with zero or negative length.
            if BarLength < 1:
                BarLength = 20
            # Use status variable for going to the next line after progress completion.
            Status = ""
            # Calculating progress between 0 and 1 for percentage.
            self.Progress = float(self.CurrentPartNo) / float(self.Parts)
            # Handle the final step of progressing.
            if self.Progress >= 1.:
                self.Progress = 1
                Status = "\r\n"  # Going to the next line
            # Calculating how many places should be filled
            Block = int(round(BarLength * self.Progress))
            # Show this
            Bar = "\r[{}] {:.0f}% {}".format(ProgressIcon * Block + BarIcon * (BarLength - Block), round(self.Progress * 100, 0), Status)
            print(Bar, end="")
        except:
            print("\rERROR")

def main():
    fp = FileSpliter(InputFile="inFile", FileType="b")  # , CHUNK_SIZE=300000)
    if fp.Prepare():
        # Splitting ...
        print("Splitting ...")
        sr = fp.Split()
        if sr == True:
            print("The file was split successfully.")
            print()
            # Rebuilding ...
            print("Rebuilding ...")
            rr = fp.Rebuild()
            if rr == True:
                print("The file was rebuilt successfully.")

if __name__ == "__main__":
    main()
I am sharing a Python 3 solution that I usually use to split files whose size is in the MB range.
However, I have not yet tried it on files whose size is in the GB range.
TextFileSplitter.py
import traceback

# get a file name to be read
fileToRead = input("Enter file name : ")
# max lines you want to write in a single file
fileLineCount = 2000
lineCount = 0
fileCount = 1

try:
    print('Start splitting...')
    # read a file
    fileReader = open(fileToRead)
    line = fileReader.readline()
    fileWriter = open(str(fileCount) + ".txt", "a")
    while line != '':  # empty is EOF
        if lineCount == 0:
            # create a file in append mode
            fileWriter = open(str(fileCount) + ".txt", "a")
            # increment file count, use it for new file name
            fileCount += 1
        # write a line (readline() keeps the trailing newline)
        fileWriter.write(line)
        lineCount += 1
        if lineCount == fileLineCount:
            lineCount = 0
            fileWriter.close()
        # read a line
        line = fileReader.readline()
    fileWriter.close()
except Exception as e:
    # print the exception if any
    traceback.print_exc()
finally:
    # close the file reader
    fileReader.close()
The output will look like this: files, each containing fileLineCount (i.e. 2000) lines, created in the same directory:
1.txt
2.txt
3.txt
.
.
.
.
n.txt

python record separator file iteration

I have a very very large text file (much larger than can fit in memory). What I would like to do is use something similar to:
for record in myFile:
    process_record()
with the added trick that my records are separated by blank lines (with all kinds of stuff in between). For example...
data1
data2,data3,moredata
anotherrecord,otherstuff
yippee
kaiyay
mom
aThird,record:here
How would one iterate through the file in python where each loop iteration accesses a single record from the file?
You can do it with a generator function:
def records(textfile):
    record_lines = []
    for line in textfile:
        if line != '\n':
            record_lines.append(line)
        else:
            yield ''.join(record_lines)
            record_lines = []
    yield ''.join(record_lines)

for record in records(the_file):
    process(record)
You could create an iterator that joins the lines until you find a blank line.
class MyIter:
    def __init__(self, infile):
        self.infile = infile
    def __iter__(self):
        return self
    def __next__(self):
        lines = []
        for line in self.infile:
            line = line.strip()
            if len(line) > 0:
                lines.append(line)
            else:
                break
        if len(lines) == 0:
            raise StopIteration
        else:
            return ",".join(lines)
and try it with
for line in MyIter(infile):
    print(line)
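Another way to express the same blank-line-separated record iteration is with itertools.groupby from the standard library. A small sketch, where 'the_file.txt' is a placeholder path and process() is whatever the caller does with a record:
from itertools import groupby

def records(path):
    with open(path) as f:
        # group consecutive lines by whether they are blank,
        # and join each non-blank run into one record
        for is_blank, lines in groupby(f, key=lambda line: line.strip() == ''):
            if not is_blank:
                yield ''.join(lines)

for record in records('the_file.txt'):
    process(record)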

Categories

Resources