Extract values between two strings in a text file using Python

Let's say I have a text file with the below content:
fdsjhgjhg
fdshkjhk
Start
Good Morning
Hello World
End
dashjkhjk
dsfjkhk
Now I need to write Python code which will read the text file and copy the contents between Start and End to another file.
I wrote the following code.
inFile = open("data.txt")
outFile = open("result.txt", "w")
buffer = []
keepCurrentSet = True
for line in inFile:
    buffer.append(line)
    if line.startswith("Start"):
        #---- starts a new data set
        if keepCurrentSet:
            outFile.write("".join(buffer))
        #now reset our state
        keepCurrentSet = False
        buffer = []
    elif line.startswith("End"):
        keepCurrentSet = True
inFile.close()
outFile.close()
I'm not getting the desired output; I'm just getting Start.
What I want is all the lines between Start and End, excluding Start and End.

Just in case you have multiple "Start"s and "End"s in your text file, this will import all the data together, excluding all the "Start"s and "End"s.
with open('path/to/input') as infile, open('path/to/output', 'w') as outfile:
    copy = False
    for line in infile:
        if line.strip() == "Start":
            copy = True
            continue
        elif line.strip() == "End":
            copy = False
            continue
        elif copy:
            outfile.write(line)

If the text files aren't necessarily large, you can get the whole content of the file then use regular expressions:
import re

with open('data.txt') as myfile:
    content = myfile.read()

# capture group 1 holds only the text between the Start and End lines
text = re.search(r'Start\n(.*?)End', content, re.DOTALL).group(1)

with open("result.txt", "w") as myfile2:
    myfile2.write(text)

I'm not a Python expert, but this code should do the job.
inFile = open("data.txt")
outFile = open("result.txt", "w")
keepCurrentSet = False
for line in inFile:
    if line.startswith("End"):
        keepCurrentSet = False
    if keepCurrentSet:
        outFile.write(line)
    if line.startswith("Start"):
        keepCurrentSet = True
inFile.close()
outFile.close()

Using itertools.dropwhile, itertools.takewhile, itertools.islice:
import itertools

with open('data.txt') as f, open('result.txt', 'w') as fout:
    it = itertools.dropwhile(lambda line: line.strip() != 'Start', f)
    it = itertools.islice(it, 1, None)
    it = itertools.takewhile(lambda line: line.strip() != 'End', it)
    fout.writelines(it)
UPDATE: As inspectorG4dget commented, the above code only copies the first block. To copy multiple blocks, use the following:
import itertools

with open('data.txt', 'r') as f, open('result.txt', 'w') as fout:
    while True:
        it = itertools.dropwhile(lambda line: line.strip() != 'Start', f)
        if next(it, None) is None:
            break
        fout.writelines(itertools.takewhile(lambda line: line.strip() != 'End', it))

Move the outFile.write call into the 2nd if:
inFile = open("data.txt")
outFile = open("result.txt", "w")
buffer = []
for line in inFile:
    if line.startswith("Start"):
        buffer = ['']
    elif line.startswith("End"):
        outFile.write("".join(buffer))
        buffer = []
    elif buffer:
        buffer.append(line)
inFile.close()
outFile.close()

import re

inFile = open("data.txt")
outFile = open("result.txt", "w")
buffer1 = ""
for line in inFile:
    buffer1 = buffer1 + line
# re.DOTALL lets .*? span the newlines between the Start and End markers
matches = re.findall(r"(?<=Start\n)(.*?)(?=End)", buffer1, re.DOTALL)
outFile.write("".join(matches))
inFile.close()
outFile.close()

I would handle it like this:
inFile = open("data.txt")
outFile = open("result.txt", "w")
data = inFile.readlines()
outFile.write("".join(data[data.index('Start\n')+1:data.index('End\n')]))
inFile.close()
outFile.close()

If one wants to keep the start and end lines/keywords while extracting the lines between two strings, please find below the code snippet that I used to extract SQL statements from a shell script.
def process_lines(in_filename, out_filename, start_kw, end_kw):
    try:
        inp = open(in_filename, 'r', encoding='utf-8', errors='ignore')
        out = open(out_filename, 'w+', encoding='utf-8', errors='ignore')
    except FileNotFoundError as err:
        print(f"File {in_filename} not found", err)
        raise
    except OSError as err:
        print(f"OS error occurred trying to open {in_filename}", err)
        raise
    except Exception as err:
        print(f"Unexpected error opening {in_filename} is", repr(err))
        raise
    else:
        with inp, out:
            copy = False
            for line in inp:
                # first IF block to handle if the start and end on same line
                if line.lstrip().lower().startswith(start_kw) and line.rstrip().endswith(end_kw):
                    copy = True
                    if copy:  # keep the starts with keyword
                        out.write(line)
                    copy = False
                    continue
                elif line.lstrip().lower().startswith(start_kw):
                    copy = True
                    if copy:  # keep the starts with keyword
                        out.write(line)
                    continue
                elif line.rstrip().endswith(end_kw):
                    if copy:  # keep the ends with keyword
                        out.write(line)
                    copy = False
                    continue
                elif copy:
                    # write
                    out.write(line)

if __name__ == '__main__':
    infile = "/Users/testuser/Downloads/testdir/BTEQ_TEST.sh"
    outfile = f"{infile}.sql"
    statement_start_list = ['database', 'create', 'insert', 'delete', 'update', 'merge', 'delete']
    statement_end = ";"
    process_lines(infile, outfile, tuple(statement_start_list), statement_end)

Files are iterators in Python, so this means you don't need to hold a "flag" variable to tell you what lines to write. You can simply use another loop when you reach the start line, and break it when you reach the end line:
with open("data.txt") as in_file, open("result.text", 'w') as out_file:
for line in in_file:
if line.strip() == "Start":
for line in in_file:
if line.strip() == "End":
break
out_file.write(line)

Related

Deleting a line from a Python text file

While I was learning how to work with files in Python, I had a question: how can you delete a line from a file if it contains a specific word? I wrote the following code:
arr = []
try:
    with open("test.txt") as file:
        arr = file.readlines()
except FileNotFoundError:
    print("File not found!")

word = "five"
try:
    with open("test.txt", "w") as file:
        for row in arr:
            if word not in row:
                file.write(row)
except FileNotFoundError:
    print("File not found!")
But I would like to know if it is possible to do this without reading all the lines into one list, because the file can sometimes be very large and there may not be enough memory.
You just need to do the following:
Open the file in read and write mode.
Read it and filter the matching line out.
Move the cursor to the beginning of the file.
Write the filtered content back.
Truncate anything that remains after it.
with open("test.txt", "r+") as f:
output = filter(lambda line: word not in line, f.readlines())
f.seek(0)
f.write(''.join(output))
f.truncate()
To improve memory usage, read the file line by line using f.readline() and adjust the seek position dynamically:
with open("test.txt", "r+") as f:
latest_written_position = f.tell()
line = f.readline()
reading_position = f.tell()
while line:
if word not in line:
f.seek(latest_written_position)
f.write(line)
latest_written_position = f.tell()
f.seek(reading_position)
line = f.readline()
reading_position = f.tell()
f.seek(latest_written_position)
f.truncate()
Reading the file line by line and writing the kept lines to a temporary file, then replacing the original, without storing all the lines in memory:
import os

word = "five"
try:
    with open("test.txt", "r") as file_in, open("test.txt.tmp", "w") as file_out:
        for line in file_in:
            if word not in line:
                file_out.write(line)
    # replace the original file with the filtered copy
    os.replace("test.txt.tmp", "test.txt")
except FileNotFoundError:
    print("File not found!")
This should do the trick:
try:
    with open("test.txt") as file:
        arr = file.readlines()
except OSError:
    import sys
    sys.exit(1)

word = "five"
# delete matching lines in place, iterating in reverse so the remaining
# indices stay valid while elements are removed
for i in reversed(range(len(arr))):
    if word in arr[i]:
        del arr[i]

print(arr)
with open('out_f.txt', 'w') as of:
    of.writelines(arr)
You can write as you read instead of storing everything in memory.
Consider the following:
try:
    with open("test.txt") as file:
        with open("new_test.txt", "w") as new_file:
            for row in file:
                if word not in row:
                    new_file.write(row)
except FileNotFoundError:
    print("File not found!")

Function that reads last line of file?

The function reads the last line of the file at the specified file path and returns it as a string; if the file is empty, it returns an empty string ("").
I tried writing my code like this, but it won't work; it's pretty messy and I'm a beginner:
import os

def read_last_line(file_path):
    with open(file_path, 'r') as file:
        size_file = os.path.getsize(file_path)
        return_file_empty = " "
        last_line = (list(file)[-1])
        print(last_line)
        if size_file == 0:
            return return_file_empty
        else:
            return last_line
you can use:
def read_last_line(file_path):
    with open(file_path) as f:
        lines = f.readlines()
        return lines[-1] if lines else ''
for big files you may use:
def read_last_line(file_path):
    with open(file_path, 'r') as f:
        last_line = ''
        for line in f:
            last_line = line
        return last_line
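For very large files it is also possible to avoid scanning the file from the front at all by seeking backwards from the end in binary mode. This is not one of the answers above, just a rough sketch of that idea; it assumes UTF-8 text with \n line endings, strips the trailing newline, and the helper name read_last_line_seek is only illustrative:
import os

def read_last_line_seek(file_path):
    # Read backwards from the end in fixed-size chunks until the accumulated
    # data contains the newline that starts the last line.
    with open(file_path, "rb") as f:
        f.seek(0, os.SEEK_END)
        size = f.tell()
        if size == 0:
            return ""          # empty file
        block = 1024
        data = b""
        pos = size
        while pos > 0:
            step = min(block, pos)
            pos -= step
            f.seek(pos)
            data = f.read(step) + data
            if data.rstrip(b"\n").count(b"\n") > 0:
                break
        return data.rstrip(b"\n").split(b"\n")[-1].decode("utf-8")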
This opens the file and moves through it until there are no more lines (next raises StopIteration), then returns the last line.
def read_last_line(filename):
    line = ""
    with open(filename) as fh:
        while True:
            try:
                line = next(fh)
            except StopIteration:
                return line
You can use a collections.deque to get it, like the following. Unlike the currently accepted answer, this doesn't require storing the entire file in memory:
from collections import deque

def get_last_line(filename):
    with open(filename, 'r') as f:
        try:
            lastline = deque(f, 1)[0]
        except IndexError:  # Empty file.
            lastline = None
        return lastline

print('last line: {}'.format(get_last_line(filename)))
If I've understood the question correctly, something like this maybe?
def get_last_line(file_path):
    with open(file_path, "r") as file:
        return next(line for line in reversed(file.read().splitlines()) if line)

Text file not being written, python3

def getlink():
    with open('findlink.txt') as infile, open('extractlink.txt', 'w') as outfile:
        copy = False
        for line in infile:
            if line.strip() == "](":
                copy = True
            if copy:
                outfile.write(line)
            if line.strip() == ")":
                copy = False
    print("extractlink written.")
    infile.close()
    outfile.close()

def part3():
    with open('findlink.txt', 'w') as findlink:
        findlink.write("[Testing](Test)")
    findlink.close()
    print("findlink written and closed.")
    getlink()

def run_bot():
    getlink()  # Already have findlink.txt written
When part3() is activated, the text is written to findlink.txt as expected, but when getlink() is activated, the extractlink.txt is never written to.
I've gathered my current code from a post back in 2013/2016; does anyone have any ideas why this may not be working?
Try the code below:
def getlink():
    with open('findlink.txt') as infile:
        data = infile.readlines()
    with open('extractlink.txt', 'w') as outfile:
        copy = False
        for line in data:
            if line.strip() == "](":
                copy = True
            if copy:
                outfile.write(line)
            if line.strip() == ")":
                copy = False
    print("extractlink written.")
    outfile.close()
    return

Write to file without redundant lines in python

I'm writing a Python script to read lines from an input file and write the unique lines (ones not already in the output file) to an output file. Somehow, my script always appends the first line of the input file to the output file even if the same line is already in the output file, and I can't figure out why this happens.
Does anyone know why, and how do I fix this?
Thanks,
import os

input_file = 'input.txt'
output_file = 'output.txt'

fo = open(output_file, 'a+')
flag = False
with open(input_file, 'r') as fi:
    for line1 in fi:
        print line1
        for line2 in fo:
            print line2
            if line2 == line1:
                flag = True
                print('Found Match!!')
                break
        if flag == False:
            fo.write(line1)
        elif flag == True:
            flag == False
        fo.seek(0)
fo.close()
fi.close()
When you open a file in append mode, the file object's position is at the end of the file. So the first time through, when it reaches for line2 in fo:, there aren't any more lines to read from fo, so that block is skipped, flag is still False, and that first line is written to the output file. After that, you do fo.seek(0), so you are checking against the entire file for subsequent lines.
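A quick demo of that behaviour (my own illustration, using Python 3, which explicitly positions an 'a+' file at the end on open; demo.txt is a made-up name):
# In 'a+' mode the initial position is at the end of the file, so reading
# immediately yields nothing until you seek back to the start.
with open("demo.txt", "w") as f:
    f.write("first\nsecond\n")

f = open("demo.txt", "a+")
print(f.tell())    # non-zero: positioned at the end of the file
print(list(f))     # [] - nothing left to read from here
f.seek(0)
print(list(f))     # ['first\n', 'second\n']
f.close()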
The answer by kmacinnis is right on as to why your code isn't working; you need to use mode 'r+' instead of 'a+', or else put fo.seek(0) at the beginning of the for loop instead of the end.
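A minimal sketch of that small fix, keeping 'a+' and simply rewinding at the top of the loop (this is my illustration rather than the answer's own code; it reuses the input.txt/output.txt names from the question):
input_file = 'input.txt'
output_file = 'output.txt'

fo = open(output_file, 'a+')
with open(input_file, 'r') as fi:
    for line1 in fi:
        fo.seek(0)           # rewind before scanning the existing output lines
        flag = False
        for line2 in fo:
            if line2 == line1:
                flag = True
                break
        if not flag:
            fo.write(line1)  # 'a+' mode appends writes at the end of the file
fo.close()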
That said, there's a much better way to do this than reading the entire output file for every line of the input file.
def ensure_file_ends_with_newline(handle):
    position = handle.tell()
    handle.seek(-1, 2)
    handle_end = handle.read(1)
    if handle_end != '\n':
        handle.write('\n')
    handle.seek(position)

input_filepath = 'input.txt'
output_filepath = 'output.txt'

with open(input_filepath, 'r') as infile, open(output_filepath, 'r+') as outfile:
    ensure_file_ends_with_newline(outfile)
    written = set(outfile)
    for line in infile:
        if line not in written:
            outfile.write(line)
            written.add(line)
Your flag was never reset to False.
flag == False is an equality comparison;
flag = False is an assignment.
Try the latter.
import os

input_file = 'input.txt'
output_file = 'output.txt'

fo = open(output_file, 'a+')
flag = False
with open(input_file, 'r') as fi:
    for line1 in fi:
        #print line1
        for line2 in fo:
            #print line2
            if line2 == line1:
                flag = True
                print('Found Match!!')
                print (line1, line2)
                break
        if flag == False:
            fo.write(line1)
        elif flag == True:
            flag = False
        fo.seek(0)
