Python to read txt files and delete lines that contains same part - python

I have a tons (1000+) of txt files that looks like this
TextTextText('aaa/bbb`ccc' , "ddd.eee");
TextTextText('yyy/iii`ooo' , "rrr.ttt");
TextTextText('aaa/fff`ggg' , "hhh.jjj");
What I want to achieve is to delete all lines that contains same "aaa" part, and leave only one line with it (remove all duplicates).
my code so far:
import os
from collections import Counter
sourcepath = os.listdir('Process_Directory3/')
for file in sourcepath:
inputfile = 'Process_Directory3/' + file
outputfile = "Output_Directory/" + file
lines_seen = set()
outfile = open(outputfile, "w")
for line in open(inputfile, "r"):
print(line)
cut_line = line.split("'")
new_line = cut_line[1]
cut_line1 = new_line.split("/")
new_line1 = cut_line1[0]
if new_line1 not in lines_seen:
outfile.write(new_line1)
lines_seen.add(new_line1)
outfile.close()
My code is not working at all, I dont get any results
Console Report:
Line13 in <module>
new_line = cut_line[1]
IndexError: list index out of range
Sorry for my bad writing, it's my first post so far :D
Best Regards
Update:
I added
startPattern = "TextTextText"
if(startPattern in line):
to make sure i target only lines that begins with "TextTextText", but for some reason I am getting .txt in destination folder that contains only 1 line of content "aaa".
In the end of the day, here is a fully working code:
import os
sourcepath = os.listdir('Process_Directory3/')
for file in sourcepath:
inputfile = 'Process_Directory3/' + file
outputfile = "Output_Directory/" + file
lines_seen = set()
outfile = open(outputfile, "w")
for line in open(inputfile, "r"):
if line.startswith("TextTextText"):
try:
cut_line = line.split("'")
new_line = cut_line[1]
cut_line1 = new_line.split("/")
new_line1 = cut_line1[0]
if new_line1 not in lines_seen:
outfile.write(line)
lines_seen.add(new_line1)
except:
pass
else:
outfile.write(line)
outfile.close()
Thanks for a great help guys!

Use a try-except block in inner for loop. This will prevent your program from being interrupted if any error is encountered due to any line which doesn't contain ' or /.
Update:
I've tried the code given below and it worked fine for me.
sourcepath = os.listdir('Process_Directory3/')
for file in sourcepath:
inputfile = 'Process_Directory3/' + file
outputfile = "Output_Directory/" + file
lines_seen = set()
outfile = open(outputfile, "w")
for line in open(inputfile, "r"):
try:
cut_line = line.split("'")
new_line = cut_line[1]
cut_line1 = new_line.split("/")
new_line1 = cut_line1[0]
if new_line1 not in lines_seen:
outfile.write(line) # Replaced new_line1 with line
lines_seen.add(new_line1)
except:
pass
outfile.close() # This line was having bad indentation
Input file:
TextTextText('aaa/bbb`ccc' , "ddd.eee");
TextTextText('yyy/iii`ooo' , "rrr.ttt");
TextTextText('aaa/fff`ggg' , "hhh.jjj");
TextTextText('WWW/fff`ggg' , "hhh.jjj");
TextTextText('yyy/iii`ooo' , "rrr.ttt");
Output File:
TextTextText('aaa/bbb`ccc' , "ddd.eee");
TextTextText('yyy/iii`ooo' , "rrr.ttt");
TextTextText('WWW/fff`ggg' , "hhh.jjj");

It looks like you encountered line inside your file which has not ', in this case line.split("'") produce list with single element, for example
line = "blah blah blah"
cut_line = line.split("'")
print(cut_line) # ['blah blah blah']
so trying to get cut_line[1] result in error as there is only cut_line[0]. As this piece of your code is inside loop you might avoid that by skipping to next iteration using continue word, if cut_line has not enough elements, just replace:
cut_line = line.split("'")
new_line = cut_line[1]
by:
cut_line = line.split("'")
if len(cut_line) < 2:
continue
new_line = cut_line[1]
This will result in ignoring all lines without '.

I think using a regular expression would make it easier. I have made a simplified working code using re.
import re
lines = [
"",
"dfdsa sadfsadf sa",
"TextTextText('aaa/bbb`ccc' ,dsafdsafsA ",
"TextTextText('yyy/iii`ooo' ,SDFSDFSDFSA ",
"TextTextText('aaa/fff`ggg' ,SDFSADFSDF ",
]
lines_seen = set()
out_lines = []
for line in lines:
# SEARCH FOR 'xxx/ TEXT in the line -----------------------------------
re_result = re.findall(r"'[a-z]+\/", line)
if re_result:
print(f're_result {re_result[0]}')
if re_result[0] not in lines_seen:
print(f'>>> newly found {re_result[0]}')
lines_seen.add(re_result[0])
out_lines.append(line)
print('------------')
for line in out_lines:
print(line)
Result
re_result 'aaa/
>>> newly found 'aaa/
re_result 'yyy/
>>> newly found 'yyy/
re_result 'aaa/
------------
TextTextText('aaa/bbb`ccc' ,dsafdsafsA
TextTextText('yyy/iii`ooo' ,SDFSDFSDFSA
You can experiment with regular expressions here regex101.com.
Try r"'.+/" any character between ' and /, or r"'[a-zA-Z]+/" lower and uppercase letters between ' and /.

Related

Replace space with underscore using python

I need some help, I have a a txt file with spaces between words, I want to replace the space with underscore.
fileHandler = open('nog_rename_update.txt')
for eachline in fileHandler:
new_name = fileHandler.replace(" ","_")
print(new_name)
That's my code but it keeps throwing error messages
new_name = fileHandler.replace(" ","_")
AttributeError: '_io.TextIOWrapper' object has no attribute 'replace'
example files that I want to remove space and add underscore
Here's a generic approach that should work for you:
teststring = 'hello world this is just a test. don\'t mind me 123.'
# replace multiple spaces with one space
while ' ' in teststring:
teststring = teststring.replace(' ', ' ')
# replace space with underscore (_)
teststring = teststring.replace(' ', '_')
print(teststring)
assert teststring == "hello_world_this_is_just_a_test._don't_mind_me_123." # True
Using a file example:
fname = 'mah_file.txt'
with open(fname) as in_file:
contents = in_file.read()
while ' ' in contents:
contents = contents.replace(' ', ' ')
# write updated contents back to file
with open(fname, 'w') as out_file:
out_file.write(contents.replace(' ', '_'))
This opens the files, reads line by line, splits the line into two parts, and combines the two parts with an underscore. I stored it in a list that you can use to do your next step.
with open('nog_rename_update.txt') as f:
new_list = []
for line in f:
# split the line
split = line.split()
new_list.append(split[0]+"_"+split[1])
# print the list to see results
print(new_list)
#
# add code to loop through the new list and to write to a file
#
Try out this
fileHandler = open('nog_rename_update.txt').read()
new_name = fileHandler.replace(" ", "_")
print(new_name)
f = open("test.txt", "r")
text=f.read()
f.close()
f=open("testfile.txt", "w+")
text2=''
if ' ' in text:
text2 = text.replace(' ' , '_')
print(text2)
f.write(text2)
f.close()
Here is another, less verbose solution. Simply use re.sub:
import re
file_name = r"D:\projects\playground\python\data\test.txt"
with open(file_name, "r") as file:
for line in file:
print(re.sub("( )+", "_", line), end="")
And if you want to replace the spaces in your text file:
import re
file_name = r"D:\projects\playground\python\data\test.txt"
lines = []
with open(file_name, "r") as file:
lines = [
re.sub("( )+", "_", line) for line in file.readlines()
]
with open(file_name, "w") as file:
file.writelines(lines)
Or use fileinput:
import re
import fileinput
file_name = r"D:\projects\playground\python\data\test.txt"
with fileinput.FileInput(file_name, inplace=True, backup=".bak") as file:
for line in file:
print(re.sub("( )+", "_", line), end="")

How to read line in text file and replace the whole line in Python?

I want to replace a whole line in a text document, if there is a line that begins with "truck_placement"
Can I remove the whole line when it contains "truck_placement" and then write the new text?
I tried it but it only inserts the new text und doesn't replace the whole line.
Thats the current code:
cordget = coordinatesentry.get()
fin = open(save_file,"r")
filedata = fin.read()
fin.close
newdata = filedata.replace("truck_placement: " , "truck_placement: " + cordget)
fin = open(save_file, "w")
fin.write(newdata)
fin.close
Your best bet is to append all the lines without "truck_placement" to a new file. This can be done with the following code:
original = open("truck.txt","r")
new = open("new_truck.txt","a")
for line in original:
if "truck_placement" not in line:
new.write(line)
original.close()
new.close()
You can either read the whole file into one string and replace the line using regular expression:
import re
cordget = "(value, one) (value, two)"
save_file = "sample.txt"
with open(save_file, "r") as f:
data = f.read()
# Catch the line from "truck_placement: " until the newline character ('\n')
# and replace it with the second argument, where '\1' the catched group
# "truck_placement: " is.
data = re.sub(r'(truck_placement: ).*\n', r'\1%s\n' % cordget, data)
with open(save_file, "w") as f:
f.writelines(data)
Or you could read the file as a list of all lines and overwrite the specific line:
cordget = "(value, one) (value, two)"
save_file = "sample.txt"
with open(save_file, "r") as f:
data = f.readlines()
for index, line in enumerate(data):
if "truck_placement" in line:
data[index] = f"truck_placement: {cordget}\n"
with open(save_file, "w") as f:
f.writelines(data)

Find a dot in a text file and add a newline to the file in Python?

I read from a file, if it finds a ".", it should add a newline "\n" to the text and write it back to the file. I tried this code but still have the problem.
inp = open('rawCorpus.txt', 'r')
out = open("testFile.text", "w")
for line in iter(inp):
l = line.split()
if l.endswith(".")
out.write("\n")
s = '\n'.join(l)
print(s)
out.write(str(s))
inp.close()
out.close()
Try This ( Normal way ):
with open("rawCorpus.txt", 'r') as read_file:
raw_data = read_file.readlines()
my_save_data = open("testFile.text", "a")
for lines in raw_data:
if "." in lines:
re_lines = lines.replace(".", ".\r\n")
my_save_data.write(re_lines)
else:
my_save_data.write(lines + "\n")
my_save_data.close()
if your text file is not big you can try this too :
with open("rawCorpus.txt", 'r') as read_file:
raw_data = read_file.read()
re_data = raw_data.replace(".", ".\n")
with open("testFile.text", "w") as save_data:
save_data.write(re_data)
UPDATE ( output new lines depends on your text viewer too! because in some text editors "\n" is a new line but in some others "\r\n" is a new line. ) :
input sample :
This is a book. i love it.
This is a apple. i love it.
This is a laptop. i love it.
This is a pen. i love it.
This is a mobile. i love it.
Code:
last_buffer = []
read_lines = [line.rstrip('\n') for line in open('input.txt')]
my_save_data = open("output.txt", "a")
for lines in read_lines:
re_make_lines = lines.split(".")
for items in re_make_lines:
if items.replace(" ", "") == "":
pass
else:
result = items.strip() + ".\r\n"
my_save_data.write(result)
my_save_data.close()
Ouput Will Be :
This is a book.
i love it.
This is a apple.
i love it.
This is a laptop.
i love it.
This is a pen.
i love it.
This is a mobile.
i love it.
You are overwriting the string s in every loop with s = '\n'.join(l).
Allocate s = '' as empty string before the for-loop and add the new lines during every loop, e.g. with s += '\n'.join(l) (short version of s = s + '\n'.join(l)
This should work:
inp = open('rawCorpus.txt', 'r')
out = open('testFile.text', 'w')
s = '' # empty string
for line in iter(inp):
l = line.split('.')
s += '\n'.join(l) # add new lines to s
print(s)
out.write(str(s))
inp.close()
out.close()
Here is my own solution, but still I want one more newline after ".", that this solution not did this
read_lines = [line.rstrip('\n') for line in open('rawCorpus.txt')]
words = []
my_save_data = open("my_saved_data.txt", "w")
for lines in read_lines:
words.append(lines)
for word in words:
w = word.rstrip().replace('.', '\n.')
w = w.split()
my_save_data.write(str("\n".join(w)))
print("\n".join(w))
my_save_data.close()

Match the last word and delete the entire line

Input.txt File
12626232 : Bookmarks
1321121:
126262
Here 126262: can be anything text or digit, so basically will search for last word is : (colon) and delete the entire line
Output.txt File
12626232 : Bookmarks
My Code:
def function_example():
fn = 'input.txt'
f = open(fn)
output = []
for line in f:
if not ":" in line:
output.append(line)
f.close()
f = open(fn, 'w')
f.writelines(output)
f.close()
Problem: When I match with : it remove the entire line, but I just want to check if it is exist in the end of line and if it is end of the line then only remove the entire line.
Any suggestion will be appreciated. Thanks.
I saw as following but not sure how to use it in here
a = "abc here we go:"
print a[:-1]
I believe with this you should be able to achieve what you want.
with open(fname) as f:
lines = f.readlines()
for line in lines:
if not line.strip().endswith(':'):
print line
Here fname is the variable pointing to the file location.
You were almost there with your function. You were checking if : appears anywhere in the line, when you need to check if the line ends with it:
def function_example():
fn = 'input.txt'
f = open(fn)
output = []
for line in f:
if not line.strip().endswith(":"): # This is what you were missing
output.append(line)
f.close()
f = open(fn, 'w')
f.writelines(output)
f.close()
You could have also done if not line.strip()[:-1] == ':':, but endswith() is better suited for your use case.
Here is a compact way to do what you are doing above:
def function_example(infile, outfile, limiter=':'):
''' Filters all lines in :infile: that end in :limiter:
and writes the remaining lines to :outfile: '''
with open(infile) as in, open(outfile,'w') as out:
for line in in:
if not line.strip().endswith(limiter):
out.write(line)
The with statement creates a context and automatically closes files when the block ends.
To search if the last letter is : Do following
if line.strip().endswith(':'):
...Do Something...
You can use a regular expression
import re
#Something end with ':'
regex = re.compile('.(:+)')
new_lines = []
file_name = "path_to_file"
with open(file_name) as _file:
lines = _file.readlines()
new_lines = [line for line in lines if regex.search(line.strip())]
with open(file_name, "w") as _file:
_file.writelines(new_lines)

Searching and extracting WH-word from a file line by line with Python and regex

I have a file that has one sentence per line. I am trying to read the file and search if the sentence is a question using regex and extract the wh-word from the sentences and save them back into another file according the order it appeared in the first file.
This is what I have so far..
def whWordExtractor(inputFile):
try:
openFileObject = open(inputFile, "r")
try:
whPattern = re.compile(r'(.*)who|what|how|where|when|why|which|whom|whose(\.*)', re.IGNORECASE)
with openFileObject as infile:
for line in infile:
whWord = whPattern.search(line)
print whWord
# Save the whWord extracted from inputFile into another whWord.txt file
# writeFileObject = open('whWord.txt','a')
# if not whWord:
# writeFileObject.write('None' + '\n')
# else:
# whQuestion = whWord
# writeFileObject.write(whQuestion+ '\n')
finally:
print 'Done. All WH-word extracted.'
openFileObject.close()
except IOError:
pass
The result after running the code above: set([])
Is there something I am doing wrong here? I would be grateful if someone can point it out to me.
Something like this:
def whWordExtractor(inputFile):
try:
with open(inputFile) as f1:
whPattern = re.compile(r'(.*)who|what|how|where|when|why|which|whom|whose(\.*)', re.IGNORECASE)
with open('whWord.txt','a') as f2: #open file only once, to reduce I/O operations
for line in f1:
whWord = whPattern.search(line)
print whWord
if not whWord:
f2.write('None' + '\n')
else:
#As re.search returns a sre.SRE_Match object not string, so you will have to use either
# whWord.group() or better use whPattern.findall(line)
whQuestion = whWord.group()
f2.write(whQuestion+ '\n')
print 'Done. All WH-word extracted.'
except IOError:
pass
Not sure if it's what you're looking for, but you could try something like this:
def whWordExtractor(inputFile):
try:
whPattern = re.compile(r'who|what|how|where|when|why|which|whom|whose', re.IGNORECASE)
with open(inputFile, "r") as infile:
for line in infile:
whMatch = whPattern.search(line)
if whMatch:
whWord = whMatch.group()
print whWord
# save to file
else:
# no match
except IOError:
pass
Change '(.*)who|what|how|where|when|why|which|whom|whose(\.*)' to
".*(?:who|what|how|where|when|why|which|whom|whose).*\."

Categories

Resources