Python searching for exact word/phrase within a text file - python

Currently, I'm trying to search for an exact word/phrase in a text file. I am using Python 3.4
Here is the code I have so far.
import re
def main():
fileName = input("Please input the file name").lower()
term = input("Please enter the search term").lower()
fileName = fileName + ".txt"
regex_search(fileName, term)
def regex_search(file,term):
source = open(file, 'r')
destination = open("new.txt", 'w')
lines = []
for line in source:
if re.search(term, line):
lines.append(line)
for line in lines:
destination.write(line)
source.close()
destination.close()
'''
def search(file, term): #This function doesn't work
source = open(file, 'r')
destination = open("new.txt", 'w')
lines = [line for line in source if term in line.split()]
for line in lines:
destination.write(line)
source.close()
destination.close()'''
main()
In my function regex_search I use regex to search for the particular string. However, I don't know how to search for a particular phrase.
In the second function, search, I split the line into a list and search for the word in there. However, this won't be able to search for a particular phrase because I am searching for ["dog walked"] in ['the','dog','walked'] which won't return the correct lines.

edit: Considering that you don't want to match partial words ('foo' should not match 'foobar'), you need to look ahead in the data stream. The code for that is a bit awkward, so I think regex (your current regex_search with a fix) is the way to go:
def regex_search(filename, term):
searcher = re.compile(term + r'([^\w-]|$)').search
with open(file, 'r') as source, open("new.txt", 'w') as destination:
for line in source:
if searcher(line):
destination.write(line)

Related

updating a leaderboard for noughts and crosses game [duplicate]

How do I search and replace text in a file using Python 3?
Here is my code:
import os
import sys
import fileinput
print ("Text to search for:")
textToSearch = input( "> " )
print ("Text to replace it with:")
textToReplace = input( "> " )
print ("File to perform Search-Replace on:")
fileToSearch = input( "> " )
#fileToSearch = 'D:\dummy1.txt'
tempFile = open( fileToSearch, 'r+' )
for line in fileinput.input( fileToSearch ):
if textToSearch in line :
print('Match Found')
else:
print('Match Not Found!!')
tempFile.write( line.replace( textToSearch, textToReplace ) )
tempFile.close()
input( '\n\n Press Enter to exit...' )
Input file:
hi this is abcd hi this is abcd
This is dummy text file.
This is how search and replace works abcd
When I search and replace 'ram' by 'abcd' in above input file, it works as a charm. But when I do it vice-versa i.e. replacing 'abcd' by 'ram', some junk characters are left at the end.
Replacing 'abcd' by 'ram'
hi this is ram hi this is ram
This is dummy text file.
This is how search and replace works rambcd
As pointed out by michaelb958, you cannot replace in place with data of a different length because this will put the rest of the sections out of place. I disagree with the other posters suggesting you read from one file and write to another. Instead, I would read the file into memory, fix the data up, and then write it out to the same file in a separate step.
# Read in the file
with open('file.txt', 'r') as file :
filedata = file.read()
# Replace the target string
filedata = filedata.replace('abcd', 'ram')
# Write the file out again
with open('file.txt', 'w') as file:
file.write(filedata)
Unless you've got a massive file to work with which is too big to load into memory in one go, or you are concerned about potential data loss if the process is interrupted during the second step in which you write data to the file.
fileinput already supports inplace editing. It redirects stdout to the file in this case:
#!/usr/bin/env python3
import fileinput
with fileinput.FileInput(filename, inplace=True, backup='.bak') as file:
for line in file:
print(line.replace(text_to_search, replacement_text), end='')
As Jack Aidley had posted and J.F. Sebastian pointed out, this code will not work:
# Read in the file
filedata = None
with file = open('file.txt', 'r') :
filedata = file.read()
# Replace the target string
filedata.replace('ram', 'abcd')
# Write the file out again
with file = open('file.txt', 'w') :
file.write(filedata)`
But this code WILL work (I've tested it):
f = open(filein,'r')
filedata = f.read()
f.close()
newdata = filedata.replace("old data","new data")
f = open(fileout,'w')
f.write(newdata)
f.close()
Using this method, filein and fileout can be the same file, because Python 3.3 will overwrite the file upon opening for write.
You can do the replacement like this
f1 = open('file1.txt', 'r')
f2 = open('file2.txt', 'w')
for line in f1:
f2.write(line.replace('old_text', 'new_text'))
f1.close()
f2.close()
You can also use pathlib.
from pathlib2 import Path
path = Path(file_to_search)
text = path.read_text()
text = text.replace(text_to_search, replacement_text)
path.write_text(text)
(pip install python-util)
from pyutil import filereplace
filereplace("somefile.txt","abcd","ram")
Will replace all occurences of "abcd" with "ram".
The function also supports regex by specifying regex=True
from pyutil import filereplace
filereplace("somefile.txt","\\w+","ram",regex=True)
Disclaimer: I'm the author (https://github.com/MisterL2/python-util)
Open the file in read mode. Read the file in string format. Replace the text as intended. Close the file. Again open the file in write mode. Finally, write the replaced text to the same file.
try:
with open("file_name", "r+") as text_file:
texts = text_file.read()
texts = texts.replace("to_replace", "replace_string")
with open(file_name, "w") as text_file:
text_file.write(texts)
except FileNotFoundError as f:
print("Could not find the file you are trying to read.")
Late answer, but this is what I use to find and replace inside a text file:
with open("test.txt") as r:
text = r.read().replace("THIS", "THAT")
with open("test.txt", "w") as w:
w.write(text)
DEMO
With a single with block, you can search and replace your text:
with open('file.txt','r+') as f:
filedata = f.read()
filedata = filedata.replace('abc','xyz')
f.truncate(0)
f.write(filedata)
Your problem stems from reading from and writing to the same file. Rather than opening fileToSearch for writing, open an actual temporary file and then after you're done and have closed tempFile, use os.rename to move the new file over fileToSearch.
My variant, one word at a time on the entire file.
I read it into memory.
def replace_word(infile,old_word,new_word):
if not os.path.isfile(infile):
print ("Error on replace_word, not a regular file: "+infile)
sys.exit(1)
f1=open(infile,'r').read()
f2=open(infile,'w')
m=f1.replace(old_word,new_word)
f2.write(m)
Using re.subn it is possible to have more control on the substitution process, such as word splitted over two lines, case-(in)sensitive match. Further, it returns the amount of matches which can be used to avoid waste of resources if the string is not found.
import re
file = # path to file
# they can be also raw string and regex
textToSearch = r'Ha.*O' # here an example with a regex
textToReplace = 'hallo'
# read and replace
with open(file, 'r') as fd:
# sample case-insensitive find-and-replace
text, counter = re.subn(textToSearch, textToReplace, fd.read(), re.I)
# check if there is at least a match
if counter > 0:
# edit the file
with open(file, 'w') as fd:
fd.write(text)
# summary result
print(f'{counter} occurence of "{textToSearch}" were replaced with "{textToReplace}".')
Some regex:
add the re.I flag, short form of re.IGNORECASE, for a case-insensitive match
for multi-line replacement re.subn(r'\n*'.join(textToSearch), textToReplace, fd.read()), depending on the data also '\n{,1}'. Notice that for this case textToSearch must be a pure string, not a regex!
Besides the answers already mentioned, here is an explanation of why you have some random characters at the end:
You are opening the file in r+ mode, not w mode. The key difference is that w mode clears the contents of the file as soon as you open it, whereas r+ doesn't.
This means that if your file content is "123456789" and you write "www" to it, you get "www456789". It overwrites the characters with the new input, but leaves any remaining input untouched.
You can clear a section of the file contents by using truncate(<startPosition>), but you are probably best off saving the updated file content to a string first, then doing truncate(0) and writing it all at once.
Or you can use my library :D
I got the same issue. The problem is that when you load a .txt in a variable you use it like an array of string while it's an array of character.
swapString = []
with open(filepath) as f:
s = f.read()
for each in s:
swapString.append(str(each).replace('this','that'))
s = swapString
print(s)
I tried this and used readlines instead of read
with open('dummy.txt','r') as file:
list = file.readlines()
print(f'before removal {list}')
for i in list[:]:
list.remove(i)
print(f'After removal {list}')
with open('dummy.txt','w+') as f:
for i in list:
f.write(i)
you can use sed or awk or grep in python (with some restrictions). Here is a very simple example. It changes banana to bananatoothpaste in the file. You can edit and use it. ( I tested it worked...note: if you are testing under windows you should install "sed" command and set the path first)
import os
file="a.txt"
oldtext="Banana"
newtext=" BananaToothpaste"
os.system('sed -i "s/{}/{}/g" {}'.format(oldtext,newtext,file))
#print(f'sed -i "s/{oldtext}/{newtext}/g" {file}')
print('This command was applied: sed -i "s/{}/{}/g" {}'.format(oldtext,newtext,file))
if you want to see results on the file directly apply: "type" for windows/ "cat" for linux:
####FOR WINDOWS:
os.popen("type " + file).read()
####FOR LINUX:
os.popen("cat " + file).read()
I have done this:
#!/usr/bin/env python3
import fileinput
import os
Dir = input ("Source directory: ")
os.chdir(Dir)
Filelist = os.listdir()
print('File list: ',Filelist)
NomeFile = input ("Insert file name: ")
CarOr = input ("Text to search: ")
CarNew = input ("New text: ")
with fileinput.FileInput(NomeFile, inplace=True, backup='.bak') as file:
for line in file:
print(line.replace(CarOr, CarNew), end='')
file.close ()
I modified Jayram Singh's post slightly in order to replace every instance of a '!' character to a number which I wanted to increment with each instance. Thought it might be helpful to someone who wanted to modify a character that occurred more than once per line and wanted to iterate. Hope that helps someone. PS- I'm very new at coding so apologies if my post is inappropriate in any way, but this worked for me.
f1 = open('file1.txt', 'r')
f2 = open('file2.txt', 'w')
n = 1
# if word=='!'replace w/ [n] & increment n; else append same word to
# file2
for line in f1:
for word in line:
if word == '!':
f2.write(word.replace('!', f'[{n}]'))
n += 1
else:
f2.write(word)
f1.close()
f2.close()
def word_replace(filename,old,new):
c=0
with open(filename,'r+',encoding ='utf-8') as f:
a=f.read()
b=a.split()
for i in range(0,len(b)):
if b[i]==old:
c=c+1
old=old.center(len(old)+2)
new=new.center(len(new)+2)
d=a.replace(old,new,c)
f.truncate(0)
f.seek(0)
f.write(d)
print('All words have been replaced!!!')
I have worked this out as an exercise of a course: open file, find and replace string and write to a new file.
class Letter:
def __init__(self):
with open("./Input/Names/invited_names.txt", "r") as file:
# read the list of names
list_names = [line.rstrip() for line in file]
with open("./Input/Letters/starting_letter.docx", "r") as f:
# read letter
file_source = f.read()
for name in list_names:
with open(f"./Output/ReadyToSend/LetterTo{name}.docx", "w") as f:
# replace [name] with name of the list in the file
replace_string = file_source.replace('[name]', name)
# write to a new file
f.write(replace_string)
brief = Letter()
Like so:
def find_and_replace(file, word, replacement):
with open(file, 'r+') as f:
text = f.read()
f.write(text.replace(word, replacement))
def findReplace(find, replace):
import os
src = os.path.join(os.getcwd(), os.pardir)
for path, dirs, files in os.walk(os.path.abspath(src)):
for name in files:
if name.endswith('.py'):
filepath = os.path.join(path, name)
with open(filepath) as f:
s = f.read()
s = s.replace(find, replace)
with open(filepath, "w") as f:
f.write(s)

Print the title from a txt file from the given word

I have a txt file with the following info:
545524---Python foundation---Course---https://img-c.udemycdn.com/course/100x100/647442_5c1f.jpg---Outsourcing Development Work: Learn My Proven System To Hire Freelance Developers
Another line with the same format but different info and continue....
Here on line 1, Python foundation is the course title. If a user has input "foundation" how do I print out Python foundation? It's basically printing the whole title of a course based on the given word.
I can use something like:
input_text = 'foundation'
file1 = open("file.txt", "r")
readfile = file1.read()
if input_text in readfile:
#This prints only foundation keyword not the whole title
I assume that your input file has multiple lines separated by enters in this format:
<Course-id>---<Course-name>---Course---<Course-image-link>---<Desc>
input_text = 'foundation'
file1 = open('file.txt', 'r')
lines = file1.readlines()
for line in lines:
book_title_pattern = r'---([\w\d\s_\.,;:()]+)---'
match = re.search(book_title_pattern, line)
if match:
matched_title = match.groups(1)[0]
if input_text in matched_title:
print(matched_title)
Get the key value that you're searching for. User input perhaps or we'll hard-code it here for demo' purposes.
Open the file and read one line at a time. Use RE to parse the line looking for a specific pattern. Check that we've actually found a token matching the RE criterion then check if it contains the 'key' value. Print result as appropriate.
import re
key = 'foundation'
with open('input.txt') as infile:
for line in map(str.strip, infile):
if (t := re.findall('---([a-zA-Z\s]+)---', line)) and key in t[0]:
print(t[0])
You can use regex to match ---Course name--- using ---([a-zA-Z ]+)---. This will give you all the course names. Then you can check for the user_input in each course and print the course name if you find user_input in it.:
import re
user_input = 'foundation'
file1 = open("file.txt", "r")
readfile = file1.read()
course_name = re.findall('---([a-zA-Z ]+)---', readfile)
for course in course_name:
if user_input in course: #Then check for 'foundation' in course_name
print(course)
Output:
Python foundation

So I just created a text file and copied text from an already existing file using a with loop. How do I open newly created file in same prog?

I'm making a program that takes text from an input file, then you input a file where it copies the already existing file text. Then, I need to replace a few words there and print the count of how many of these words were replaced. This is my code so far, but since with loops close the newly created file, I have no idea how to open it back again for reading and writing and counting. This is my awful code so far:
filename=input("Sisesta tekstifaili nimi: ")
inputFile=open(filename, "r")
b=input("Sisesta uue tekstifaili nimi: ")
uusFail=open(b+".txt", "w+")
f=uusFail
with inputFile as input:
with uusFail as output:
for line in input:
output.write(line)
lines[]
asendus = {'hello':'tere', 'Hello':'Tere'}
with uusFail as infile
for line in infile
for src, target in asendus
line = line, replace(src, target)
lines.append(line)
with uusFail as outfile:
for line in lines:
outfile.write(line)
There are a lot of unnecessary loops in your code. when you read the file, you can treat it as a whole and count the number of occurrences and replace them. Here is a modified version of your code:
infile = input('Enter file name: ')
outfile = input('enter out file: ')
with open(infile) as f:
content = f.read()
asendus = {'hello':'tere', 'Hello':'Tere'}
my_count = 0
for src, target in asendus.items():
my_count += content.count(src)
content = content.replace(src, target)
with open(f'{outfile}.txt','w+' ) as f:
f.write(content)
You need to reopen the file in the second block of code:
with open(b+".txt", "r") as infile:

parse, find chapters, and write out as separate files

I am having difficulty getting the right code to parse out chapters from this ebook and then have the 27 chapters to print out into their own text file. the farthest i've gotten is to print "CHAPTER-1.txt". I don't want to hard code anything and am unsure where i've completely missed the mark.
infile = open('dracula.txt', 'r')
readlines = infile.readlines()
toc_list = readlines[74:185]
toc_text_lines = []
for line in toc_list:
if len(line) > 1:
stripped_line = line.strip()
toc_text_lines.append(stripped_line)
#print(len(toc_text_lines))
chaptitles = []
for text_lines in toc_text_lines:
split_text_line = text_lines.split()
if split_text_line[-1].isdigit():
chaptitles.append(text_lines)
#print(len(chaptitles))
print(chaptitles)
infile.close()
import re
with open('dracula.txt') as f:
book = f.readlines()
while book:
line = book.pop(0)
if "CHAPTER" in line and book.pop(0) == '\n':
for title in chapters_names_list: ['CHAPTER I.', 'CHAPTER II.',
'CHAPTER III.']
with open("{}.txt".format(chapters_names_list), 'w') :
I think you could benefit from generators, suppose one of the ebooks is too big to fit into memory, you will have some issues.
What you can do is construct sort of a data processing pipeline, first look for the file(ebook.txt) in the filesystem, though have in mind that we need all functions to be as general as possible, once we have the filename we open it and yield one line at a time, and finally we scan each line for 'CHAPTER I.', 'CHAPTER II.', etc
import os
import re
import fnmatch
def find_files(pattern, path):
"""
Here you can find all the filenames that match a specific pattern
using shell wildcard pattern that way you avoid hardcoding
the file pattern i.e 'dracula.txt'
"""
for root, dirs, files in os.walk(path):
for name in fnmatch.filter(files, pattern):
yield os.path.join(root, name)
def file_opener(filenames):
"""
Open a sequence of filenames one at a time
and make sure to close the file once we are done
scanning its content.
"""
for filename in filenames:
if filename.endswith('.txt'):
f = open(filename, 'rt')
yield f
f.close()
def chain_generators(iterators):
"""
Chain a sequence of iterators together
"""
for it in iterators:
# Look up yield from if you're unsure what it does
yield from it
def grep(pattern, lines):
"""
Look for a pattern in a line i.e 'CHAPTER I.'
"""
pat = re.compile(pattern)
for line in lines:
if pat.search(line):
yield line
# A simple way to use these functions together
logs = find_files('dracula*', 'Path/to/files')
files = file_opener(logs)
lines = chain_generators(files)
each_line = grep('CHAPTER I.', lines)
for match in each_line:
print(match)
You can build on top of these implementation to accomplish what you're trying to do.
Let me know if this helped.

python: search file for string

I have tried to create a python function which takes in 2 parameters; a file name and a search string. In this case the file name is the script itself (script.py) and the search string is 'name = "JOHN"'
#!/usr/local/bin/python2.7
import os, sys
#################
# Variable string
name = "JOHN"
#################
# Main function
def search_script_for_string(filename, searchString):
f = open(filename,'r') #open the given filename then
filedata = f.read() #assign it to variable then
f.close() #close the open filename
for lines in filedata: #loop through each line in the filedata variable
if searchString in lines: #if search string is found, then do all of this
print ('Found string: %s') % searchString
return True
else: #if not found, then do all of this
print ('Did not find: %s') % searchString
return False
break
#################
# Pass the file name and the search string parameter to the function
search_script_for_string("test.py","name = \"" + name + "\"")
The problem is that it doesn't return expected results:
$ Did not find: name = "JOHN"
When it meant to say:
$ Found string: name = "JOHN"
If anyone can help correct my understanding of where I'm going wrong here, I'd be massively appreciative. Thanks
f.read() returns the entire contents of the file as a single string. You then iterate over those contents -- but iterating over a string yields only 1 character at a time so there is no way a character will contain the substring you are looking for.
def search_script_for_string(filename, searchString):
with open(filename, 'r') as f:
return searchString in f.read()
should do the trick. Alternatively, if you want to search line-by-line:
def search_script_for_string(filename, searchString):
with open(filename, 'r') as f:
for line in f:
return searchString in line
You are iterating over each character of the file by calling for c in f.read().
Use for line in f and you will indeed iterate over each line.
Also prefer the use of with, this makes your code a lot more robust.
So this would be better:
with open('fileName') as f:
for line in f:
#process

Categories

Resources