Parsing txt file for word occurrences python

Parsing txt file for word occurrences python - python

I'm fairly new to python and found a personal project for myself. I am trying to parse through a large text file to find word occurrences, but I cannot seem to get the code to work. It starts like this:
file = 'randomfile.txt'
with open(file) as f:
word = input("Enter a word: ")
line = f.readline()
num_line = 1
while line:
if word in line:
print("line {}: {}".format(num_line, line.strip()))
print("Here are ___ I found with the word" + word)
num_line += 1
f.close()
This code runs, but it does not print anything for the search word, and I can't see why not, unless Python is not reading the text file at that path, or the fourth line of code is not being read properly. How can I go about fixing this?

This will work:
# Ask for the search term once, before the file is opened.
word = input("Enter a word: ")
matches = 0
with open("randomfile.txt", "r") as f:
    # Iterating over the file object yields one line at a time, so every
    # line is examined rather than only the first one.
    for num_line, line in enumerate(f, start=1):
        if word in line:
            print("line {}: {}".format(num_line, line.strip()))
            matches += 1
# Summary is printed after the whole file has been scanned.
print("Here are {} lines I found with the word {}".format(matches, word))

The problem with your code is that you only read the first line and never loop through the rest of the file.
with open(file) as f:
word = input("Enter a word: ")
line = f.readline() # this is the only place you read a line
num_line = 1
while line:
if word in line:
print("line {}: {}".format(num_line, line.strip()))
print("Here are ___ I found with the word" + word)
num_line += 1
You should read the next line in the while loop, like this:
with open(file) as f:
word = input("Enter a word: ")
line = f.readline()
num_line = 1
while line:
if word in line:
print("line {}: {}".format(num_line, line.strip()))
line = f.readline() # <-- read the next line
print("Here are ___ I found with the word" + word)
num_line += 1
Now you are reading through the file. The next problem is you only increment the line number num_line += 1 once the while loop is over, you need to move this to within the loop so it tracks how many lines have been processed.
with open(file) as f:
word = input("Enter a word: ")
line = f.readline()
num_line = 1
while line:
if word in line:
print("line {}: {}".format(num_line, line.strip()))
line = f.readline() # <-- read the next line
num_line +=1 # <-- increase the counter in the loop
print("Here are ___ I found with the word" + word)
You don't need f.close(), the with_statement automatically closes files. You can also loop directly over the file pointer f, to read each line, like this:
file = 'randomfile.txt'
with open(file) as f:
    word = input("Enter a word: ")
    num_line = 1
    for line in f:  # step through each line of the file
        if word in line:
            print("line {}: {}".format(num_line, line.strip()))
        num_line += 1  # <-- increase the counter in the loop
    # Printed once, after every line has been checked.
    print("Here are ___ I found with the word" + word)
# f.close() -- not needed, the with-block closes the file
I will leave it for you to fix the print statement.

You can do something like this:
word = input("Enter a word: ")
# Lower-case the search term once instead of on every line.
needle = word.lower()
# `file` is the path variable from the earlier snippets
# (file = 'randomfile.txt').
with open(file) as f:
    # start=1 so the printed number is the 1-based line number; the
    # default of 0 would report every match one line too early.
    for idx, line in enumerate(f, start=1):
        # lower() makes the match case-insensitive and split() compares
        # whole whitespace-separated tokens, so word = 'aaa' does not
        # match a line containing only 'AAAaaa'.
        if needle in line.lower().split():
            print("line {}: {}".format(idx, line.strip()))

Related

Deleting a specific word from a text file in python

I used this code to delete a word from a text file.
f = open('./test.txt','r')
a = ['word1','word2','word3']
lst = []
for line in f:
for word in a:
if word in line:
line = line.replace(word,'')
lst.append(line)
f.close()
f = open('./test.txt','w')
for line in lst:
f.write(line)
f.close()
But for some reason if the words have the same characters, all those characters get deleted. So for e.g
in my code:
def cancel():
global refID
f1=open("refID.txt","r")
line=f1.readline()
flag = 0
while flag==0:
refID=input("Enter the reference ID or type 'q' to quit: ")
for i in line.split(','):
if refID == i:
flag=1
if flag ==1:
print("reference ID found")
cancelsub()
elif (len(refID))<1:
print("Reference ID not found, please re-enter your reference ID\n")
cancel()
elif refID=="q":
flag=1
else:
print("reference ID not found\n")
menu()
def cancelsub():
global refIDarr, index
refIDarr=[]
index=0
f = open('flightbooking.csv')
csv_f = csv.reader(f)
for row in csv_f:
refIDarr.append(row[1])
for i in range (len(refIDarr)):
if refID==refIDarr[i]:
index=i
print(index)
while True:
proceed=input("You are about to cancel your flight booking, are you sure you would like to proceed? y/n?: ")
while proceed>"y" or proceed<"n" or (proceed>"n" and proceed<"y") :
proceed=input("Invalid entry. \nPlease enter y or n: ")
if proceed=="y":
Continue()
break
elif proceed=="n":
main_menu
break
exit
break
def Continue():
lines = list()
with open('flightbooking.csv', 'r') as readFile:
reader = csv.reader(readFile)
for row in reader:
lines.append(row)
for field in row:
if field ==refID:
lines.remove(row)
break
with open('flightbooking.csv', 'w') as writeFile:
writer = csv.writer(writeFile)
writer.writerows(lines)
f = open('refID.txt','r')
a=refIDarr[index]
print(a)
lst = []
for line in f:
for word in a:
if word in line:
line = line.replace(word,'')
lst.append(line)
print(lst)
f.close()
f = open('refID.txt','w')
for line in lst:
f.write(line)
f.close()
print("Booking successfully cancelled")
menu()
When the code is run, the refID variable has one word stored in it, and it should replace just that word with a blank space, but it takes that word for e.g 'AB123', finds all other words which might have an 'A' or a 'B' or the numbers, and replace all of them. How do I make it so it only deletes the word?
Text file before running code:
AD123,AB123
Expected Output in the text file:
AD123,
Output in text file:
D,
Edit: I have added the entire code, and maybe you can help now after seeing that the array is being appended to and then being used to delete from a text file.

here's my opinion.
refIDarr = ["AB123"]
a = refIDarr[0] => a = "AB123"
strings in python are iterable, so when you do for word in a, you're getting 5 loops where each word is actually a letter.
Something like the following is being executed.
if "A" in line:
line = line.replace("A","")
if "B" in line:
line = line.replace("B","")
if "1" in line:
line = line.replace("1","")
if "2" in line:
line = line.replace("2","")
if "3" in line:
line = line.replace("3","")
The correct way to do this is to loop over refIDarr:
for word in refIDarr:
line = line.replace(word,'')
NOTE: You don't need the if statement, since if the word is not in the line it will return the same line as it was.
"abc".replace("bananan", "") => "abc"
Here's a working example:
refIDarr = ["hello", "world", "lol"]

# Read everything first; the file is reopened for writing below.
with open('mytext.txt', "r") as f:
    data = f.readlines()

# Strip each listed word from every line. str.replace returns the line
# unchanged when the word is absent, so no membership test is needed.
for word in refIDarr:
    data = [line.replace(word, "") for line in data]

# Opening with "w" truncates the file before the cleaned lines are written.
with open("mytext.txt", "w") as newf:
    newf.writelines(data)

The problem is here:
a=refIDarr[index]
If refIDarr is a list of words, accessing a specific index makes a a single word. Later, when you iterate over a (for word in a:), word is a single letter and not a word as you expect, so the loop ends up replacing the individual characters of the word instead of the word itself in your file.
To avoid that, remove a=refIDarr[index] and change your loop to be:
for line in f:
for word in refIDarr:
if word in line:
line = line.replace(word,'')

Why is my program not reading the first line of code in the referenced file(fileName)?

def main():
read()
def read():
fileName=input("Enter the file you want to count: ")
infile=open(fileName , "r")
text=infile.readline()
count=0
while text != "":
text=str(count)
count+=1
text=infile.readline()
print(str(count)+ ": " + text)
infile.close()
main()
-the referenced .txt file has only two elements
44
33
-the output of this code should look like
1: 44
2: 33
-my output is
1: 33
2:
im not sure why the program is not picking up the first line in the referenced .txt file. The line numbers are correct however 33 should be second to 44.

The reason is explained in the comments:
def main():
read()
def read():
fileName=input("Enter the file you want to count: ")
infile=open(fileName , "r")
text=infile.readline() ##Reading the first line here but not printing
count=0
while text != "":
text=str(count)
count+=1
text=infile.readline() ##Reading the 2nd line here
print(str(count)+ ": " + text) ##Printing the 2nd line here, missed the first
##line
infile.close()
main()
Modify the program as:
def main():
    read()


def read():
    """Prompt for a file name and print each of its lines prefixed by a 1-based number."""
    fileName = input("Enter the file you want to count: ")
    # The with-block closes the file even if reading raises.
    with open(fileName, "r") as infile:
        text = infile.readline()
        count = 1  # Set count to 1
        while text != "":
            # readline() keeps the trailing newline; strip it so print()
            # does not emit an extra blank line after every row.
            print(str(count) + ": " + text.rstrip("\n"))  # Print 1st line here
            count = count + 1  # Increment count to 2
            text = infile.readline()  # Read 2nd line


main()

def main():
    read()


def read():
    """Prompt for a file name and print its lines as '<number>: <text>'."""
    fileName = input("Enter the file you want to count: ")
    with open(fileName, 'r') as f:
        # Stream the file instead of building one big joined string, and
        # use ': ' so the output matches the requested '1: 44' format.
        for number, text in enumerate(f, start=1):
            print('{}: {}'.format(number, text.rstrip()))


main()

I'm very confused by your read function. You start by reading the first line into text:
text=infile.readline()
Presumably at this point text contains 44.
You then immediately demolish this value before you've done anything with it by overwriting it with:
text = str(count)
ie you read two lines before printing anything at all.

You should print the value of text before you overwrite it with the next readline.
Simply move the print statement before readline:
while text != "":
count+=1
print(str(count)+ ": " + text)
text=infile.readline()

Memory Error while running python script on 4GB file

I am trying to count the number of words that have a length between 1 and 5. The file size is around 4GB and I am getting a memory error.
import os
files = os.listdir('C:/Users/rram/Desktop/')
for file_name in files:
file_path = "C:/Users/rram/Desktop/"+file_name
f = open (file_path, 'r')
text = f.readlines()
update_text = ''
wordcount = {}
for line in text:
arr = line.split("|")
word = arr[13]
if 1<=len(word)<6:
if word not in wordcount:
wordcount[word] = 1
else:
wordcount[word] += 1
update_text+= '|'.join(arr)
print (wordcount) #print update_text
print 'closing', file_path, '\t', 'total files' , '\n\n'
f.close()
At the end i get a MemoryError on this line text = f.readlines()
Can you please help me optimize it?

As suggested in the comments you should read the file line by line and not the entire file.
For example :
count = 0
with open('words.txt', 'r') as f:
    # Iterating the file object reads one line at a time, so the whole
    # file is never held in memory at once.
    for line in f:
        for word in line.split():
            if 1 <= len(word) <= 5:
                count = count + 1
print(count)
EDIT :
If you only want to count the words in 14-th column and split by | instead then :
count = 0
with open('words.txt', 'r') as f:
    # One line at a time keeps memory use flat regardless of file size.
    for line in f:
        fields = line.split("|")
        # Index 13 is the 14th column. The length check skips lines with
        # fewer than 14 fields instead of raising IndexError.
        if len(fields) > 13 and 1 <= len(fields[13]) <= 5:
            count += 1
print(count)
Note that you should avoid writing this without first checking how many fields the line has:
arr = line.split("|")
word = arr[13]
since the line may contain fewer than 14 fields, in which case arr[13] raises an IndexError.

Not overwriting text file properly (Python)

My program is supposed to take a text file, print the contents, count the lines, ask for a string input, remove every occurrence of that string, say how many times the string was removed, and then output the new contents of the file. We were asked to use one file, not an input and an output file.
My code works and does everything it's supposed to up until I try to store the changes in the file at the end of the char_count function and the print_output function seems to not be working right at all.
If I have an input file of contents:
Apples
Oranges
Bananas
Apples
Oranges
Bananas
if I try to remove Bananas, the resulting file contents for the input file is:
ApplesOrangesApplesOrangesles
Oranges
Bananas
I've been trying to figure out what's going on with no progress, and our course textbook doesn't seem to mention overwriting input files, but we're required to do it for an assignment. What is wrong with my last two functions?
def main():
input_file_name = input("Please Enter the name of your text file: ")
infile = open(input_file_name, "r+")
print()
print("---------------------------------------")
print("THE FILE CONTENTS ARE")
print("---------------------------------------")
print_file(infile)
print("---------------------------------------")
count_lines(infile)
print("---------------------------------------")
input_string = input("Please enter the word or string of words you want to remove from the text file: ")
print("---------------------------------------")
char_count(infile, input_string)
print("---------------------------------------")
print("THE NEW FILE CONTENTS ARE")
print_output(infile)
print("---------------------------------------")
infile.close()
def print_file(infile):
infile.seek(0)
allLines = infile.readlines()
for line in allLines:
text = line.rstrip()
print(text)
def count_lines(infile):
infile.seek(0)
allLines = infile.readlines()
count = 0
char = " "
for line in allLines :
text = line.rstrip()
while char != "":
char = infile.read(1)
count = count + 1
print("THE NUMBER OF LINES IS: %d " % count)
def char_count(infile, input_string) :
count = 0
infile.seek(0)
allLines = infile.readlines()
infile.seek(0)
for line in allLines:
while input_string in line:
line = line.replace(input_string, "")
count = count + 1
text = line.rstrip()
infile.write(text)
print("NUMBER OF OCCURRENCES REMOVED IS: %d" % count)
def print_output(infile):
infile.seek(0)
allLines = infile.readlines()
for line in allLines:
text = line.rstrip()
print(text)
main()

you have to truncate the file first to get the required output.
def main():
    """Run the print / count / remove / re-print workflow on one user-chosen file.

    Prompts for the file name and for the string to remove, then calls
    print_file, count_lines, char_count and print_output on the same
    open handle.
    """
    separator = "---------------------------------------"
    input_file_name = input("Please Enter the name of your text file: ")
    # "r+" allows reading and writing through one handle; the with-block
    # closes it even if one of the steps below raises.
    with open(input_file_name, "r+") as infile:
        print()
        print(separator)
        print("THE FILE CONTENTS ARE")
        print(separator)
        print_file(infile)
        print(separator)
        count_lines(infile)
        print(separator)
        input_string = input("Please enter the word or string of words you want to remove from the text file: ")
        print(separator)
        char_count(infile, input_string)
        print(separator)
        print("THE NEW FILE CONTENTS ARE")
        print_output(infile)
        print(separator)
def print_file(infile):
    """Print every line of *infile*, from the beginning, without trailing whitespace.

    infile: an open, seekable text file object.
    """
    # Rewind first: an earlier step may have left the position at EOF.
    infile.seek(0)
    for line in infile:
        print(line.rstrip())
def count_lines(infile):
    """Print how many lines *infile* contains.

    infile: an open, seekable text file object.
    """
    infile.seek(0)
    # Count by iterating the lines directly. The previous per-character
    # read(1) loop ran after readlines() had already consumed the file,
    # so it only ever saw EOF and contributed nothing to the count.
    count = sum(1 for _ in infile)
    print("THE NUMBER OF LINES IS: %d " % count)
def char_count(infile, input_string):
    """Remove every occurrence of *input_string* from *infile*, in place.

    The file is read fully, emptied, and rewritten with the cleaned
    lines; lines that end up blank are dropped. Prints the number of
    occurrences removed.

    infile: an open, seekable text file object opened for read and write.
    input_string: the text to remove; an empty string removes nothing.
    """
    infile.seek(0)
    allLines = infile.readlines()

    count = 0
    kept = []
    for line in allLines:
        # The emptiness check matters: "" is contained in every string, so
        # without it this loop would never terminate. Looping (rather than
        # a single replace) also removes occurrences that only form once
        # an earlier one has been cut out.
        while input_string and input_string in line:
            count += line.count(input_string)
            line = line.replace(input_string, "")
        text = line.rstrip()
        if text != "":
            kept.append(text + "\n")  # one cleaned entry per output line

    # Empty the file before rewriting; otherwise the tail of the old,
    # longer content would survive after the new, shorter content.
    infile.seek(0)
    infile.truncate()
    infile.writelines(kept)
    print("NUMBER OF OCCURRENCES REMOVED IS: %d" % count)
def print_output(infile):
    """Print the current contents of *infile*, one line at a time, without trailing whitespace.

    infile: an open, seekable text file object.
    """
    # The preceding rewrite leaves the position at the end of the file.
    infile.seek(0)
    for line in infile:
        print(line.rstrip())
main()

Indexing and search in a text file

I have a text file that contains the contents of a book. I want to take this file and build an index which allows the user to search through the file to make searches.
The search would consist of entering a word. Then, the program would return the following:
Every chapter which includes that word.
The line number of the line
which contains the word.
The entire line the word is on.
I tried the following code:
infile = open(file)
Dict = {}
word = input("Enter a word to search: ")
linenum = 0
line = infile.readline()
for line in infile
linenum += 1
for word in wordList:
if word in line:
Dict[word] = Dict.setdefault(word, []) + [linenum]
print(count, word)
line = infile.readline()
return Dict
Something like this does not work and seems too awkward for handling the other modules which would require:
An "or" operator to search for one word or another
An "and" operator to search for one word and another in the same chapter
Any suggestions would be great.

def classify_lines_on_chapter(book_contents):
    """Return, for every line, the chapter heading that line falls under.

    A line whose cased characters are all upper-case is treated as a
    chapter heading. The result has exactly one entry per input line, so
    it can be indexed with the same line index as *book_contents*.
    Lines before the first heading map to None.

    book_contents: list of lines (strings).
    """
    lines_vs_chapter = []
    # Initialised up front so text before the first heading does not
    # raise UnboundLocalError.
    current_chapter = None
    for line in book_contents:
        if line.isupper():
            current_chapter = line.strip()
        # Appended for every line, heading or not, to keep indices aligned.
        lines_vs_chapter.append(current_chapter)
    return lines_vs_chapter
def classify_words_on_lines(book_contents):
    """Build an index mapping each word to the line indices it appears on.

    Words are the whitespace-separated tokens of each line with leading
    and trailing punctuation stripped. A word is recorded at most once
    per line. Indices are 0-based positions in *book_contents*.

    book_contents: list of lines (strings).
    """
    # Imported here because no module-level `import string` is visible
    # alongside this function.
    import string

    words_vs_lines = {}
    for i, line in enumerate(book_contents):
        # A set, so a word repeated on one line is indexed only once.
        unique_words = {token.strip(string.punctuation) for token in line.split()}
        for word in unique_words:
            if word:  # tokens made only of punctuation strip down to ""
                words_vs_lines.setdefault(word, []).append(i)
    return words_vs_lines
def main():
    """Index book.txt, then answer word searches until a blank entry is given.

    For each hit, prints the line number within the file, the chapter
    heading the line falls under, and the line itself.
    """
    skip_lines = 93  # leading lines of book.txt left out of the index
    with open('book.txt') as book:
        book_contents = book.readlines()[skip_lines:]

    # Both lookups are keyed by position in book_contents.
    lines_vs_chapter = classify_lines_on_chapter(book_contents)
    words_vs_lines = classify_words_on_lines(book_contents)

    while True:
        word = input("Enter word to search - ")
        # Enter a blank input to exit
        if not word:
            break
        line_numbers = words_vs_lines.get(word, None)
        if not line_numbers:
            print("Word not found!!\n")
            continue
        for line_number in line_numbers:
            line = book_contents[line_number]
            chapter = lines_vs_chapter[line_number]
            # +1 for 1-based numbering, +skip_lines to give the position
            # in the original file rather than in the trimmed list.
            print("Line " + str(line_number + 1 + skip_lines))
            print("Chapter '" + str(chapter) + "'")
            print(line)


if __name__ == '__main__':
    main()
Try it on this input file. Rename it as book.txt before running it.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Parsing txt file for word occurrences python - python

This will work: with open("randomfile.txt",'r') as f: word = input("Enter a word: ") line = f.readline() num_line = 1 for words in line.split(): if word in words: print("line {}: {}".format(num_line, line.strip())) print("Here are ___ I found with the word" + word) num_line += 1

Related

Deleting a specific word from a text file in python

Why is my program not reading the first line of code in the referenced file(fileName)?

Memory Error while running python script on 4GB file

Not overwriting text file properly (Python)

Indexing and search in a text file

Categories

Resources