compare an exact word with the txt file - python

I am trying to get exact word matches from my file, along with their line numbers.
For example, when I search for abc10 it also returns every longer match, e.g. abc102, abc103, etc.
How can I restrict my code so that it only prints the lines containing exactly the word I searched for?
here is my code!
# Pass 1: read the file line by line and remember (line number, offset)
# for every line that contains the keyword anywhere in it (substring match).
lineNo = 0
linesFound = []
inFile = open('rxmop.txt', 'r')
sKeyword = input("enter word ")
while True:
    offset = inFile.tell()  # position of the line about to be read
    text = inFile.readline()
    if not text:  # readline() returns "" only at end of file
        break
    if sKeyword in text:
        print("Found at line: " + str(lineNo))
        linesFound.append((lineNo, offset))
    lineNo += 1

# Pass 2: let the user re-display any recorded line; -1 ends the session.
while True:
    command = int(input("Enter the line you want to view: "))
    if command == -1:
        break
    for foundNo, foundOffset in linesFound:
        if command == foundNo:
            # Jump straight back to the stored offset and re-read that line.
            inFile.seek(foundOffset)
            print("The line at position " + str(foundOffset) + "is: " + inFile.readline())

"like when i search for abc10 it gives me all the possible answers e.g abc102 abc103 etc"
You split each record and compare whole "words" only.
# Match whole whitespace-separated words instead of substrings, so that
# "RXOTG-10" does not also match "RXOTG-100".
to_find = "RXOTG-10"
list_of_possibles = ["RXOTG-10 QTA5777 HYB SY G12",
                     "RXOTG-100 QTA9278 HYB SY G12"]
for rec in list_of_possibles:
    # split() with no arguments already ignores leading/trailing whitespace.
    words_list = rec.split()
    if to_find in words_list:
        print("found", rec)
    else:
        print(" NOT found", rec)

Related

ValueError: substring not found on lip reading code

This is what I have gotten while trying to run step 3 of this source code:
https://github.com/carykh/lazykh
Error:
Traceback (most recent call last):
File "C:\Users\User\Desktop\lazykh-main\code\scheduler.py", line 93, in
OS_nextIndex = originalScript.index(wordString,OS_IndexAt)+len(wordString)
ValueError: substring not found
Code:
import argparse
import os.path
import json
import numpy as np
import random
def addPhoneme(p, t):
    """Record phoneme *p* at time *t* in the phoneme section, strings[4].

    Nothing is written when *p* equals the module-level ``prevPhoneme``;
    otherwise a "<t to 3 decimals>,phoneme,<p>" line is appended and
    ``prevPhoneme`` is updated to *p*.
    """
    global prevPhoneme
    if p == prevPhoneme:
        return
    strings[4] += format(t, '.3f') + ",phoneme," + p + "\n"
    prevPhoneme = p
def pickNewPose(t):
    """Pick a random pose for time *t* and log it in the pose section, strings[3].

    The new pose is drawn from 0..POSE_COUNT-1 and is redrawn until it
    differs from both the current ``pose`` and ``prevPose``. The module-level
    ``prevPose``/``pose`` are then shifted, and ``prevPhoneme`` is reset to
    "na".
    """
    global pose, prevPose, prevPhoneme
    candidate = -1
    while candidate in (-1, pose, prevPose):
        candidate = int(random.random() * POSE_COUNT)
    prevPose, pose = pose, candidate
    strings[3] += format(t, '.3f') + ",pose," + str(pose) + "\n"
    prevPhoneme = "na"
# One text buffer per section of the schedule file:
# 0 = paragraph, 1 = emotion, 2 = image, 3 = pose, 4 = phoneme.
strings = [""] * 5
POSE_COUNT = 5

# Emotion name (as written inside <...> in the script) -> numeric id.
emotions = {
    "explain": 0,
    "happy": 1,
    "sad": 2,
    "angry": 3,
    "confused": 4,
    "rq": 5,
}

# [phoneme, mouth shape] pairs; a two-letter shape is emitted as two
# successive mouth positions by the scheduling loop.
mouthList = [
    ["aa", "a"], ["ae", "a"], ["ah", "a"], ["ao", "a"], ["aw", "au"],
    ["ay", "ay"], ["b", "m"], ["ch", "t"], ["d", "t"], ["dh", "t"],
    ["eh", "a"], ["er", "u"], ["ey", "ay"], ["f", "f"], ["g", "t"],
    ["hh", "y"], ["ih", "a"], ["iy", "ay"], ["jh", "t"], ["k", "t"],
    ["l", "y"], ["m", "m"], ["n", "t"], ["ng", "t"], ["ow", "au"],
    ["oy", "ua"], ["p", "m"], ["r", "u"], ["s", "t"], ["sh", "t"],
    ["t", "t"], ["th", "t"], ["uh", "u"], ["uw", "u"], ["v", "f"],
    ["w", "u"], ["y", "y"], ["z", "t"], ["zh", "t"],
    # For unknown phonemes, the stick figure will just have a closed mouth ("mmm")
    ["oov", "m"],
]
mouths = dict(mouthList)

ENDING_PHONEME = "m"
# Punctuation that makes the scheduling loop pick a new pose.
STOPPERS = [",", ";", ".", ":", "!", "?"]

parser = argparse.ArgumentParser(description='blah')
parser.add_argument('--input_file', type=str, help='the script')
args = parser.parse_args()
INPUT_FILE = args.input_file

# <input_file>.txt holds the script text; <input_file>.json holds the
# per-word data that the loop below reads from data['words'].
with open(INPUT_FILE + ".txt", "r+") as f:
    originalScript = f.read()

with open(INPUT_FILE + ".json", "r+") as f:
    fileData = f.read()

data = json.loads(fileData)
WORD_COUNT = len(data['words'])

# Mutable scheduling state shared with addPhoneme() and pickNewPose().
pose = -1
prevPose = -1
prevPhoneme = "na"
emotion = "0"
pararaph = 0
image = 0
OS_IndexAt = 0  # index into originalScript up to which text has been consumed

# Every section starts with an event at time 0.
pickNewPose(0)
strings[1] += "0,emotion,0\n"
strings[0] += "0,paragraph,0\n"
strings[2] += "0,image,0\n"
strings[4] += "0,phoneme,m\n"
# Walk the words in order. For each word that has a "start" time, take the
# slice of the script between the previous word and the end of this word
# ("nextDigest") and emit schedule events for whatever that slice contains.
for i, word in enumerate(data['words']):
    if "start" not in word:
        continue
    wordString = word["word"]
    timeStart = word["start"]

    # str.index raises ValueError when wordString does not occur in
    # originalScript at or after the given start position.
    OS_nextIndex = originalScript.index(wordString, OS_IndexAt) + len(wordString)
    if "<" in originalScript[OS_IndexAt:]:
        tagStart = originalScript.index("<", OS_IndexAt)
        tagEnd = originalScript.index(">", OS_IndexAt)
        # The match ended inside the <...> span: search again after the tag.
        if tagStart < OS_nextIndex <= tagEnd:
            OS_nextIndex = originalScript.index(wordString, tagEnd) + len(wordString)
    nextDigest = originalScript[OS_IndexAt:OS_nextIndex]

    # On a line break, close the mouth at the end of the previous word if it
    # was left on one of these shapes. (For i == 0, index i-1 is the last word.)
    previousWord = data['words'][i - 1]
    if (
        "\n" in nextDigest
        and previousWord['case'] != 'not-found-in-audio'
        and prevPhoneme in ("a", "f", "u", "y")
    ):
        addPhoneme("m", previousWord["end"])

    # One pose change per stopper character type found in the slice.
    pickedPose = False
    for stopper in STOPPERS:
        if stopper in nextDigest:
            pickNewPose(timeStart)
            pickedPose = True

    # An <emotion> tag in the slice switches the current emotion.
    if "<" in nextDigest:
        leftIndex = nextDigest.index("<") + 1
        rightIndex = nextDigest.index(">")
        emotion = emotions[nextDigest[leftIndex:rightIndex]]
        strings[1] += format(timeStart, '.3f') + ",emotion," + str(emotion) + "\n"
        prevPhoneme = "na"

    if "\n\n" in nextDigest:
        pararaph += 1
        image += 1  # The line of the script advances 2 lines whenever we hit a /n/n.
        strings[0] += format(timeStart, '.3f') + ",paragraph," + str(pararaph) + "\n"
        prevPhoneme = "na"

    if "\n" in nextDigest:
        image += 1
        strings[2] += format(timeStart, '.3f') + ",image," + str(image) + "\n"
        prevPhoneme = "na"
        if not pickedPose:
            pickNewPose(timeStart)  # A new image means we also need to have a new pose

    # Convert each phone of the word into one or two mouth shapes.
    timeAt = timeStart
    for phone in word["phones"]:
        timeAt += phone["duration"]
        phoneString = phone["phone"]
        if phoneString == "sil":
            truePhone = "m"
        else:
            # Phone names look like "<phoneme>_<suffix>"; only the part
            # before the underscore is looked up.
            truePhone = mouths[phoneString[:phoneString.index("_")]]
        if len(truePhone) == 2:
            # Two-letter shape: first half at the phone start, second half
            # midway through it.
            addPhoneme(truePhone[0], timeAt - phone["duration"])
            addPhoneme(truePhone[1], timeAt - phone["duration"] * 0.5)
        else:
            addPhoneme(truePhone, timeAt - phone["duration"])
    OS_IndexAt = OS_nextIndex

# Write the five sections, separated by "SECTION" lines.
with open(INPUT_FILE + "_schedule.csv", "w+") as f:
    for i, section in enumerate(strings):
        f.write(section)
        if i < len(strings) - 1:
            f.write("SECTION\n")
print(f"Done creating schedule for {INPUT_FILE}.")
The "ValueError: substring not found" error occurs when you use the index function to find the index of a substring in a string that does not contain it within the specified (or default) section.
The index method takes 3 parameters:
value
start
end
and it searches for the value between start and end.
So, the error occurred because the substring was not found in the section where it was searched for. The line of
OS_nextIndex = originalScript.index(wordString,tagEnd)+len(wordString)
searches for wordString, starting from tagEnd and searches for the likes of
<span>yourwordstring</span>
, but in your case it was not found. You can do one of the following to solve the issue:
you can fix your input if it should always have a match for the search
you can handle the error when the index throws the error
you can use find instead, see https://bobbyhadz.com/blog/python-valueerror-substring-not-found
Note that find also has three parameters, as you can read from https://www.w3schools.com/python/ref_string_find.asp

Truncate ''space'' issue x/002

import re
# Read the stored counter, wait for any input, then write back counter + 1.
with open("./teste/counter.txt", "r+") as count:
    countread = count.read()
    inputvar = input("Counting - write anything: ")
    if countread != "":
        # NOTE(review): truncate(0) resizes the file but, per the io docs,
        # does not move the stream position that read() left at the end of
        # the old content -- presumably the source of the '\x00' padding.
        count.truncate(0)
        countread = countread.replace(' ', '')
        countplus = int(countread) + 1
        print(countread)
        count.write(str(countplus))
    else:
        print("Countread is ''None''. Adding to text file number ''1''.")
        count.write('1')
I am trying to erase the file with count.truncate(0) but after it adds 1, and goes to 2 in my text file, at 3 I get the error:
ValueError: invalid literal for int() with base 10: '\x002'
For the line ''countplus = ...''
EDIT: By the way the ''countread replace'' was a try to fix this issue.
Fixed it with this
# Interactive counter: each time the user enters anything, the number stored
# in counter.txt is replaced by itself plus one. Runs until interrupted.
while True:
    with open("./teste/counter.txt", "r+") as count:
        # An empty (or whitespace-only) file counts as 0.
        countread = count.read().strip() or "0"
        inputvar = input("Counting " + countread + " write anything: ")
        countplus = int(countread) + 1
        # truncate() does not move the stream position, so rewind first;
        # otherwise the new digits are written after a gap of '\x00' bytes.
        count.seek(0)
        count.truncate()
        count.write(str(countplus))

Identifying spaces between commas

I need to identify whether there's a space between a number and a comma, in which case that number is invalid. So if the number has more or fewer than 2 decimal places and/or whitespace between the commas, then it is INVALID, but if it has no whitespace between the commas and has exactly 2 decimal places, then it is a VALID number. That's why the second number in Line 1 (1.02) is VALID.
There's two methods, I prefer to work on method 2 but I thought if I put two methods it might help any of you to add on
#-----------Method 1------------------------------------------
# Label every comma-separated item of every line of file.txt, echoing the
# labels to stdout and to output2.txt.
res = 0
outfile = "output2.txt"
baconFile = open(outfile, "wt")
index = 0
invalid_string = "INVALID"
valid_string = "VALID"
with open('file.txt') as file:
    for index, line in enumerate(file, start=1):
        carrera = ''
        header = "Line {}: ".format(index)
        print(header, end='')
        baconFile.write(header)
        for number in line.strip().split(','):
            # Text after the last '.' (the whole item when there is no '.').
            decimals = number.split('.')[-1]
            if len(decimals) == 2:
                carrera = valid_string
            else:
                carrera = invalid_string
            # NOTE(review): len() returns an int, so comparing it with " "
            # is never true; this whitespace check has no effect as written.
            if len(number.split(',')[-1]) == " ":  # checking for whitespace
                carrera = invalid_string
            print(carrera, end=' ')
            baconFile.write(carrera + " ")
        print()
        baconFile.write('\n')
baconFile.close()
#-----------Method 2------------------------------------------
# Same labelling as Method 1, but each line's labels are accumulated in the
# string `o` (which this snippet, as shown, builds but never outputs).
res = 0
outfile = "output2.txt"
baconFile = open(outfile, "wt")
index = 0
invalid_string = "INVALID"
valid_string = "VALID"
with open('file.txt') as file:
    for index, line in enumerate(file, start=1):
        o = "Line {}: ".format(index)
        for x in line.strip().split(','):
            # Text after the last '.' (the whole item when there is no '.').
            decimals = x.split('.')[-1]
            if len(decimals) == 2:
                o += valid_string + " "
            else:
                o += invalid_string + " "
            # NOTE(review): len() returns an int, so comparing it with " "
            # is never true; this branch never runs as written.
            if len(x.split(',')[-1]) == " ":
                o += valid_string + " "
Here's my list of numbers in Text.file:
1,1.02, 123.0005
1.02, 1.02 , 1.02
Expected:
Line 1: INVALID VALID INVALID
Line 2: VALID INVALID INVALID (since there's spaces between the last number that's why it is INVALID)
ACTUAL:
Line 1: INVALID VALID INVALID
Line 2: VALID INVALID VALID
You can split the strings on "," and decide whether each string is valid or invalid based on whether it starts with whitespace.
# Read the file, keeping only the lines that contain something.
with open('file.txt') as fp:
    lines = [line for line in fp if line.strip()]

# Label every comma-separated item of every kept line.
for idx, line in enumerate(lines, start=1):
    # Remove only the line terminator: otherwise the '\n' on the last item is
    # counted as a decimal character and a valid "1.02" is reported INVALID.
    # Spaces are deliberately kept, because they make an item invalid.
    items = line.rstrip('\n').split(',')
    # An item is valid if it doesn't start with a space, has a decimal part,
    # and that decimal part is exactly two characters long.
    res = [
        'VALID'
        if not item.startswith(' ') and '.' in item and len(item.split('.')[1]) == 2
        else 'INVALID'
        for item in items
    ]
    print("Line {}: {}".format(idx, ' '.join(res)))
The output will be
Line 1: INVALID VALID INVALID
Line 2: VALID INVALID INVALID
try this:
# Label each comma-separated item by whether it contains a space.
line = "1,1.02, 123.0005"
reslt = line.split(",")
Res = " "
for i in reslt:
    line1 = "INVALID " if " " in i else "VALID "
    Res += line1
print("line1:" + Res)
READ from file :
# Same check as above, applied to every line of file.txt.
# The line counter must be initialised before it is incremented; a bare
# `nbline` expression raises NameError.
nbline = 0
with open('file.txt') as f:
    for line in f:
        print(line)
        reslt = line.split(",")
        Res = " "
        for i in reslt:
            # An item containing a space anywhere is reported as INVALID.
            if " " in i:
                line1 = "INVALID "
            else:
                line1 = "VALID "
            Res += line1
        nbline = nbline + 1
        print("line {}:{}".format(nbline, Res))
output:
line1: VALID VALID INVALID
A list comprehension based on splitting on commas, and a little string trickery would be much simpler:
line = "1,1.02, 123.0005"
# bool * str: True repeats "IN" once and False yields "", so the "IN" prefix
# appears only for items that contain a space.
labels = ["IN" * (" " in s) + "VALID" for s in line.split(",")]
result = " ".join(labels)
print(result)  # VALID VALID INVALID
With decimal.Decimal object, you can retrieve the exponent, which somehow tells you the number of decimal places (see docs):
import decimal


def _has_two_decimals(x):
    """Return True when *x* has no surrounding spaces and exactly two decimal places."""
    # Decimal() strips surrounding whitespace itself, so spaces have to be
    # rejected explicitly; an empty item is rejected here too instead of
    # failing on x[0].
    if not x.strip() or x != x.strip(' '):
        return False
    try:
        # For a finite decimal, exponent == -2 means two digits after the point.
        return decimal.Decimal(x).as_tuple().exponent == -2
    except decimal.InvalidOperation:
        # Not a number at all.
        return False


# `o` (the output string so far) and `line` (the current input line) come
# from the surrounding loop.
o += " ".join('VALID' if _has_two_decimals(x) else 'INVALID' for x in line.split(','))
Output
#with line = "1,1.02, 123.0005"
'Line 1: INVALID VALID INVALID'
#with line = "1.02, 1.02 , 1.02"
'Line 2: VALID INVALID INVALID'

Stop replacement by pattern?

Say my file look like this:
some lines
tom
some lines
beginword a b
some lines
endword
jim
some lines
beginword x y
some lines
endword
...
Want to be:
some lines
tom
some lines
beginword ZZ b
some lines
endword
jim
some lines
beginword x y
some lines
endword
So this is my python code:
# Copy "file" to "file_updated", rewriting "beginword" lines once a line
# ending in "tom" has been seen.
input = open("file", "r")
output = open("file_updated", "w")
dummy = ""
item = []
for line in input:
    dummy += line
    if ("tom" + "\n") not in line:
        continue
    # This inner loop consumes the rest of the same file iterator, so every
    # remaining line is handled here, with no stop condition.
    for line in input:
        if "beginword" in line:
            item = line.split()
            dummy += item[0] + " w " + item[-1] + "\n"
        else:
            dummy += line
output.write(dummy)
input.close()
output.close()
It replaces all lines that contain "beginword", including the lines that belong to "jim". How can I stop the replacement at the "endword" that belongs to "tom"?
Use break statement
# Copy "file" to "file_updated". After a line ending in "tom", rewrite the
# "beginword" lines until the matching "endword" line, then resume copying.
# The files are managed by `with` so they are closed even on error, and the
# builtin input() is no longer shadowed.
with open("file", "r") as source, open("file_updated", "w") as target:
    for line in source:
        target.write(line)
        if ("tom" + "\n") not in line:
            continue
        # Continue on the same iterator so the outer loop resumes right
        # after the block handled here.
        for line in source:
            # check for endword and exit the inner loop; comparing without
            # the terminator also matches a final line that has no newline
            if line.rstrip("\n") == "endword":
                target.write(line)
                break
            if "beginword" in line:
                item = line.split()
                target.write(item[0] + " w " + item[-1] + "\n")
            else:
                target.write(line)
Also, have a look at these:
reading and writing files
regular expressions

Python - how to print amount of numbers, periods, and commas in file

def showCounts(fileName):
    """Print line, word, number, comma and dot counts for the file *fileName*."""
    lineCount = wordCount = numCount = comCount = dotCount = 0
    with open(fileName, 'r') as f:
        for line in f:
            words = line.split()
            lineCount += 1
            wordCount += len(words)
            for word in words:
                # ###text = word.translate(string.punctuation)
                exclude = set(string.punctuation)
                # NOTE(review): the join below iterates over `text`, which
                # was just set to "", not over `word`, so `text` is always
                # empty and int(text) always raises ValueError.
                text = ""
                text = ''.join(ch for ch in text if ch not in exclude)
                try:
                    int(text)  # any integer, whatever its sign, is a number
                    numCount += 1
                    # elif text == ",":
                    #     comCount += 1
                    # elif text == ".":
                    #     dotCount += 1
                except ValueError:
                    pass
    report = (
        ("Line count: ", lineCount),
        ("Word count: ", wordCount),
        ("Number count: ", numCount),
        ("Comma count: ", comCount),
    )
    for label, value in report:
        print(label + str(value))
    print("Dot count: " + str(dotCount) + "\n")
Basically, it will show the number of lines and the number of words, but I can't get it to show the number of numbers, commas, and dots. I have it read a file that the user enters and then show the number of lines and words, but for some reason it says 0 for numbers, commas, and dots. I commented out the part that gave me trouble. If I remove the comma then I just get an error. Thanks, guys.
This code loops over every character in each line, and adds 1 to its variable:
# Count digits, dots, commas, lines and words in a file, one character at
# a time.
numCount = dotCount = commaCount = lineCount = wordCount = 0
fileName = 'test.txt'
with open(fileName, 'r') as f:
    for line in f:
        wordCount += len(line.split())
        lineCount += 1
        for char in line:
            # Every digit character counts once, so "125" adds 3.
            if char.isdigit():
                numCount += 1
            elif char == '.':
                dotCount += 1
            elif char == ',':
                commaCount += 1
print("Number count: " + str(numCount))
print("Comma count: " + str(commaCount))
print("Dot count: " + str(dotCount))
print("Line count: " + str(lineCount))
print("Word count: " + str(wordCount))
Testing it out:
test.txt:
Hello, my name is B.o.b. I like biking, swimming, and running.
I am 125 years old, and I was 124 years old 1 year ago.
Regards,
B.o.b
Running:
bash-3.2$ python count.py
Number count: 7
Comma count: 5
Dot count: 7
Line count: 6
Word count: 27
bash-3.2$
Everything makes sense here except the lineCount. The reason it is 6 is because of newlines: my editor (nano) adds a newline to the end of any file by default. So just imagine the text file to be this:
>>> x = open('test.txt').read()
>>> x
'Hello, my name is B.o.b. I like biking, swimming, and running.\n\nI am 125 years old, and I was 124 years old 1 year ago.\n\nRegards,\nB.o.b \n'
>>> x.count('\n')
6
>>>
Hope this helps!
For the punctuations, why not just do:
def showCounts(fileName):
...
...
with open(fileName, 'r') as fl:
f = fl.read()
comCount = f.count(',')
dotCount = f.count('.')
You could use the Counter class to take care of it for you:
from collections import Counter
# Read the whole file once; surrounding whitespace is stripped first, so a
# trailing newline does not add an extra line.
with open(fileName, 'r') as f:
    data = f.read().strip()

# Per-character frequencies; a missing character counts as 0.
counts = Counter(data)
lines = len(data.split('\n'))
words = len(data.split())
# Total number of digit characters, not of whole numbers.
numbers = sum(times for char, times in counts.items() if char.isdigit())

print("Line count: {}".format(lines))
print("Word count: {}".format(words))
print("Number count: {}".format(numbers))
print("Comma count: {}".format(counts[',']))
print("Dot count: {}".format(counts['.']))

Categories

Resources