Find a specific word in a docx file

Find a specific word in a docx file - python

I am working on a group assignment to read a docx file and then output the word 'carrier' or 'carriers' with the word directly to the right of it. The output we are receiving is only 26 of the total 82 mentions of the word carrier in the document. I would prefer recommendations to what might be causing this. My hunch is that it has something to do with the For loop.
from docx import Document
emptyString = {}
tupl = ()
doc = Document('Interstate Commerce Act.docx')
for i ,paragraph in enumerate(doc.paragraphs):
text = paragraph.text
text = text.split()
#text = text.lower()
if 'carrier' in text:
next = text.index('carrier') + 1
now = text.index('carrier')
#print(text[now], text[next])
tupl = (text[now], text[next])
emptyString[i] = tupl
if 'carriers' in text:
next = text.index('carriers') + 1
now = text.index('carriers')
#print(text[now], text[next])
tupl = (text[now], text[next])
emptyString[i] = tupl
if 'Carriers' in text:
next = text.index('Carriers') + 1
now = text.index('Carriers')
#print(text[now], text[next])
tupl = (text[now], text[next])
emptyString[i] = tupl
if 'Carrier' in text:
next = text.index('Carrier') + 1
now = text.index('Carrier')
#print(text[now], text[next])
tupl = (text[now], text[next])
emptyString[i] = tupl
print(emptyString)

Your text = text.split() line is going to cause certain items to be "hidden". For example, "The carrier is a Carrier." will produce the word list:
["The", "carrier", "is", "a", "Carrier."]
Since the last item is "Carrier." and not "Carrier" it will not be found by your "exact match" test.
Perhaps better to split by word and then check whether a lowercase version includes "carrier":
words = text.split()
for i, word in enumerate(words):
if "carrier" in word.lower():
print("word %d is a match" % i)
Using the lowercase comparison avoids the need for separate tests for all the case varieties.

Related

Find and replace in string re.insensitive

I have this code to find matches in a string, im using in a search to mark the wordds that match my search, i need this to be case insensitive, the issue here is that it replaces the word by the one we search.
val_to_searchp = "this Text string has alot of teXt"
word = "TEXT"
pal2rep = str(":::")+word+str(":::")
val_to_search = re.sub(re.escape(word), pal2rep, val_to_searchp, flags=re.IGNORECASE)
this will return
"this :::TEXT::: string has alot of :::TEXT:::"
I need it to return
"this :::Text::: string has alot of :::teXt:::"
Also tryed with this but its not working very well :(
f = 0
s = 0
val_to_search = val_to_searchp
for m in re.finditer(str(word), str(val_to_searchp)):
inicio = int(m.start()+s)
fim = int(m.end()+f)
val_to_search = val_to_search[:inicio] \
+ str(":::") \
+ val_to_search[inicio:fim] \
+ str(":::") \
+ val_to_search[fim:].strip()
f = f+2
s = s+1
This is my actuall code
def findtext():
if len(str(findtext_inp.get('1.0', END)))>1:
val_to_searchp = str(respon_txt.get(1.0, END).replace(html.unescape('⛔'), "").strip())
respon_txt.delete(1.0, END)
word = str(findtext_inp.get('1.0', END).strip())
pal2rep = str(str(html.unescape('⛔'))+word+str(html.unescape('⛔')))
val_to_search = re.sub(re.escape(word), pal2rep, val_to_searchp, flags=re.IGNORECASE)
"""
f = 0
s = 0
for m in re.finditer(str(word), str(val_to_search)):
inicio = int(m.start()+s)
fim = int(m.end()+f)
val_to_search = val_to_search[:inicio] \
+ str(html.unescape('⛔')) \
+ val_to_search[inicio:fim] \
+ str(html.unescape('⛔')) \
+ val_to_search[fim:].strip()
f = f+2
s = s+1
"""
respon_txt.insert(1.0, val_to_search)#val_to_search.replace(findtext_inp.get('1.0', END).strip() , str(html.unescape('⛔')+findtext_inp.get('1.0', END).strip())+html.unescape('⛔')))

I'm sure there's a way to do this with RE but it's really trivial without the aid of that module.
val_to_searchp = "this Text string has alot of teXt\nThis also has a lot of text"
text = 'TEXT'
def func(s, txt):
txt = txt.lower()
result = []
for line in s.split('\n'):
for i, e in enumerate(t := line.split()):
if e.lower() == txt:
t[i] = f':::{e}:::'
result.append(' '.join(t))
return '\n'.join(result)
print(func(val_to_searchp, text))
Output:
this :::Text::: string has alot of :::teXt:::
This also has a lot of :::text:::

This is a rewrite of my original answer. In the comments for that answer you will see that the OP has changed his mind about how this needs to work. This now (hopefully) complies with the altered specification:
val_to_searchp = '''{\"configurationKey\":[{\"key\":\"GetMaxKeys\",\"readonly\":true,\"value\":\"20\"}'''
text = 'GetMaxKeys'
def func(s, txt):
result = []
sl = s.lower()
txt = txt.lower()
lt = len(txt)
offset = 0
while (i := sl[offset:].find(txt)) >= 0:
result.append(s[offset:i+offset])
offset += i
result.append(f':::{s[offset:offset+lt]}:::')
offset += lt
result.append(s[offset:])
return ''.join(result)
print(func(val_to_searchp, text))
Output:
{"configurationKey":[{"key":":::GetMaxKeys:::","readonly":true,"value":"20"}

A program that counts specific stings in a text, such like "climate finance"

from collections import Counter
input = 'file.txt'
CounterWords = {}
words = {}
with open(input,'r', encoding='utf-8-sig') as fh:
for line in fh:
word_list = line.replace(',','').replace('\'','').replace('.','').lower().split()
for word in word_list:
if len(word) < 6
continue
elif word not in CounterWords:
CounterWords[word] = 1
else:
CounterWords[word] = CounterWords[word] + 1
N = 50
top_words = Counter(CounterWords).most_common(N)
for word, frequency in top_words:
print("%s %d" % (word, frequency))
At the moment i am able two select the most frequent words with strings more than X characters.
The program should screen the text and count words like:
"climate finance"
"market failure"
"Paris 2015"
Amount of minimum characters per single string should be still included to prevent results such as "I and".

You can simply use your_file_content.count(your_string) :
from collections import Counter
input = 'D:\\file.txt'
import itertools
def pairwise(iterable):
# "s -> (s0,s1), (s1,s2), (s2, s3), ..."
a, b = itertools.tee(iterable)
next(b, None)
return zip(a, b)
CounterWords = {}
CounterPairs = {}
words = {}
file_content = ''
with open(input,'r', encoding='utf-8-sig', errors='ignore') as fh:
file_content = fh.read().replace('\n', ' ')
word_list = file_content.replace(',','').replace('\'','').replace('.','').lower().split()
word_list = list(dict.fromkeys(word_list)) # to remove duplicates
word_pairs_list = pairwise(word_list)
for word in word_list:
if len(word) < 6:
continue
else:
CounterWords[word] = file_content.count(word)
for pair in word_pairs_list:
CounterPairs[pair] = file_content.count(' '.join(pair))
N = 50
# for all single words :
top_words = Counter(CounterWords).most_common(N)
for word, frequency in top_words:
print("%s %d" % (word, frequency))
# for all pairs :
top_pairs = Counter(CounterPairs).most_common(N)
for pair, frequency in top_pairs:
print("%s %d" % (pair, frequency))
# for specific pairs :
print("\n%s %d" % ('climate finance', CounterPairs[('climate', 'finance')]))
pairwise function taken from : Iterate a list as pair (current, next) in Python

Check if string is exactly the same as line in file

I've been writing a Countdown program in Python, and in it. I've written this:
#Letters Game
global vowels, consonants
from random import choice, uniform
from time import sleep
from itertools import permutations
startLetter = ""
words = []
def check(word, startLetter):
fileName = startLetter + ".txt"
datafile = open(fileName)
for line in datafile:
print("Checking if", word, "is", line.lower())
if word == line.lower():
return True
return False
def generateLetters():
lettersLeft = 9
output = []
while lettersLeft >= 1:
lType = input("Vowel or consonant? (v/c)")
sleep(uniform(0.5, 1.5))
if lType not in ("v", "c"):
print("Please input v or c")
continue
elif lType == "v":
letter = choice(vowels)
print("Rachel has picked an", letter)
vowels.remove(letter)
output.append(letter)
elif lType == "c":
letter = choice(consonants)
print("Rachel has picked a", letter)
consonants.remove(letter)
output.append(letter)
print("Letters so far:", output)
lettersLeft -= 1
return output
def possibleWords(letters, words):
for i in range(1,9):
print(letters)
print(i)
for item in permutations(letters, i):
item = "".join(list(item))
startLetter = list(item)[0]
if check(item, startLetter):
print("\n\n***Got one***\n", item)
words.append(item)
return words
vowels = ["a"]*15 + ["e"]*21 + ["i"]*13 + ["o"]*13+ ["u"]*5
consonants = ["b"]*2 + ["c"]*3 + ["d"]*6 + ["f"]*2 + ["g"]*3 +["h"]*2 +["j"]*1 +["k"]*1 +["l"]*5 +["m"]*4 +["n"]*8 +["p"]*4 +["q"]*1 +["r"]*9 +["s"]*9 +["t"]*9 + ["v"]*1 +["w"]*1 +["x"]*1 +["y"]*1 +["z"]*1
print("***Let's play a letters game!***")
sleep(3)
letters = generateLetters()
sleep(uniform(1, 1.5))
print("\n\n***Let's play countdown***\n\n\n\n\n")
print(letters)
for count in reversed(range(1, 31)):
print(count)
sleep(1)
print("\n\nStop!")
print("All possible words:")
print(possibleWords(letters, words))
'''
#Code for sorting the dictionary into files
alphabet = "abcdefghijklmnopqrstuvwxyz"
alphabet = list(alphabet)
for letter in alphabet:
allFile = open("Dictionary.txt", "r+")
filename = letter + ".txt"
letterFile = open(filename, "w")
for line in allFile:
if len(list(line.lower())) <= 9:
if list(line.lower())[0] == letter:
print("Writing:", line.lower())
letterFile.write(line.lower())
allFile.close()
letterFile.close()
I have 26 text files called a.txt, b.txt, c.txt... to make the search quicker
(Sorry it's not very neat - I haven't finished it yet)
However, instead of returning what I expect (pan), it returns all words with pan in it (pan, pancake, pans, pandemic...)
Is there any way in Python you can only return the line if it's EXACTLY the same as the string? Do I have to .read() the file first?
Thanks

Your post is strangely written so excuse me if I missmatch
Is there any way in Python you can only return the line if it's EXACTLY the same as the string? Do I have to .read() the file first?
Yes, there is!!!
file = open("file.txt")
content = file.read() # which is a str
lines = content.split('\n') # which is a list (containing every lines)
test_string = " pan "
positive_match = [l for l in lines if test_string in l]
This is a bit hacky since we avoid getting pancake for pan (for instance) but using spaces (and then, what about cases like ".....,pan "?). You should have a look at tokenization function. As pythonists, we hve one of the best library for this: nltk
(because, basically, you are reinventing the wheel)

Python : How to translate?

the program is when user input"8#15#23###23#1#19###9#20"
output should be "HOW WAS IT"
However,it could not work to show space(###).
enter code here
ABSTRACT ={"A":"1","B":"2","C":"3","D":"4","E":"5","F":"6","G":"7","H":"8","I":"9", "J":"10","K":"11","L":"12","M":"13","N":"14","O":"15","P":"16","Q":"17","R":"18","S":"19","T":"20","U":"21","V":"22","W":"23", "X":"24","Y":"25","Z":"26",
" ":"###","":"#" }
ABSTRACT_SHIFTED = {value:key for key,value in ABSTRACT.items()}
def from_abstract(s):
result = ''
for word in s.split('*'):
result = result +ABSTRACT_SHIFTED.get(word)
return result

This would do the trick:
#!/usr/bin/env python
InputString = "8#15#23###23#1#19###9#20"
InputString = InputString.replace("###", "##")
InputString = InputString.split("#")
DecodedMessage = ""
for NumericRepresentation in InputString:
if NumericRepresentation == "":
NumericRepresentation = " "
DecodedMessage += NumericRepresentation
continue
else:
DecodedMessage += chr(int(NumericRepresentation) + 64)
print(DecodedMessage)
Prints:
HOW WAS IT

you can also use a regex
import re
replacer ={"A":"1","B":"2","C":"3","D":"4","E":"5","F":"6","G":"7","H":"8","I":"9", "J":"10","K":"11","L":"12","M":"13","N":"14","O":"15","P":"16","Q":"17","R":"18","S":"19","T":"20","U":"21","V":"22","W":"23", "X":"24","Y":"25","Z":"26",
" ":"###","":"#" }
reversed = {value:key for key,value in replacer.items()}
# Reversed because regex is greedy and it will match 1 before 15
target = '8#15#23###23#1#19###9#20'
pattern = '|'.join(map(lambda x: x + '+', list(reversed.keys())[::-1]))
repl = lambda x: reversed[x.group(0)]
print(re.sub(pattern, string=target, repl=repl))
And prints:
HOW WAS IT

With a couple minimal changes to your code it works.
1) split on '#', not '*'
2) retrieve ' ' by default if a match isn't found
3) use '##' instead of '###'
def from_abstract(s):
result = ''
for word in s.replace('###','##').split('#'):
result = result +ABSTRACT_SHIFTED.get(word," ")
return result

Swap the key-value pairs of ABSTRACT and use simple split + join on input
ip = "8#15#23###23#1#19###9#20"
ABSTRACT = dict((v,k) for k,v in ABSTRACT.items())
''.join(ABSTRACT.get(i,' ') for i in ip.split('#')).replace(' ', ' ')
#'HOW WAS IT'

The biggest challenge here is that "#" is used as a token separator and as the space character, you have to know the context to tell which you've got at any given time, and that makes it difficult to simply split the string. So write a simple parser. This one will accept anything as the first character in a token and then grab everything until it sees the next "#".
ABSTRACT ={"A":"1","B":"2","C":"3","D":"4","E":"5","F":"6","G":"7","H":"8","I":"9", "J":"10","K":"11","L":"12","M":"13","N":"14","O":"15","P":"16","Q":"17","R":"18","S":"19","T":"20","U":"21","V":"22","W":"23", "X":"24","Y":"25","Z":"26",
" ":"###","":"#" }
ABSTRACT_SHIFTED = {value:key for key,value in ABSTRACT.items()}
user_input = "8#15#23###23#1#19###9#20"
def from_abstract(s):
result = []
while s:
print 'try', s
# tokens are terminated with #
idx = s.find("#")
# ...except at end of line
if idx == -1:
idx = len(s) - 1
token = s[:idx]
s = s[idx+1:]
result.append(ABSTRACT_SHIFTED.get(token, ' '))
return ''.join(result)
print from_abstract(user_input)

Word count with pattern in Python

So this is the question:
Write a program to read in multiple lines of text and count the number
of words in which the rule i before e, except after c is broken, and
number of words which contain either ei or ie and which don't break
the rule.
For this question, we only care about the c if it is the character
immediately before the ie or the ei. So science counts as breaking the
rule, but mischievous doesn't. If a word breaks the rule twice (like
obeisancies), then it should still only be counted once.
Example given:
Line: The science heist succeeded
Line: challenge accepted
Line:
Number of times the rule helped: 0
Number of times the rule was broken: 2
and my code:
rule = []
broken = []
line = None
while line != '':
line = input('Line: ')
line.replace('cie', 'broken')
line.replace('cei', 'rule')
line.replace('ie', 'rule')
line.replace('ei', 'broken')
a = line.count('rule')
b = line.count('broken')
rule.append(a)
broken.append(b)
print(sum(a)); print(sum(b))
How do I fix my code, to work like the question wants it to?

I'm not going to write the code to your exact specification as it sounds like homework but this should help:
import pprint
words = ['science', 'believe', 'die', 'friend', 'ceiling',
'receipt', 'seize', 'weird', 'vein', 'foreign']
rule = {}
rule['ie'] = []
rule['ei'] = []
rule['cei'] = []
rule['cie'] = []
for word in words:
if 'ie' in word:
if 'cie' in word:
rule['cie'].append(word)
else:
rule['ie'].append(word)
if 'ei' in word:
if 'cei' in word:
rule['cei'].append(word)
else:
rule['ei'].append(word)
pprint.pprint(rule)
Save it to a file like i_before_e.py and run python i_before_e.py:
{'cei': ['ceiling', 'receipt'],
'cie': ['science'],
'ei': ['seize', 'weird', 'vein', 'foreign'],
'ie': ['believe', 'die', 'friend']}
You can easily count the occurrences with:
for key in rule.keys():
print "%s occured %d times." % (key, len(rule[key]))
Output:
ei occured 4 times.
ie occured 3 times.
cie occured 1 times.
cei occured 2 times.

Firstly, replace does not chance stuff in place. What you need is the return value:
line = 'hello there' # line = 'hello there'
line.replace('there','bob') # line = 'hello there'
line = line.replace('there','bob') # line = 'hello bob'
Also I would assume you want actual totals so:
print('Number of times the rule helped: {0}'.format(sum(rule)))
print('Number of times the rule was broken: {0}'.format(sum(broken)))
You are printing a and b. These are the numbers of times the rule worked and was broken in the last line processed. You want totals.
As a sidenote: Regular expressions are good for things like this. re.findall would make this a lot more sturdy and pretty:
line = 'foo moo goo loo foobar cheese is great '
foo_matches = len(re.findall('foo', line)) # = 2

Let's split the logic up into functions, that should help us reason about the code and get it right. To loop over the line, we can use the iter function:
def rule_applies(word):
return 'ei' in word or 'ie' in word
def complies_with_rule(word):
if 'cie' in word:
return False
if word.count('ei') > word.count('cei'):
return False
return True
helped_count = 0
broken_count = 0
lines = iter(lambda: input("Line: "), '')
for line in lines:
for word in line.split():
if rule_applies(word):
if complies_with_rule(word):
helped_count += 1
else:
broken_count += 1
print("Number of times the rule helped:", helped_count)
print("Number of times the rule was broken:", broken_count)
We can make the code more concise by shortening the complies_with_rule function and by using generator expressions and Counter:
from collections import Counter
def rule_applies(word):
return 'ei' in word or 'ie' in word
def complies_with_rule(word):
return 'cie' not in word and word.count('ei') == word.count('cei')
lines = iter(lambda: input("Line: "), '')
words = (word for line in lines for word in line.split())
words_considered = (word for word in words if rule_applies(word))
did_rule_help_count = Counter(complies_with_rule(word) for word in words_considered)
print("Number of times the rule helped:", did_rule_help_count[True])
print("Number of times the rule was broken:", did_rule_help_count[False])

If I understand correctly, your main problematic is to get unique result per word. Is that what you try to achieve:
rule_count = 0
break_count = 0
line = None
while line != '':
line = input('Line: ')
rule_found = False
break_found = False
for word in line.split():
if 'cie' in line:
line = line.replace('cie', '')
break_found = True
if 'cei' in line:
line = line.replace('cei', '')
rule_found = True
if 'ie' in line:
rule_found = True
if 'ei' in line:
break_found = True
if rule_found:
rule_count += 1
if break_found:
break_count += 1
print(rule_found); print(break_count)

rule = []
broken = []
tb = 0
tr = 0
line = ' '
while line:
lines = input('Line: ')
line = lines.split()
for word in line:
if 'ie' in word:
if 'cie' in word:
tb += 1
elif word.count('cie') > 1:
tb += 1
elif word.count('ie') > 1:
tr += 1
elif 'ie' in word:
tr += 1
if 'ei' in word:
if 'cei' in word:
tr += 1
elif word.count('cei') > 1:
tr += 1
elif word.count('ei') > 1:
tb += 1
elif 'ei' in word:
tb += 1
print('Number of times the rule helped: {0}'.format(tr))
print('Number of times the rule was broken: {0}'.format(tb))
Done.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Find a specific word in a docx file - python

Related

Find and replace in string re.insensitive

A program that counts specific stings in a text, such like "climate finance"

Check if string is exactly the same as line in file

Python : How to translate?

Word count with pattern in Python

Categories

Resources