Creating personal wiki reading my own file - python

I'm trying to create a personal wiki that, instead of fetching information from the Wikipedia API with the summary function, searches my own information, stored as key-value pairs in a file that I create. What should I do to make it work?
def _(text):
    """Show a Wikipedia summary for ``text`` in the ``textarea`` widget.

    The widget is cleared first. If the title is ambiguous, one of the
    suggested options is picked at random and its summary is shown instead.

    text : title to look up; it is lower-cased before the lookup
    """
    try:
        textarea.delete('1.0', END)
        summary = wikipedia.summary(text.lower(), sentences=15)
    except wikipedia.DisambiguationError as e:
        # Ambiguous title: fall back to a randomly chosen suggestion.
        option = random.choice(e.options).lower()
        summary = wikipedia.summary(option)
    # One insert call yields the same text as appending it character by
    # character, without a widget call per character.
    textarea.insert(END, summary + '\n\n')
Thank you in advance!

Related

Create a link to a specific word count position such as bookmark in docx

How this project works:
Searches external docx / OCR data for a keyword
Builds a context of 100 words surrounding the keyword
Builds a docx to store the passage with a hyperlink posted under each completed search
What is missing:
A way to link to the passage to its source from the external document in Word, so you can just use a hyperlink to it, but the problem is the OCR docx files read have no headings to bookmark a run, and I could not create them with long OCR, so it is not manageable from the aspect of going in to the docx file one by one reading gibberish at times.
So Word needs to be able to store the solution in the document where the passage is printed in the new file. This hyperlink code works... I need something more than what I have here to find the passage locations on its source, unless MS Word will not support such a specific function as finding the indexed word position of the passage? Can I build a macro and call it in python to make a link and run its position using the index?
Hyperlinking/bookmark code post ref:
def add_hyperlink(paragraph, text, url):
    """Append an external hyperlink displaying ``text`` to ``paragraph``.

    paragraph : paragraph object that receives the link
    text : visible link text
    url : link target, registered as an external relationship

    Returns the newly created ``w:hyperlink`` element.
    """
    make_element = docx.oxml.shared.OxmlElement

    # Register the target on the paragraph's part to obtain a relation id.
    rel_id = paragraph.part.relate_to(
        url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True
    )

    # Build the w:hyperlink element and point it at that relation.
    link = make_element('w:hyperlink')
    link.set(docx.oxml.shared.qn('r:id'), rel_id)

    # Inner run: an empty w:rPr followed by the link text.
    inner_run = make_element('w:r')
    inner_run.append(make_element('w:rPr'))
    inner_run.text = text
    link.append(inner_run)

    # Host the hyperlink inside a fresh run of the paragraph.
    host_run = paragraph.add_run()
    host_run._r.append(link)

    # Workaround for the lack of a hyperlink style (the link does not turn
    # purple after use). Delete this when the template has a hyperlink style.
    host_run.font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
    host_run.font.underline = True
    return link
def extract_surround_words(text, keyword, n):
'''
Collect the words surrounding every occurrence of keyword in text.

text : input text
keyword : the search keyword we are looking for
n : number of words around the keyword

Returns the accumulated passage text. Each passage is also added as a
paragraph to a new docx document that is saved under output_path.
'''
# extracting all the words from text
words = re.findall(r'\w+', text)
passage = []
passageText = ''
saveIndex = []
passagePos = []
indexVal = ''
document = Document()
# NOTE(review): len(text) is the character count although the heading labels
# it WORD COUNT; searchKeyword is not a parameter of this function --
# presumably a module-level name, confirm against the caller.
document.add_heading("The keyword searched is: " + searchKeyword + ", WORD COUNT: " + str(len(text)) + "\n", 0)
# iterate through all the words
for index, word in enumerate(words):
# check if search keyword matches
if word == keyword and len(words) > 0:
saveIndex.append(str(index-n))
# fetch left side words and right
# NOTE(review): when index < n the slice start is negative and counts from
# the end of the list -- confirm this is intended.
passage = words[index - n: index] #start text run
passage.append(keyword)
passage += words[index + 1: index + n + 1] #end of run
# pop() returns the string appended to saveIndex above.
passagePos = "\nWORD COUNT POSITION: " + str(saveIndex.pop() + "\n")
# NOTE(review): add_bookmark is not defined anywhere in this snippet.
bookmark = add_bookmark(index, passagePos)
print(str(passagePos))
# passageText is never reset, so it keeps growing across matches.
for wd in passage:
passageText += ' ' + wd
parag = document.add_paragraph(passageText)
add_hyperlink(parag, passagePos, os.path.join(path, file))
passage.append("\n\n")
document.save(os.path.join(output_path, out_file_doc))
return passageText

missing some text when parsing article content with lxml, what is the issue?

I am using the following code to parse an article from a French news site. When getting all the paragraphs, I keep missing some text. Why is that?
Here is my code. The lines marked with XX are the most relevant; the other parts are just me putting the result into my own structure for later use.
def getWordList(sent, wordList):
    """Register the significant words of ``sent`` in ``wordList`` (in place).

    Each whitespace-separated token is reduced to its letters. A token is
    kept when more than three letters remain and its lowercase form is not
    in ``stopWords``. Kept words become keys of ``wordList``, each mapped to
    a fresh ``{"definition": "", "status": ""}`` record.

    sent : sentence or paragraph text
    wordList : dict that is updated in place
    """
    for token in sent.split():
        # [\W\d_] removes everything but letters while keeping accented
        # ones; the former [^A-Za-z] turned "économie" into "conomie".
        letters = re.sub(r"[\W\d_]+", "", token)
        # Test the cleaned word, so trailing punctuation no longer inflates
        # the length or hides a stop word ("avec," versus "avec").
        if len(letters) > 3 and letters.lower() not in stopWords:
            wordList[letters] = {"definition": "", "status": ""}
def parse(link):
# Download the page at link and collect its paragraphs and sub-titles into
# pText, feeding every collected text through getWordList.
# NOTE(review): the leading "XX" on some lines is the asker's marker for the
# relevant statements, not Python; the names used afterwards are word, pTag
# and text.
page = requests.get(link)
tree = html.fromstring(page.content)
# NOTE(review): "[#class=...]" is not an XPath attribute test; presumably
# "[@class=...]" was intended -- confirm.
XXword = tree.xpath('//*[#class="article__content old__article-content-single"]')
articleContent = {}
articleContent["words"] = {}
articleContent["language"] = "French";
wordList = articleContent["words"]
contentList = []
# NOTE(review): an XPath starting with "//" is evaluated from the document
# root, so this looks like it selects every element of the page rather than
# only the descendants of word[0] -- confirm.
XXpTag = word[0].xpath('//*')
pText = {}
for x in range(len(pTag)):
#print(pTag[x].get("class"))
# .text is only the text before the element's first child; text inside
# nested tags is not part of it.
if(pTag[x].text != None):
if(pTag[x].tail != None):
print("tail")
XXtext = pTag[x].text + pTag[x].tail
else:
print("no tail")
XXtext = pTag[x].text
# The class value is compared including its trailing space.
XXif(pTag[x].get("class") == "article__paragraph "):
print(pTag[x].get("class"))
print(text)
getWordList(text,wordList)
pText[text] = {}
pText[text]["status"] = ""
pText[text]["type"] = "p"
XXelif(pTag[x].get("class") == "article__sub-title"):
print(pTag[x].get("class"))
getWordList(text,wordList)
pText[text] = {}
pText[text]["status"] = ""
pText[text]["type"] = "h2"
here is an example article link: https://www.lemonde.fr/economie/article/2019/05/23/vivendi-chercherait-a-ceder-universal-music-group-au-chinois-tencent_5466130_3234.html
I am successfully getting all the highlighted text, but the rest is missing. I am deliberately skipping the text in the middle; what I want is the text in between, which is not being included.
Thank you for your help!!
You're trying to get the content of tags containing other tags. For example, there are <em> emphasized text tags in the <p> paragraph tags.
Use the text_content() method instead of text to get the full content of your paragraphs:
text = pTag[x].text_content() + pTag[x].tail
and
text = pTag[x].text_content()

How to find and match each elements of a list on each sentences?

I have a file containing some sentences. I used polyglot for Named Entity Recognition and stored all detected entities in a list. Now I want to check, for each sentence, whether any entity (or pair of entities) occurs in it, and print those entities.
Here what I did:
from polyglot.text import Text
# Read the whole input file and wrap it in a polyglot Text object.
# NOTE(review): the file handle is never closed.
file = open('input_raw.txt', 'r')
input_file = file.read()
test = Text(input_file, hint_language_code='fa')
list_entity = []
for sent in test.sentences:
#print(sent[:10], "\n")
# NOTE(review): this reads test.entities (the whole text), not sent.entities;
# the loop variable sent is not used by any active statement.
for entity in test.entities:
list_entity.append(entity)
for i in range(len(test)):
m = test.entities[i]
n = test.words[m.start: m.end] # it shows only word not tag
if str(n).split('.')[-1] in test: # if each entities exist in each sentence
print(n)
It gives me an empty list.
Input:
sentence1: Bill Gate is the founder of Microsoft.
sentence2: Trump is the president of USA.
Expected output:
Bill Gate, Microsoft
Trump, USA
Output of list_entity:
I-PER(['Trump']), I-LOC(['USA'])
How to check if I-PER(['Trump']), I-LOC(['USA']) is in first sentence?
For starters you were adding the whole text file input to the entities list.
entities can only be called by each sentence in the polyglot object.
from polyglot.text import Text
# Read the input and close the handle as soon as the text is in memory.
with open('input_raw.txt', 'r') as handle:
    input_file = handle.read()
file = Text(input_file, hint_language_code='fa')

# Entities are collected sentence by sentence.
list_entity = [
    entity
    for sentence in file.sentences
    for entity in sentence.entities
]
print(list_entity)
Now you don't have an empty list.
As for your problem with identifying the identity terms,
I have not found a way to generate an entity by hand, so the following simply checks if there are entities with the same term. A Chunk can have multiple strings inside, so we can go through them iteratively.
from polyglot.text import Text
file = open('input_raw.txt', 'r')
input_file = file.read()
file = Text(input_file, hint_language_code='ar')
def check_sentence(entities_list, sentence):
    """Return True when every term occurs in some entity of ``sentence``.

    entities_list : iterable of string terms to look for
    sentence : object whose ``entities`` attribute is an iterable of
        entities, each itself an iterable of strings

    An empty ``entities_list`` yields True.
    """
    # Compare each term with every string of every entity in the sentence;
    # the first term without a match makes the whole check fail.
    return all(
        any(
            entity_term == term
            for entity in sentence.entities
            for entity_term in entity
        )
        for term in entities_list
    )
sentence_number = 1  # Which sentence to check
sentence = file.sentences[sentence_number]
entity_terms = ["Bill", "Gates"]

# Report whether all the terms were found among the sentence's entities.
if check_sentence(entity_terms, sentence):
    message = ("Entity Terms " + str(entity_terms) +
               " are in the sentence. '" + str(sentence) + "'")
else:
    message = ("Sentence '" + str(sentence) +
               "' doesn't contain terms" + str(entity_terms))
print(message)
Once you find a way to generate arbitrary entities all you'll have to do is stop popping the term from the sentence checker so you can do type comparison as well.
If you just want to match the list of entities in the file against a specific sentence, then this should do the trick:
from polyglot.text import Text
file = open('input_raw.txt', 'r')
input_file = file.read()
file = Text(input_file, hint_language_code='fa')
def return_match(entities_list, sentence):
    """Return the words of every sentence entity that equals a given entity.

    entities_list : iterable of entities to look for
    sentence : object whose ``entities`` attribute is an iterable of
        entities, each an iterable of words

    The result is a flat list of words, ordered by ``entities_list`` first
    and by position in the sentence second. It is empty when nothing matches.
    """
    matches = []
    for term in entities_list:
        for entity in sentence.entities:
            # Whole entities are compared; the words of a match are flattened
            # into the result.
            if entity == term:
                matches.extend(entity)
    return matches
def return_list_of_entities(file):
    """Return all entities of ``file`` as one flat list, in sentence order.

    file : object whose ``sentences`` attribute is an iterable of sentences,
        each exposing an iterable ``entities`` attribute
    """
    return [
        entity
        for sentence in file.sentences
        for entity in sentence.entities
    ]
list_entity = return_list_of_entities(file)
sentence_number = 1  # Which sentence to check
sentence = file.sentences[sentence_number]

# Match every entity found in the file against the chosen sentence.
match = return_match(list_entity, sentence)
if match:
    message = ("Entity Term " + str(match) +
               " is in the sentence. '" + str(sentence) + "'")
else:
    message = ("Sentence '" + str(sentence) +
               "' doesn't contain any of the terms" + str(list_entity))
print(message)

Unique string before writing to file - python

Why doesn't this work? I want to de-duplicate the results I get from the REST API before I write them to the file:
MISP_HOST="https://192.168.1.8"
API_KEY="asdfasdfas"
EXPORT_DATA="attributes/text/download/md5"
OUTPUT_FILE="md5-"+today
def main():
URL="%s/%s" % (MISP_HOST, EXPORT_DATA)
request = urllib2.Request(URL)
f = open(OUTPUT_FILE,'w')
request.add_header('Authorization', API_KEY)
data = urllib2.urlopen(request).read()
set(data)
print type(data)
f.write(data)
f.close()
It runs with no errors, but the data is definitely not unique. I'm trying not to do this in bash. Could you also explain why it doesn't work? Many thanks!!!
If your result is plain text, you can use a regular expression to find all of the words in the text and then build a set from there. This example also lower cases the words so that the set is case insensitive and writes each word on its own line.
import re
MISP_HOST="https://192.168.1.8"
API_KEY="asdfasdfas"
EXPORT_DATA="attributes/text/download/md5"
OUTPUT_FILE="md5-"+today
def main():
URL="%s/%s" % (MISP_HOST, EXPORT_DATA)
request = urllib2.Request(URL)
f = open(OUTPUT_FILE,'w')
request.add_header('Authorization', API_KEY)
data = urllib2.urlopen(request).read()
unique = set(word.lower() for word in re.findall(r'\w+', data))
# that could be expanded to
# wordlist = re.findall(r'\w+', data)
# unique = set(word.lower() for word in wordlist)
print type(unique)
f.write('\n'.join(unique))
f.close()

Indexing and search in a text file

I have a text file that contains the contents of a book. I want to take this file and build an index that lets the user search through the file.
The search would consist of entering a word. Then, the program would return the following:
Every chapter which includes that word.
The line number of the line
which contains the word.
The entire line the word is on.
I tried the following code:
# NOTE(review): this fragment is not runnable as shown: wordList and count are
# never defined, and the return at the end has no enclosing def in the snippet.
infile = open(file)
Dict = {}
word = input("Enter a word to search: ")
linenum = 0
line = infile.readline()
# NOTE(review): the for statement below is missing its trailing colon.
for line in infile
linenum += 1
# The loop variable reuses the name word, overwriting the search term read above.
for word in wordList:
if word in line:
Dict[word] = Dict.setdefault(word, []) + [linenum]
print(count, word)
line = infile.readline()
return Dict
Something like this does not work and seems too awkward for handling the other modules which would require:
An "or" operator to search for one word or another
An "and" operator to search for one word and another in the same chapter
Any suggestions would be great.
def classify_lines_on_chapter(book_contents):
    """Return, for every line, the chapter heading it belongs to.

    A line whose cased characters are all upper case is treated as a chapter
    heading. The result has one entry per input line: the stripped text of
    the most recent heading, or None for lines before the first heading.

    book_contents : list of the book's lines
    """
    lines_vs_chapter = []
    # Start without a chapter so text before the first heading no longer
    # raises UnboundLocalError.
    current_chapter = None
    for line in book_contents:
        if line.isupper():
            current_chapter = line.strip()
        lines_vs_chapter.append(current_chapter)
    return lines_vs_chapter
def classify_words_on_lines(book_contents):
    """Map every word to the indices of the lines that contain it.

    Words are the whitespace-separated tokens of a line with leading and
    trailing punctuation stripped; the lookup is case sensitive. A word that
    occurs several times on one line is recorded once for that line.

    book_contents : list of the book's lines
    Returns a dict of word -> list of 0-based line indices in ascending order.
    """
    # Local import: the snippet uses string.punctuation but never imports it.
    import string

    words_vs_lines = {}
    for line_index, line in enumerate(book_contents):
        tokens = {token.strip(string.punctuation) for token in line.split()}
        for word in tokens:
            # Tokens made only of punctuation strip down to "" and are skipped.
            if word:
                words_vs_lines.setdefault(word, []).append(line_index)
    return words_vs_lines
def main():
    """Index book.txt and answer word searches until an empty line is entered."""
    skip_lines = 93
    with open('book.txt') as book:
        book_contents = book.readlines()[skip_lines:]

    chapter_of_line = classify_lines_on_chapter(book_contents)
    lines_of_word = classify_words_on_lines(book_contents)

    # iter() with a sentinel keeps prompting until a blank input is entered.
    for word in iter(lambda: input("Enter word to search - "), ""):
        hits = lines_of_word.get(word)
        if not hits:
            print("Word not found!!\n")
            continue
        for idx in hits:
            # Report the line number relative to the whole file, including
            # the skipped lines.
            print("Line " + str(idx + 1 + skip_lines))
            print("Chapter '" + str(chapter_of_line[idx]) + "'")
            print(book_contents[idx])


if __name__ == '__main__':
    main()
Try it on this input file. Rename it as book.txt before running it.

Categories

Resources