import re
#list of names to identify in input strings
result_list = ['Thomas Edd', 'Melissa Clark', 'Ada White', 'Louis Pasteur', 'Edd Thomas', 'Clark Melissa', 'White Eda', 'Pasteur Louis', 'Thomas', 'Melissa', 'Ada', 'Louis', 'Edd', 'Clark', 'White', 'Pasteur']
result_list.sort() # sorts normally by alphabetical order (optional)
result_list.sort(key=len, reverse=True) # sorts by descending length
#example 1
input_text = "Melissa went for a walk in the park, then Melisa Clark went to the cosmetics store. There Thomas showed her a wide variety of cosmetic products. Edd Thomas is a great salesman, even so Thomas Edd is a skilled but responsible salesman, as Edd is always honest with his customers. White is a new client who came to Edd's business due to the good social media reviews she saw from Melissa, her co-worker."
# In example 2 the text is almost the same; however, some of the names are already
# encapsulated in the ((PERS)name) structure and should not be encapsulated again.
input_text = "((PERS)Melissa) went for a walk in the park, then Melisa Clark went to the cosmetics store. There Thomas showed her a wide variety of cosmetic products. Edd Thomas is a great salesman, even so ((PERS)Thomas Edd) is a skilled but responsible salesman, as Edd is always honest with his customers. White is a new client who came to Edd's business due to the good social media reviews she saw from Melissa, her co-worker." #example 2
for i in result_list:
    input_text = re.sub(r"\(\(PERS\)" + r"(" + str(i) + r")" + r"\)",
                        lambda m: (f"((PERS){m[1]})"),
                        input_text)
print(repr(input_text)) # --> output
Note that the names must meet certain conditions to be identified: they must sit between two whitespace characters (\s*the searched name\s*), or be at the beginning ((?:(?<=\s)|^)) and/or at the end of the input string.
It may also be the case that a name is followed by a comma, for example "Ada White, Melissa and Louis went shopping", or that the spaces are accidentally omitted, as in "Ada White,Melissa and Louis went shopping".
For this reason it is important that a name can still be matched immediately after [.,;].
Cases where the names should NOT be encapsulated would be, for example:
"the Edd's business"
"The whitespace"
"the pasteurization process takes time"
"Those White-spaces in that text are unnecessary"
since in these cases the name is preceded or followed by another word that should not be part of the name being searched for.
For examples 1 and 2 (note that example 2 is the same as example 1 but already has some encapsulated names and you have to prevent them from being encapsulated again), you should get the following output.
"((PERS)Melissa) went for a walk in the park, then ((PERS)Melisa Clark) went to the cosmetics store. There ((PERS)Thomas) showed her a wide variety of cosmetic products. ((PERS)Edd Thomas) is a great salesman, even so ((PERS)Thomas Edd) is a skilled but responsible salesman, as ((PERS)Edd) is always honest with his customers. ((PERS)White) is a new client who came to Edd's business due to the good social media reviews she saw from ((PERS)Melissa), her co-worker."
You can use lookarounds to exclude already encapsulated names and those followed by ', an alphanumeric character or -:
import re
result_list = ['Thomas Edd', 'Melissa Clark', 'Ada White', 'Louis Pasteur', 'Edd Thomas', 'Clark Melissa', 'White Eda', 'Pasteur Louis', 'Thomas', 'Melissa', 'Ada', 'Louis', 'Edd', 'Clark', 'White', 'Pasteur']
result_list.sort(key=len, reverse=True) # sorts by descending length so longer names match before their substrings
input_text = "((PERS)Melissa) went for a walk in the park, then Melissa Clark went to the cosmetics store. There Thomas showed her a wide variety of cosmetic products. Edd Thomas is a great salesman, even so ((PERS)Thomas Edd) is a skilled but responsible salesman, as Edd is always honest with his customers. White is a new client who came to Edd's business due to the good social media reviews she saw from Melissa, her co-worker." #example 2
# (?<!\(PERS\)) skips names that are already wrapped; (?!['\w)-]) skips names
# followed by an apostrophe, a word character, a closing parenthesis or a hyphen
pat = re.compile(rf"(?<!\(PERS\))({'|'.join(result_list)})(?!['\w)-])")
input_text = re.sub(pat, r'((PERS)\1)', input_text)
Output:
((PERS)Melissa) went for a walk in the park, then ((PERS)Melissa Clark) went to the cosmetics store. There ((PERS)Thomas) showed her a wide variety of cosmetic products. ((PERS)Edd Thomas) is a great salesman, even so ((PERS)Thomas Edd) is a skilled but responsible salesman, as ((PERS)Edd) is always honest with his customers. ((PERS)White) is a new client who came to Edd's business due to the good social media reviews she saw from ((PERS)Melissa), her co-worker.
Of course you can refine the content of your lookahead based on further edge cases.
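For example, you could also guard the left side so a name is never matched inside a longer word; the (?<![\w'-]) lookbehind below is an illustrative assumption about which leading characters should block a match (it reuses result_list from the snippet above):
pat = re.compile(rf"(?<!\(PERS\))(?<![\w'-])({'|'.join(result_list)})(?!['\w)-])")
print(re.sub(pat, r'((PERS)\1)', "McWhite met White."))
# -> McWhite met ((PERS)White).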
from datasets import load_dataset #Huggingface
from transformers import BertTokenizer #Huggingface:
def tokenized_dataset(dataset):
    """Tokenize each document in the train, test and validation splits.

    Args:
        dataset (DatasetDict): dataset that will be tokenized (train, test, validation)
    Returns:
        dict: the dataset once tokenized
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    encode = lambda document: tokenizer(document, return_tensors='pt', padding=True, truncation=True)
    train_articles = [encode(document) for document in dataset["train"]["article"]]
    test_articles = [encode(document) for document in dataset["test"]["article"]]
    val_articles = [encode(document) for document in dataset["val"]["article"]]
    train_abstracts = [encode(document) for document in dataset["train"]["abstract"]]
    test_abstracts = [encode(document) for document in dataset["test"]["abstract"]]
    val_abstracts = [encode(document) for document in dataset["val"]["abstract"]]
    return {"train": (train_articles, train_abstracts),
            "test": (test_articles, test_abstracts),
            "val": (val_articles, val_abstracts)}

if __name__ == "__main__":
    dataset = load_data("./train/", "./test/", "./val/", "./.cache_dir")
    tokenized_data = tokenized_dataset(dataset)
I would like to modify the function tokenized_dataset because it creates a very heavy dictionary. The dataset created by that function will be reused for ML training, and dragging that dictionary around during the training slows it down a lot.
Be aware that document is similar to
[['eleven politicians from 7 parties made comments in letter to a newspaper .',
"said dpp alison saunders had ` damaged public confidence ' in justice .",
'ms saunders ruled lord janner unfit to stand trial over child abuse claims .',
'the cps has pursued at least 19 suspected paedophiles with dementia .'],
['an increasing number of surveys claim to reveal what makes us happiest .',
'but are these generic lists really of any use to us ?',
'janet street-porter makes her own list - of things making her unhappy !'],
["author of ` into the wild ' spoke to five rape victims in missoula , montana .",
"` missoula : rape and the justice system in a college town ' was released april 21 .",
"three of five victims profiled in the book sat down with abc 's nightline wednesday night .",
'kelsey belnap , allison huguet and hillary mclaughlin said they had been raped by university of montana football '
'players .',
"huguet and mclaughlin 's attacker , beau donaldson , pleaded guilty to rape in 2012 and was sentenced to 10 years .",
'belnap claimed four players gang-raped her in 2010 , but prosecutors never charged them citing lack of probable '
'cause .',
'mr krakauer wrote book after realizing close friend was a rape victim .'],
['tesco announced a record annual loss of £ 6.38 billion yesterday .',
'drop in sales , one-off costs and pensions blamed for financial loss .',
'supermarket giant now under pressure to close 200 stores nationwide .',
'here , retail industry veterans , plus mail writers , identify what went wrong .'],
...,
['snp leader said alex salmond did not field questions over his family .',
"said she was not ` moaning ' but also attacked criticism of women 's looks .",
'she made the remarks in latest programme profiling the main party leaders .',
'ms sturgeon also revealed her tv habits and recent image makeover .',
'she said she relaxed by eating steak and chips on a saturday night .']]
So in the dictionary the keys are just strings, but the values are all lists of lists of strings. Instead of having each value be a list of lists of strings, I think it is better to create some sort of list of objects. That would make the dictionary much lighter. How can I do that?
EDIT
For me, there is a difference between copying a list of lists of strings and copying a list of objects. Copying an object simply copies the reference, while copying a list of strings copies everything, so it is much faster to copy the reference instead. This is the point of this question.
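One possible direction (a sketch, assuming the DatasetDict layout above, where each "article" entry is a list of sentences, and dataset comes from load_data): let datasets.map() store the tokenized columns in its disk-backed Arrow cache, so the training loop carries references to memory-mapped data instead of a huge in-memory dict of tensors.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_fn(example):
    # each example["article"] is a list of sentences, as in the question
    return tokenizer(example["article"], padding=True, truncation=True)

# map() writes the new columns to the on-disk Arrow cache, so only
# lightweight references get copied around afterwards
tokenized = dataset.map(tokenize_fn)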
I want to do a noisy resolution such that, given a personal pronoun, that pronoun is replaced by the previous (nearest) person.
For example:
Alex is looking at buying a U.K. startup for $1 billion. He is very confident that this is going to happen. Sussan is also in the same situation. However, she has lost hope.
the output is:
Alex is looking at buying a U.K. startup for $1 billion. Alex is very confident that this is going to happen. Sussan is also in the same situation. However, Sussan has lost hope.
Another example,
Peter is a friend of Gates. But Gates does not like him.
In this case, the output would be :
Peter is a friend of Gates. But Gates does not like Gates.
Yes! This is super noisy.
Using spacy:
I have extracted the Person using NER, but how can I replace pronouns appropriately?
Code:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Alex is looking at buying a U.K. startup for $1 billion. He is very confident that this is going to happen. Sussan is also in the same situation. However, she has lost hope.")
for ent in doc.ents:
    if ent.label_ == 'PERSON':
        print(ent.text, ent.label_)
There is a dedicated neuralcoref library to resolve coreference. See the minimal reproducible example below:
import spacy
import neuralcoref
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp)
doc = nlp(
'''Alex is looking at buying a U.K. startup for $1 billion.
He is very confident that this is going to happen.
Sussan is also in the same situation.
However, she has lost hope.
Peter is a friend of Gates. But Gates does not like him.
''')
print(doc._.coref_resolved)
Alex is looking at buying a U.K. startup for $1 billion.
Alex is very confident that this is going to happen.
Sussan is also in the same situation.
However, Sussan has lost hope.
Peter is a friend of Gates. But Gates does not like Peter.
Note, you may have some issues with neuralcoref if you pip install it, so it's better to build it from source, as I outlined here.
I have written a function that works for your two examples:
Consider using a larger model such as en_core_web_lg for more accurate tagging.
import spacy
from string import punctuation

nlp = spacy.load("en_core_web_lg")

def pronoun_coref(text):
    doc = nlp(text)
    # personal pronouns (PRP) with their token indices
    pronouns = [(tok, tok.i) for tok in doc if tok.tag_ == "PRP"]
    # PERSON entities with the index of their first token
    names = [(ent.text, ent[0].i) for ent in doc.ents if ent.label_ == 'PERSON']
    doc = [tok.text_with_ws for tok in doc]
    for p in pronouns:
        # nearest PERSON mention that appears before the pronoun
        replace = max(filter(lambda x: x[1] < p[1], names),
                      key=lambda x: x[1], default=False)
        if replace:
            replace = replace[0]
            # restore the spacing around the replacement
            if doc[p[1] - 1] in punctuation:
                replace = ' ' + replace
            if doc[p[1] + 1] not in punctuation:
                replace = replace + ' '
            doc[p[1]] = replace
    doc = ''.join(doc)
    return doc
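A quick usage example on the first example from the question (the exact result depends on the model's tagging):
text = ("Alex is looking at buying a U.K. startup for $1 billion. "
        "He is very confident that this is going to happen.")
print(pronoun_coref(text))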
I have text data to be cleaned using regex. However, some words in the text are immediately followed by numbers which I want to remove.
For example, one row of the text is:
Preface2 Contributors4 Abrreviations5 Acknowledgements8 Pes
terminology10 Lessons learnt from the RUPES project12 Payment for
environmental service and it potential and example in Vietnam16
Chapter Integrating payment for ecosystem service into Vietnams policy
and programmes17 Chapter Creating incentive for Tri An watershed
protection20 Chapter Sustainable financing for landscape beauty in
Bach Ma National Park 24 Chapter Building payment mechanism for carbon
sequestration in forestry a pilot project in Cao Phong district of Hoa
Binh province Vietnam26 Chapter 5 Local revenue sharing Nha Trang Bay
Marine Protected Area Vietnam28 Synthesis and Recommendations30
References32
The first word in the above text should be 'preface' instead of 'preface2' and so on.
line = re.sub(r"[A-Za-z]+(\d+)", "", line)
This, however, removes the words as well, as seen:
Pes Lessons learnt from the RUPES Payment for environmental service
and it potential and example in Chapter Integrating payment for
ecosystem service into Vietnams policy and Chapter Creating incentive
for Tri An watershed Chapter Sustainable financing for landscape
beauty in Bach Ma National Park 24 Chapter Building payment mechanism
for carbon sequestration in forestry a pilot project in Cao Phong
district of Hoa Binh province Chapter 5 Local revenue sharing Nha
Trang Bay Marine Protected Area Synthesis and
How can I capture only the numbers that immediately follow words?
You can capture the text part and substitute the whole match with just that captured part:
re.sub(r"([A-Za-z]+)\d+", r"\1", line)
You could try a lookbehind assertion to check for a word character before your numbers, plus a word boundary (\b) at the end to force your regex to only match numbers at the end of a word (Python's re module requires fixed-width lookbehinds, hence (?<=\w) rather than (?<=\w+)):
re.sub(r'(?<=\w)\d+\b', '', line)
Hope this helps
EDIT:
Sorry about the glitch, mentioned in the comments, of matching numbers that are NOT preceded by words as well. That is because (sorry again) \w matches alphanumeric characters instead of only alphabetic ones. Depending on what you would like to delete, you can use the positive version
re.sub(r'(?<=[a-zA-Z])\d+\b', '', line)
to only check for English alphabetic characters (you can add characters to the [a-zA-Z] list) preceding your number, or the negative version
re.sub(r'(?<![\d\s])\d+\b', '', line)
to match anything that is NOT \d (numbers) or \s (spaces) before your desired number. This will also match punctuation marks though.
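For instance, on a fragment of the sample line the standalone 24 and 5 survive while the attached 26 is removed:
import re

line = "Bach Ma National Park 24 Chapter Vietnam26 Chapter 5"
print(re.sub(r'(?<=[a-zA-Z])\d+\b', '', line))
# -> Bach Ma National Park 24 Chapter Vietnam Chapter 5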
Try this:
line = re.sub(r"([A-Za-z]+)(\d+)", "\\2", line) #just keep the number
line = re.sub(r"([A-Za-z]+)(\d+)", "\\1", line) #just keep the word
line = re.sub(r"([A-Za-z]+)(\d+)", r"\2", line) #same as first one
line = re.sub(r"([A-Za-z]+)(\d+)", r"\1", line) #same as second one
\\1 refers to the captured word, \\2 to the number. See: How to use python regex to replace using captured group?
Below, I'm proposing a working sample of code that might solve your problem.
Here's the snippet:
import re

# A function that takes the text data as input and returns the
# desired result as stated in your question.
def transform(data):
    """Replace words ending with a number by the same words without the number."""
    # First, let's construct a pattern matching the words we're looking for
    pattern1 = r"([A-Za-z]+\d+)"
    # Another pattern that strips the trailing digits from a matched word
    pattern2 = r"\d+$"
    # Find all matching words
    matches = re.findall(pattern1, data)
    # Construct the replacement for each word
    replacements = []
    for match in matches:
        replacements.append(re.sub(pattern2, '', match))
    # Pair each word with its replacement, for use with str.replace
    changers = zip(matches, replacements)
    # Now replace every matched word (strings are immutable, so rebind output)
    output = data
    for changer in changers:
        output = output.replace(*changer)
    # The work is done, we can return the result
    return output
For testing purposes, we run the above function on your test data:
data = """
Preface2 Contributors4 Abrreviations5 Acknowledgements8 Pes terminology10 Lessons
learnt from the RUPES project12 Payment for environmental service and it potential and
example in Vietnam16 Chapter Integrating payment for ecosystem service into Vietnams
policy and programmes17 Chapter Creating incentive for Tri An watershed protection20
Chapter Sustainable financing for landscape beauty in Bach Ma National Park 24 Chapter
Building payment mechanism for carbon sequestration in forestry a pilot project in Cao
Phong district of Hoa Binh province Vietnam26 Chapter 5 Local revenue sharing Nha Trang
Bay Marine Protected Area Vietnam28 Synthesis and Recommendations30 References32
"""
result = transform(data)
print(result)
And the result looks like this:
Preface Contributors Abrreviations Acknowledgements Pes terminology Lessons learnt from
the RUPES project Payment for environmental service and it potential and example in
Vietnam Chapter Integrating payment for ecosystem service into Vietnams policy and
programmes Chapter Creating incentive for Tri An watershed protection Chapter
Sustainable financing for landscape beauty in Bach Ma National Park 24 Chapter Building
payment mechanism for carbon sequestration in forestry a pilot project in Cao Phong
district of Hoa Binh province Vietnam Chapter 5 Local revenue sharing Nha Trang Bay
Marine Protected Area Vietnam Synthesis and Recommendations References
You can also use a character range for the digits:
re.sub(r"[0-9]", "", line)
I want to make a list of sentences from a string and then print them out. I don't want to use NLTK to do this, so it needs to split on a period at the end of a sentence, but not at decimals, abbreviations, titles of names, or when a sentence contains a .com. Here is my attempt at a regex, which doesn't work:
import re
text = """\
Mr. Smith bought cheapsite.com for 1.5 million dollars, i.e. he paid a lot for it. Did he mind? Adam Jones Jr. thinks he didn't. In any case, this isn't true... Well, with a probability of .9 it isn't.
"""
sentences = re.split(r' *[\.\?!][\'"\)\]]* *', text)
for stuff in sentences:
    print(stuff)
Example output of what it should look like
Mr. Smith bought cheapsite.com for 1.5 million dollars, i.e. he paid a lot for it.
Did he mind?
Adam Jones Jr. thinks he didn't.
In any case, this isn't true...
Well, with a probability of .9 it isn't.
(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s
Try this to split your string. You can also check the demo:
http://regex101.com/r/nG1gU7/27
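Applied in Python to the text from the question, this produces the expected split:
import re

text = ("Mr. Smith bought cheapsite.com for 1.5 million dollars, i.e. he paid "
        "a lot for it. Did he mind? Adam Jones Jr. thinks he didn't. In any "
        "case, this isn't true... Well, with a probability of .9 it isn't.")
for s in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text):
    print(s)
# prints each of the five expected sentences on its own line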
Ok so sentence-tokenizers are something I looked at in a little detail, using regexes, nltk, CoreNLP, spaCy. You end up writing your own and it depends on the application. This stuff is tricky and valuable and people don't just give their tokenizer code away. (Ultimately, tokenization is not a deterministic procedure, it's probabilistic, and also depends very heavily on your corpus or domain, e.g. legal/financial documents vs social-media posts vs Yelp reviews vs biomedical papers...)
In general you can't rely on one single Great White infallible regex, you have to write a function which uses several regexes (both positive and negative); also a dictionary of abbreviations, and some basic language parsing which knows that e.g. 'I', 'USA', 'FCC', 'TARP' are capitalized in English.
To illustrate how easily this can get seriously complicated, let's try to write you that functional spec for a deterministic tokenizer just to decide whether single or multiple period ('.'/'...') indicates end-of-sentence, or something else:
function isEndOfSentence(leftContext, rightContext)
Return False for decimals inside numbers or currency e.g. 1.23 , $1.23, "That's just my $.02" Consider also section references like 1.2.A.3.a, European date formats like 09.07.2014, IP addresses like 192.168.1.1, MAC addresses...
Return False (and don't tokenize into individual letters) for known abbreviations e.g. "U.S. stocks are falling" ; this requires a dictionary of known abbreviations. Anything outside that dictionary you will get wrong, unless you add code to detect unknown abbreviations like A.B.C. and add them to a list.
Ellipses '...' at ends of sentences are terminal, but in the middle of sentences are not. This is not as easy as you might think: you need to look at the left context and the right context, specifically whether the RHS is capitalized, and again consider capitalized words like 'I' and abbreviations. Here's an example proving the ambiguity: She asked me to stay... I left an hour later. (Was that one sentence or two? Impossible to determine.)
You may also want to write a few patterns to detect and reject miscellaneous non-sentence-ending uses of punctuation: emoticons :-), ASCII art, spaced ellipses . . . and other stuff esp. Twitter. (Making that adaptive is even harder). How do we tell if #midnight is a Twitter user, the show on Comedy Central, text shorthand, or simply unwanted/junk/typo punctuation? Seriously non-trivial.
After you handle all those negative cases, you could arbitrarily say that any isolated period followed by whitespace is likely to be an end of sentence. (Ultimately, if you really want to buy extra accuracy, you end up writing your own probabilistic sentence-tokenizer which uses weights, and training it on a specific corpus (e.g. legal texts, broadcast media, StackOverflow, Twitter, forum comments etc.).) Then you have to manually review exemplars and training errors. See the Manning and Jurafsky book or the Coursera course [a].
Ultimately you get as much correctness as you are prepared to pay for.
All of the above is clearly specific to the English-language/ abbreviations, US number/time/date formats. If you want to make it country- and language-independent, that's a bigger proposition, you'll need corpora, native-speaking people to label and QA it all, etc.
All of the above is still only ASCII, which is practically speaking only 96 characters. Allow the input to be Unicode, and things get harder still (and the training-set necessarily must be either much bigger or much sparser)
In the simple (deterministic) case, function isEndOfSentence(leftContext, rightContext) would return boolean, but in the more general sense, it's probabilistic: it returns a float 0.0-1.0 (confidence level that that particular '.' is a sentence end).
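To make the deterministic case concrete, here is a minimal sketch of such a function; the abbreviation set and the capitalization heuristic are placeholder assumptions, not a complete spec:
KNOWN_ABBREVIATIONS = {"mr.", "mrs.", "dr.", "jr.", "i.e.", "e.g.", "u.s."}  # placeholder list

def is_end_of_sentence(left_context, right_context):
    # left_context is assumed to end with the candidate '.' itself
    words = left_context.split()
    last_word = words[-1].lower() if words else ""
    # decimals/currency such as 1.23 or $.02: a digit immediately to the right
    if right_context[:1].isdigit():
        return False
    # known abbreviations never end a sentence here (a simplification)
    if last_word in KNOWN_ABBREVIATIONS:
        return False
    # otherwise, call it an end if the right side starts with a capital (or is empty)
    stripped = right_context.lstrip()
    return stripped == "" or stripped[:1].isupper()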
References: [a] Coursera video: "Basic Text Processing 2-5 - Sentence Segmentation - Stanford NLP - Professor Dan Jurafsky & Chris Manning" [UPDATE: an unofficial version used to be on YouTube, was taken down]
Try to split the input on the spaces rather than on a dot or ?; if you do it like this, then the dot or ? won't be removed from the final result.
>>> import re
>>> s = """Mr. Smith bought cheapsite.com for 1.5 million dollars, i.e. he paid a lot for it. Did he mind? Adam Jones Jr. thinks he didn't. In any case, this isn't true... Well, with a probability of .9 it isn't."""
>>> m = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', s)
>>> for i in m:
...     print(i)
...
Mr. Smith bought cheapsite.com for 1.5 million dollars, i.e. he paid a lot for it.
Did he mind?
Adam Jones Jr. thinks he didn't.
In any case, this isn't true...
Well, with a probability of .9 it isn't.
sent = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)(\s|[A-Z].*)', text)
for s in sent:
    print(s)
The regex used here is: (?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)(\s|[A-Z].*)
First block: (?<!\w\.\w.): a negative lookbehind (?<!) that rejects split points preceded by a word character (\w), a full stop (\.), another word character and one more character; this protects abbreviations such as i.e.
Second block: (?<![A-Z][a-z]\.): a negative lookbehind that rejects split points preceded by an uppercase letter ([A-Z]), a lowercase letter ([a-z]) and a dot (\.); this protects titles such as Mr.
Third block: (?<=\.|\?): a positive lookbehind requiring a dot (\.) OR a question mark (\?) immediately before the split point.
Fourth block: (\s|[A-Z].*): matches what comes after the dot or question mark: either whitespace (\s) OR a sequence of characters starting with an uppercase letter ([A-Z].*).
This block is important for splitting input such as
Hello world.Hi I am here today.
i.e. whether there is a space after the dot or not.
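For instance (note that re.split keeps the captured group in the result, and a trailing empty string appears because the match runs to the end of the input):
import re

text = "Hello world.Hi I am here today."
print(re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)(\s|[A-Z].*)', text))
# -> ['Hello world.', 'Hi I am here today.', '']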
A naive approach for proper English sentences that do not start with non-alphabetic characters and do not contain quoted parts of speech:
import re
text = """\
Mr. Smith bought cheapsite.com for 1.5 million dollars, i.e. he paid a lot for it. Did he mind? Adam Jones Jr. thinks he didn't. In any case, this isn't true... Well, with a probability of .9 it isn't.
"""
EndPunctuation = re.compile(r'([\.\?\!]\s+)')
NonEndings = re.compile(r'(?:Mrs?|Jr|i\.e)\.\s*$')
parts = EndPunctuation.split(text)
sentence = []
for part in parts:
    if len(part) and len(sentence) and EndPunctuation.match(sentence[-1]) and not NonEndings.search(''.join(sentence)):
        print(''.join(sentence))
        sentence = []
    if len(part):
        sentence.append(part)
if len(sentence):
    print(''.join(sentence))
False positive splitting may be reduced by extending NonEndings a bit. Other cases will require additional code. Handling typos in a sensible way will prove difficult with this approach.
You will never reach perfection with this approach. But depending on the task it might just work "enough"...
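For example, a slightly extended NonEndings (the added abbreviations are illustrative, not exhaustive):
NonEndings = re.compile(r'(?:Mrs?|Ms|Dr|Prof|Jr|Sr|i\.e|e\.g)\.\s*$')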
I'm not great at regular expressions, but a simpler, "brute force" version of the above is
sentence = re.compile(r"([\'\"][A-Z]|([A-Z][a-z]*\. )|[A-Z])(([a-z]*\.[a-z]*\.)|([A-Za-z0-9]*\.[A-Za-z0-9])|([A-Z][a-z]*\. [A-Za-z]*)|[^\.?]|[A-Za-z])*[\.?]")
which means the acceptable starting units are '[A-Z] or "[A-Z].
Please note that most regular expressions are greedy, so the order is very important when we use | (or). That's why I have written the i.e. regular expression first, followed by forms like Inc.
Try this (note that Python's built-in re module requires fixed-width lookbehinds, so an alternation of different lengths like this one needs the third-party regex module):
(?<!\b(?:[A-Z][a-z]|\d|[i.e]))\.(?!\b(?:com|\d+)\b)
I wrote this taking into consideration smci's comments above. It is a middle-of-the-road approach that doesn't require external libraries and doesn't use regex. It allows you to provide a list of abbreviations and accounts for sentences ended by terminators in wrappers, such as a period and quote: [.", ?', .)].
abbreviations = {'dr.': 'doctor', 'mr.': 'mister', 'bro.': 'brother', 'bro': 'brother', 'mrs.': 'mistress', 'ms.': 'miss', 'jr.': 'junior', 'sr.': 'senior', 'i.e.': 'for example', 'e.g.': 'for example', 'vs.': 'versus'}
terminators = ['.', '!', '?']
wrappers = ['"', "'", ')', ']', '}']
def find_sentences(paragraph):
    end = True
    sentences = []
    while end > -1:
        end = find_sentence_end(paragraph)
        if end > -1:
            sentences.append(paragraph[end:].strip())
            paragraph = paragraph[:end]
    sentences.append(paragraph)
    sentences.reverse()
    return sentences

def find_sentence_end(paragraph):
    possible_endings, contraction_locations = [], []
    contractions = abbreviations.keys()
    sentence_terminators = terminators + [terminator + wrapper for wrapper in wrappers for terminator in terminators]
    for sentence_terminator in sentence_terminators:
        t_indices = list(find_all(paragraph, sentence_terminator))
        possible_endings.extend(([] if not len(t_indices) else [[i, len(sentence_terminator)] for i in t_indices]))
    for contraction in contractions:
        c_indices = list(find_all(paragraph, contraction))
        contraction_locations.extend(([] if not len(c_indices) else [i + len(contraction) for i in c_indices]))
    possible_endings = [pe for pe in possible_endings if pe[0] + pe[1] not in contraction_locations]
    if len(paragraph) in [pe[0] + pe[1] for pe in possible_endings]:
        max_end_start = max([pe[0] for pe in possible_endings])
        possible_endings = [pe for pe in possible_endings if pe[0] != max_end_start]
    possible_endings = [pe[0] + pe[1] for pe in possible_endings if sum(pe) > len(paragraph) or (sum(pe) < len(paragraph) and paragraph[sum(pe)] == ' ')]
    end = (-1 if not len(possible_endings) else max(possible_endings))
    return end

def find_all(a_str, sub):
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1:
            return
        yield start
        start += len(sub)
I used Karl's find_all function from this entry: Find all occurrences of a substring in Python
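A quick check on a paragraph without abbreviations (note that the abbreviation keys above are lowercase, so the matching is case-sensitive as written):
paragraph = "He went to the store. Then he came back. Did he mind?"
print(find_sentences(paragraph))
# -> ['He went to the store.', 'Then he came back.', 'Did he mind?']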
My example is based on Ali's example, adapted to Brazilian Portuguese. Thanks, Ali.
ABREVIACOES = ['sra?s?', 'exm[ao]s?', 'ns?', 'nos?', 'doc', 'ac', 'publ', 'ex', 'lv', 'vlr?', 'vls?',
'exmo(a)', 'ilmo(a)', 'av', 'of', 'min', 'livr?', 'co?ls?', 'univ', 'resp', 'cli', 'lb',
'dra?s?', '[a-z]+r\(as?\)', 'ed', 'pa?g', 'cod', 'prof', 'op', 'plan', 'edf?', 'func', 'ch',
'arts?', 'artigs?', 'artg', 'pars?', 'rel', 'tel', 'res', '[a-z]', 'vls?', 'gab', 'bel',
'ilm[oa]', 'parc', 'proc', 'adv', 'vols?', 'cels?', 'pp', 'ex[ao]', 'eg', 'pl', 'ref',
'[0-9]+', 'reg', 'f[ilí]s?', 'inc', 'par', 'alin', 'fts', 'publ?', 'ex', 'v. em', 'v.rev']
ABREVIACOES_RGX = re.compile(r'(?:{})\.\s*$'.format('|\s'.join(ABREVIACOES)), re.IGNORECASE)
def sentencas(texto, min_len=5):
    # based on https://stackoverflow.com/questions/25735644/python-regex-for-splitting-text-into-sentences-sentence-tokenizing
    texto = re.sub(r'\s\s+', ' ', texto)
    EndPunctuation = re.compile(r'([\.\?\!]\s+)')
    parts = EndPunctuation.split(texto)
    sentencas = []
    sentence = []
    for part in parts:
        txt_sent = ''.join(sentence)
        q_len = len(txt_sent)
        if len(part) and len(sentence) and q_len >= min_len and \
                EndPunctuation.match(sentence[-1]) and \
                not ABREVIACOES_RGX.search(txt_sent):
            sentencas.append(txt_sent)
            sentence = []
        if len(part):
            sentence.append(part)
    if sentence:
        sentencas.append(''.join(sentence))
    return sentencas
Full code in: https://github.com/luizanisio/comparador_elastic
If you want to break up sentences at 3 periods (not sure if this is what you want) you can use this regular expression:
import re
text = """\
Mr. Smith bought cheapsite.com for 1.5 million dollars, i.e. he paid a lot for it. Did he mind? Adam Jones Jr. thinks he didn't. In any case, this isn't true... Well, with a probability of .9 it isn't.
"""
sentences = re.split(r'\.{3}', text)
for stuff in sentences:
    print(stuff)