I would like to use the Matcher from spaCy on a list of spans (sents).
import spacy
from spacy.matcher import Matcher

# NP_pattern, VP_pattern and VVP_pattern are token-pattern lists defined elsewhere (not shown)

class Chunker:
    def __init__(self, nlp, matcher):
        self.nlp = nlp
        self.matcher = matcher
        self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")

    def on_match_callback(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        string_id = self.nlp.vocab.strings[match_id]
        span = doc[start:end]
        print("(", span, ")")
        self.phrase[string_id].append(span)

    def chunk(self, text):
        self.phrases = []
        doc = self.nlp(text)
        sents = list(doc.sents)
        for sent in sents:
            self.phrase = {
                "NP": [],
                "VP": [],
                "VVP": []
            }
            self.phrases.append(self.phrase)
            print("[", sent, "]")
            self.matcher(sent)
            for phrase in self.phrase.values():
                phrase.sort(key=lambda x: x.start)
        return self.phrases
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
chunker = Chunker(nlp, matcher)
phrases = chunker.chunk("Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.\nI love pdf, it is wonderfull.")
print(phrases)
but it seems confused and gives me this response:
[ Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.
]
( the HDF5 library )
( the Python language )
( the NumPy package )
( Pytables )
( top )
( is built on )
( using )
[ I love pdf, it is wonderfull. ]
( is )
( of )
( built )
[{'NP': [Pytables, top, the HDF5 library, the Python language, the NumPy package], 'VP': [is built on, using], 'VVP': []}, {'NP': [built], 'VP': [is, of], 'VVP': []}]
The first element is good, but not the second: {'NP': [built], 'VP': [is, of], 'VVP': []}.
Is there a problem with calling the matcher several times on different texts?
Instead of matching each sentence separately, I now run the matcher on the whole Doc and look up the sentence ID in the callback function. It works, but it looks a bit clumsy:
class Chunker:
    def __init__(self, nlp, matcher):
        self.nlp = nlp
        self.matcher = matcher
        self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")

    def on_match_callback(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        string_id = self.nlp.vocab.strings[match_id]
        span = doc[start:end]
        sents = list(doc.sents)
        sent_id = sents.index(span.sent)
        print("(", span, ")")
        print("Sentence number: ", sent_id)
        self.phrases[sent_id][string_id].append(span)

    def chunk(self, text):
        doc = self.nlp(text)
        self.phrases = [{"NP": [], "VP": [], "VVP": []} for _ in doc.sents]
        self.matcher(doc)
        for phrases in self.phrases:
            for phrase in phrases.values():
                phrase.sort(key=lambda x: x.start)
        return self.phrases
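For reference, a minimal driver for this version, mirroring the one from the first attempt (it assumes the same NP_pattern, VP_pattern and VVP_pattern lists, which are not shown above):

nlp = spacy.load("en_core_web_sm")
chunker = Chunker(nlp, Matcher(nlp.vocab))
phrases = chunker.chunk("Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.\nI love pdf, it is wonderfull.")
print(phrases)  # one {"NP": [...], "VP": [...], "VVP": [...]} dict per sentence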
I have written code that searches a text file for multiple terms, which in my case are 'Capex' and several others:
import spacy
import pandas as pd
from spacy.matcher import PhraseMatcher

nlp = spacy.load('en_core_web_sm')
pm = PhraseMatcher(nlp.vocab)
tipe = PhraseMatcher(nlp.vocab)
doc = nlp(text)  # text holds the contents of the file (loaded elsewhere)
sents = [sent for sent in doc.sents]
phrases = ['capex', 'capacity expansion', 'Capacity expansion', 'CAPEX', 'Capacity Expansion', 'Capex']
patterns = [nlp(text) for text in phrases]
pm.add('CAPEX', None, *patterns)
matches = pm(doc)
After finding where these terms occur in the text file, I get the sentence in which each term was used. I then search that sentence further for the date, value and type of 'CAPEX'.
The issue I am facing is that there are sentences where the type of 'CAPEX' ('Greenfield', etc.) appears multiple times, yet my code only runs as many times as there are matches of the word 'CAPEX'. Is there a solution to align all of these into one DataFrame?
def findmatch(doc, phrases, name):
    p = phrases
    pa = [nlp(text) for text in p]
    name = PhraseMatcher(nlp.vocab)
    name.add('Type', None, *pa)
    results = name(doc)
    return results

def getext(matches):
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]
        span = doc[start:end]
        text = span.text
    return text

allcapex = pd.DataFrame(columns=['Type', 'Value', 'Date', 'business segment', 'Location', 'source'])

for ind, match in enumerate(matches):
    for sent in sents:
        if matches[ind][1] < sent.end:
            typematches = findmatch(sent, ['Greenfield', 'greenfield', 'brownfield', 'Brownfield', 'de-bottlenecking', 'De-bottlenecking'], 'Type')
            valuematches = findmatch(sent, ['Crore', 'Cr', 'crore', 'cr'], 'Value')
            datematches = findmatch(sent, ['2020', '2021', '2022', '2023', '2024', '2025', 'FY21', 'FY22', 'FY23', 'FY24', 'FY25', 'FY26'], 'Date')
            capextype = getext(typematches)
            capexvalue = getext(valuematches)
            capexdate = getext(datematches)
            allcapex.loc[len(allcapex.index)] = [capextype, capexvalue, capexdate, '', '', sent]
            break

print(allcapex)
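One possible adjustment (a sketch of a hypothetical helper assuming spaCy v3+, not the asker's code): collect every match in the sentence instead of keeping only the last one, so a sentence that mentions several CAPEX types contributes all of them to the DataFrame row. as_spans=True asks the matcher to return Span objects directly, which avoids the manual index arithmetic.

def getext_all(doclike, phrases, label):
    # hypothetical helper: build a PhraseMatcher and return every matched text
    pm = PhraseMatcher(nlp.vocab)
    pm.add(label, [nlp(p) for p in phrases])
    return [span.text for span in pm(doclike, as_spans=True)]

# usage idea: capextype = ', '.join(getext_all(sent, ['Greenfield', 'greenfield', 'brownfield', 'Brownfield'], 'Type'))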
I'm trying to write Python code that does Aspect-Based Sentiment Analysis of product reviews using a dependency parser. I created an example review:
"The Sound Quality is great but the battery life is bad."
The output is: [['soundquality', ['great']], ['batterylife', ['bad']]]
I can properly get the aspect and its adjective with this sentence, but when I change the text to:
"The Sound Quality is not great but the battery life is not bad."
the output stays the same. How can I add negation handling to my code? And are there ways to improve what I currently have?
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import stanfordnlp

stanfordnlp.download('en')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

txt = "The Sound Quality is not great but the battery life is not bad."
txt = txt.lower()
sentList = nltk.sent_tokenize(txt)

taggedList = []
for line in sentList:
    txt_list = nltk.word_tokenize(line)  # tokenize sentence
    taggedList = taggedList + nltk.pos_tag(txt_list)  # perform POS-Tagging
print(taggedList)

newwordList = []
flag = 0
for i in range(0, len(taggedList)-1):
    if(taggedList[i][1]=='NN' and taggedList[i+1][1]=='NN'):
        newwordList.append(taggedList[i][0]+taggedList[i+1][0])
        flag=1
    else:
        if(flag == 1):
            flag=0
            continue
        newwordList.append(taggedList[i][0])
        if(i==len(taggedList)-2):
            newwordList.append(taggedList[i+1][0])
finaltxt = ' '.join(word for word in newwordList)
print(finaltxt)

stop_words = set(stopwords.words('english'))
new_txt_list = nltk.word_tokenize(finaltxt)
wordsList = [w for w in new_txt_list if not w in stop_words]
taggedList = nltk.pos_tag(wordsList)

nlp = stanfordnlp.Pipeline()
doc = nlp(finaltxt)
dep_node = []
for dep_edge in doc.sentences[0].dependencies:
    dep_node.append([dep_edge[2].text, dep_edge[0].index, dep_edge[1]])
for i in range(0, len(dep_node)):
    if(int(dep_node[i][1]) != 0):
        dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]
print(dep_node)

featureList = []
categories = []
totalfeatureList = []
for i in taggedList:
    if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
        featureList.append(list(i))
        totalfeatureList.append(list(i))  # stores all the features for every sentence
        categories.append(i[0])
print(featureList)
print(categories)

fcluster = []
for i in featureList:
    filist = []
    for j in dep_node:
        if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
            if(j[0]==i[0]):
                filist.append(j[1])
            else:
                filist.append(j[0])
    fcluster.append([i[0], filist])
print(fcluster)

finalcluster = []
dic = {}
for i in featureList:
    dic[i[0]] = i[1]
for i in fcluster:
    if(dic[i[0]]=='NN'):
        finalcluster.append(i)
print(finalcluster)
You may wish to try spaCy. The following pattern will catch:
a noun phrase
followed by is or are
optionally followed by not
followed by an adjective
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
output = []
doc = nlp('The product is very good')
matcher = Matcher(nlp.vocab)
matcher.add("mood", None, [{"LOWER": {"IN": ["is", "are"]}}, {"LOWER": {"IN": ["no", "not"]}, "OP": "?"}, {"LOWER": "very", "OP": "?"}, {"POS": "ADJ"}])

for nc in doc.noun_chunks:
    d = doc[nc.root.right_edge.i + 1 : nc.root.right_edge.i + 1 + 3]
    matches = matcher(d)
    if matches:
        _, start, end = matches[0]
        output.append((nc.text, d[start + 1:end].text))
print(output)
[('The product', 'very good')]
Alternatively, you may broaden the matching pattern with info from the dependency parser by adding a definition of an adjectival phrase:
output = []
matcher = Matcher(nlp.vocab, validate=True)
matcher.add("mood", None, [{"LOWER": {"IN": ["is", "are"]}}, {"LOWER": {"IN": ["no", "not"]}, "OP": "?"}, {"DEP": "advmod", "OP": "?"}, {"DEP": "acomp"}])

for nc in doc.noun_chunks:
    d = doc[nc.root.right_edge.i + 1 : nc.root.right_edge.i + 1 + 3]
    matches = matcher(d)
    if matches:
        _, start, end = matches[0]
        output.append((nc.text, d[start + 1:end].text))
print(output)
[('The product', 'very good')]
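Applied to the negated review from the question, the same loop should keep the negation token, since both patterns allow an optional "not" after is/are and the appended span starts right after the verb (a quick sketch; the expected output is my guess, not verified):

doc = nlp('The Sound Quality is not great but the battery life is not bad.')
output = []
for nc in doc.noun_chunks:
    d = doc[nc.root.right_edge.i + 1 : nc.root.right_edge.i + 1 + 3]
    matches = matcher(d)
    if matches:
        _, start, end = matches[0]
        output.append((nc.text, d[start + 1:end].text))
print(output)
# expected: [('The Sound Quality', 'not great'), ('the battery life', 'not bad')]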
I have a function based on nltk.pos_tag that filters collocations from text, keeping only adjective (JJ) + noun (NN) pairs.
import nltk
import pandas as pd
from nltk import word_tokenize

f1 = u'this is my random text'
tokens = word_tokenize(f1)
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram', 'freq']).sort_values(by='freq', ascending=False)
print(bigramFreqTable)

def rightTypes(ngram):
    first_type = ('JJ')
    second_type = ('NN')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in first_type and tags[1][1] in second_type:
        return True
    else:
        return False

filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]
print(filtered_bi)
I would like to use a spaCy method instead of nltk.pos_tag. Below is example code from the spaCy documentation:
import spacy
from spacy.lang.en.examples import sentences

nlp = spacy.load('en_core_web_sm')
doc = nlp(sentences[0])
print(doc.text)
for token in doc:
    print(token.text, token.pos_)
I tried different solutions, for example tags = [(X.text, X.tag_) for Y in nlp(ngram).ents for X in Y], but I get errors. Could you please help me use spaCy instead of NLTK?
Using spaCy's Matcher, you can create custom rules you want to match against.
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern)
doc = nlp("Hello, world! Hello world!")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)
You can use a pattern such as [{"TAG": "JJ"}, {"TAG": "NN"}] to meet your requirements. Note that JJ and NN are fine-grained tags, so they go under "TAG"; the coarse-grained equivalent would be [{"POS": "ADJ"}, {"POS": "NOUN"}].
I have some sentences that I need to convert to regex code and I was trying to use Pyparsing for it. The sentences are basically search rules, telling us what to search for.
Examples of sentences:
LINE_CONTAINS this is a phrase - an example search rule saying that the line being searched should contain the phrase this is a phrase
LINE_STARTSWITH However we - an example search rule saying that the line being searched should start with the phrase However we
The rules can also be combined, e.g. LINE_CONTAINS phrase one BEFORE {phrase2 AND phrase3} AND LINE_STARTSWITH However we
Now, I am trying to parse these sentences and then convert them to regex code. All lines start with one of the two directives mentioned above (call them line_directives). I want to be able to recognize these line_directives and parse them appropriately, and do the same for the phrases that follow them, albeit parsed differently. Using help from Paul McGuire (here) and my own inputs, I have the following code:
from pyparsing import *
import re

UPTO, AND, OR, WORDS = map(Literal, "upto AND OR words".split())
keyword = UPTO | WORDS | AND | OR
LBRACE, RBRACE = map(Suppress, "{}")
integer = pyparsing_common.integer()

LINE_CONTAINS, LINE_STARTSWITH, LINE_ENDSWITH = map(Literal,
    """LINE_CONTAINS LINE_STARTSWITH LINE_ENDSWITH""".split())  # put option for LINE_ENDSWITH. Users may use, I don't presently
BEFORE, AFTER, JOIN = map(Literal, "BEFORE AFTER JOIN".split())

word = ~keyword + Word(alphas)
phrase = Group(OneOrMore(word))
upto_expr = Group(LBRACE + UPTO + integer("numberofwords") + WORDS + RBRACE)

class Node(object):
    def __init__(self, tokens):
        self.tokens = tokens

    def generate(self):
        pass

class LiteralNode(Node):
    def generate(self):
        print(self.tokens[0], 20)
        for el in self.tokens[0]:
            print(el, type(el), 19)
        print(type(self.tokens[0]), 18)
        return "(%s)" % (' '.join(self.tokens[0]))  # here, merged the elements, so that re.escape does not have to do an escape for the entire list

    def __repr__(self):
        return repr(self.tokens[0])

class AndNode(Node):
    def generate(self):
        tokens = self.tokens[0]
        return '.*'.join(t.generate() for t in tokens[::2])  # change this to the correct form of AND in regex

    def __repr__(self):
        return ' AND '.join(repr(t) for t in self.tokens[0].asList()[::2])

class OrNode(Node):
    def generate(self):
        tokens = self.tokens[0]
        return '|'.join(t.generate() for t in tokens[::2])

    def __repr__(self):
        return ' OR '.join(repr(t) for t in self.tokens[0].asList()[::2])

class UpToNode(Node):
    def generate(self):
        tokens = self.tokens[0]
        ret = tokens[0].generate()
        print(123123)
        word_re = r"\s+\S+"
        space_re = r"\s+"
        for op, operand in zip(tokens[1::2], tokens[2::2]):
            # op contains the parsed "upto" expression
            ret += "((%s){0,%d}%s)" % (word_re, op.numberofwords, space_re) + operand.generate()
        print(ret)
        return ret

    def __repr__(self):
        tokens = self.tokens[0]
        ret = repr(tokens[0])
        for op, operand in zip(tokens[1::2], tokens[2::2]):
            # op contains the parsed "upto" expression
            ret += " {0-%d WORDS} " % (op.numberofwords) + repr(operand)
        return ret

phrase_expr = infixNotation(phrase,
    [
        ((BEFORE | AFTER | JOIN), 2, opAssoc.LEFT,),  # (opExpr, numTerms, rightLeftAssoc, parseAction)
        (AND, 2, opAssoc.LEFT,),
        (OR, 2, opAssoc.LEFT),
    ],
    lpar=Suppress('{'), rpar=Suppress('}')
)  # structure of a single phrase with its operators

line_term = Group((LINE_CONTAINS | LINE_STARTSWITH | LINE_ENDSWITH)("line_directive") +
                  Group(phrase_expr)("phrase"))  # basically giving structure to a single sub-rule having line-term and phrase

line_contents_expr = infixNotation(line_term,
    [(AND, 2, opAssoc.LEFT,),
     (OR, 2, opAssoc.LEFT),
    ])  # grammar for the entire rule/sentence

phrase_expr = infixNotation(line_contents_expr.setParseAction(LiteralNode),
    [
        (upto_expr, 2, opAssoc.LEFT, UpToNode),
        (AND, 2, opAssoc.LEFT, AndNode),
        (OR, 2, opAssoc.LEFT, OrNode),
    ])

tests1 = """LINE_CONTAINS overexpressing gene AND other things""".splitlines()
for t in tests1:
    t = t.strip()
    if not t:
        continue
    # print(t, 12)
    try:
        parsed = phrase_expr.parseString(t)
    except ParseException as pe:
        print(' ' * pe.loc + '^')
        print(pe)
        continue
    print(parsed[0], 14)
    print(type(parsed[0]))
    print(parsed[0].generate(), 15)
On running, this simple code gives the following output and error:
((['LINE_CONTAINS', ([(['overexpressing', 'gene'], {})], {})],
{'phrase': [(([(['overexpressing', 'gene'], {})], {}), 1)],
'line_directive': [('LINE_CONTAINS', 0)]}), 14)
((['LINE_CONTAINS', ([(['overexpressing', 'gene'], {})], {})],
{'phrase': [(([(['overexpressing', 'gene'], {})], {}), 1)],
'line_directive': [('LINE_CONTAINS', 0)]}), 20)
('LINE_CONTAINS', <, 19)
(([(['overexpressing', 'gene'], {})], {}), , 19)
(, 18)
TypeError: sequence item 1: expected string, ParseResults found (line
29)
(The error output is not reproduced exactly, as angle brackets are not well supported in the blockquote here.)
So my question is: even though I have written the grammar (using infixNotation) so that it treats LINE_CONTAINS as a line_directive and parses the rest of the line accordingly, why is it not able to parse properly? What is a good way to parse such lines?
Is there any way to implement skip-grams in the scikit-learn library?
I have manually generated a list of n-skip-grams and passed it as the vocabulary to CountVectorizer().
Unfortunately, its performance on prediction is very poor: only 63% accuracy.
However, I get an accuracy of 77-80% with CountVectorizer() using ngram_range=(min, max) in the default code.
Is there a better way to implement skip-grams in scikit-learn?
Here is the relevant part of my code:
from sklearn.feature_extraction.text import CountVectorizer

corpus = GetCorpus()  # this one gets text from a file as a list
vocabulary = list(GetVocabulary(corpus, k, n))  # this one returns a k-skip n-gram list

vec = CountVectorizer(
    tokenizer=lambda x: x.split(),
    ngram_range=(2, 2),
    stop_words=stopWords,
    vocabulary=vocabulary)
To vectorize text with skip-grams in scikit-learn, simply passing the skip-gram tokens as the vocabulary to CountVectorizer will not work. You need to modify the way tokens are processed, which can be done with a custom analyzer. Below is an example vectorizer that produces 1-skip-2-grams,
from toolz import itertoolz, compose
from toolz.curried import map as cmap, sliding_window, pluck
from sklearn.feature_extraction.text import CountVectorizer

class SkipGramVectorizer(CountVectorizer):
    def build_analyzer(self):
        preprocess = self.build_preprocessor()
        stop_words = self.get_stop_words()
        tokenize = self.build_tokenizer()
        return lambda doc: self._word_skip_grams(
            compose(tokenize, preprocess, self.decode)(doc),
            stop_words)

    def _word_skip_grams(self, tokens, stop_words=None):
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]

        return compose(cmap(' '.join), pluck([0, 2]), sliding_window(3))(tokens)
For instance, on this Wikipedia example,
text = ['the rain in Spain falls mainly on the plain']
vect = SkipGramVectorizer()
vect.fit(text)
vect.get_feature_names()
this vectorizer would yield the following tokens,
['falls on', 'in falls', 'mainly the', 'on plain',
'rain spain', 'spain mainly', 'the in']
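Since the class subclasses CountVectorizer, it should drop into the usual scikit-learn workflow; here is a minimal sketch of my own (the classifier is chosen purely for illustration, and train_texts/train_labels are placeholders):

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(SkipGramVectorizer(), LogisticRegression())
# pipe.fit(train_texts, train_labels)
# pipe.predict(test_texts)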
I came up with my own implementation of a skip-gram vectorizer. It is inspired by this post. I also limited skip-grams to not cross sentence boundaries (using nltk.sent_tokenize), to limit the feature space. Here is my code:
import nltk
from itertools import combinations
from toolz import compose
from sklearn.feature_extraction.text import CountVectorizer

class SkipGramVectorizer(CountVectorizer):
    def __init__(self, k=1, **kwds):
        super(SkipGramVectorizer, self).__init__(**kwds)
        self.k = k

    def build_sent_analyzer(self, preprocess, stop_words, tokenize):
        return lambda sent: self._word_skip_grams(
            compose(tokenize, preprocess, self.decode)(sent),
            stop_words)

    def build_analyzer(self):
        preprocess = self.build_preprocessor()
        stop_words = self.get_stop_words()
        tokenize = self.build_tokenizer()
        sent_analyze = self.build_sent_analyzer(preprocess, stop_words, tokenize)

        return lambda doc: self._sent_skip_grams(doc, sent_analyze)

    def _sent_skip_grams(self, doc, sent_analyze):
        skip_grams = []
        for sent in nltk.sent_tokenize(doc):
            skip_grams.extend(sent_analyze(sent))
        return skip_grams

    def _word_skip_grams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of n-grams after stop words filtering"""
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]

        # handle token n-grams
        min_n, max_n = self.ngram_range
        k = self.k
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []

            n_original_tokens = len(original_tokens)

            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append
            space_join = " ".join

            for n in xrange(min_n,
                            min(max_n + 1, n_original_tokens + 1)):
                for i in xrange(n_original_tokens - n + 1):
                    # k-skip-n-grams
                    head = [original_tokens[i]]
                    for skip_tail in combinations(original_tokens[i+1:i+n+k], n-1):
                        tokens_append(space_join(head + list(skip_tail)))
        return tokens

def test(text, ngram_range, k):
    vectorizer = SkipGramVectorizer(ngram_range=ngram_range, k=k)
    vectorizer.fit_transform(text)
    print(vectorizer.get_feature_names())

def main():
    text = ['Insurgents killed in ongoing fighting.']

    # 2-skip-bi-grams
    test(text, (2, 2), 2)

    # 2-skip-tri-grams
    test(text, (3, 3), 2)

###############################################################################################
if __name__ == '__main__':
    main()
This would generate the following feature names:
[u'in fighting', u'in ongoing', u'insurgents in', u'insurgents killed', u'insurgents ongoing', u'killed fighting', u'killed in', u'killed ongoing', u'ongoing fighting']
[u'in ongoing fighting', u'insurgents in fighting', u'insurgents in ongoing', u'insurgents killed fighting', u'insurgents killed in', u'insurgents killed ongoing', u'insurgents ongoing fighting', u'killed in fighting', u'killed in ongoing', u'killed ongoing fighting']
Notice that I basically took the _word_ngrams function from the VectorizerMixin class and replaced the line
tokens_append(space_join(original_tokens[i: i + n]))
with the following:
head = [original_tokens[i]]
for skip_tail in combinations(original_tokens[i+1:i+n+k], n-1):
    tokens_append(space_join(head + list(skip_tail)))
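As a concrete illustration of what that change does, here is a small worked example of my own for i=0, n=2, k=2; it produces the skip-bigrams starting with 'insurgents' that appear in the feature names above:

from itertools import combinations

original_tokens = ['insurgents', 'killed', 'in', 'ongoing', 'fighting']
i, n, k = 0, 2, 2
head = [original_tokens[i]]
for skip_tail in combinations(original_tokens[i+1:i+n+k], n-1):
    # combinations over the next n+k-1 tokens gives the skipped variants
    print(' '.join(head + list(skip_tail)))
# insurgents killed
# insurgents in
# insurgents ongoing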