I have written code that searches a text file for multiple terms, such as 'Capex' and several others in my case.
import spacy
from spacy.matcher import PhraseMatcher
import pandas as pd

nlp = spacy.load('en_core_web_sm')
pm = PhraseMatcher(nlp.vocab)
tipe = PhraseMatcher(nlp.vocab)

doc = nlp(text)  # text holds the contents of the text file being searched
sents = [sent for sent in doc.sents]

# case variants of the search term
phrases = ['capex', 'capacity expansion', 'Capacity expansion', 'CAPEX', 'Capacity Expansion', 'Capex']
patterns = [nlp(phrase) for phrase in phrases]
pm.add('CAPEX', None, *patterns)
matches = pm(doc)
Once I know where these terms occur in the text, I retrieve the sentence in which each term was used, and then search that sentence for the Date, Value and Type of the 'CAPEX'.
The issue I am facing is that a single sentence can contain the Type of 'CAPEX' (e.g. "Greenfield") multiple times, while my code only loops as many times as there are matches of the word 'CAPEX' itself. Is there a way to align all of these into one DataFrame?
def findmatch(doc, phrases, name):
    # build a PhraseMatcher for the given phrases and run it on the doc/sentence
    patterns = [nlp(text) for text in phrases]
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add(name, None, *patterns)
    return matcher(doc)

def getext(matches):
    # return the text of the first match (None when there is no match)
    for match_id, start, end in matches:
        span = doc[start:end]
        return span.text
    return None
allcapex = pd.DataFrame(columns=['Type', 'Value', 'Date', 'business segment', 'Location', 'source'])

for ind, match in enumerate(matches):
    for sent in sents:
        if matches[ind][1] < sent.end:
            typematches = findmatch(sent, ['Greenfield', 'greenfield', 'brownfield', 'Brownfield', 'de-bottlenecking', 'De-bottlenecking'], 'Type')
            valuematches = findmatch(sent, ['Crore', 'Cr', 'crore', 'cr'], 'Value')
            datematches = findmatch(sent, ['2020', '2021', '2022', '2023', '2024', '2025', 'FY21', 'FY22', 'FY23', 'FY24', 'FY25', 'FY26'], 'Date')
            capextype = getext(typematches)
            capexvalue = getext(valuematches)
            capexdate = getext(datematches)
            allcapex.loc[len(allcapex.index)] = [capextype, capexvalue, capexdate, '', '', sent]
            break

print(allcapex)
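For illustration only, a hypothetical variant of getext that keeps every match in the sentence instead of just the first one (so repeated types such as "Greenfield" could end up joined in a single DataFrame cell) might look like the sketch below; it reuses the same module-level doc that getext relies on:

def getallext(matches):
    # hypothetical helper: collect the text of every matched span, not just the first
    return [doc[start:end].text for match_id, start, end in matches]

# e.g. inside the loop above, several hits could be joined into one cell:
# capextype = '; '.join(getallext(typematches))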
I would like to use spaCy's Matcher on a list of spans (sents).
class Chunker:
    def __init__(self, nlp, matcher):
        self.nlp = nlp
        self.matcher = matcher
        self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")

    def on_match_callback(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        string_id = self.nlp.vocab.strings[match_id]
        span = doc[start:end]
        print("(", span, ")")
        self.phrase[string_id].append(span)

    def chunk(self, text):
        self.phrases = []
        doc = self.nlp(text)
        sents = list(doc.sents)
        for sent in sents:
            self.phrase = {
                "NP": [],
                "VP": [],
                "VVP": []
            }
            self.phrases.append(self.phrase)
            print("[", sent, "]")
            self.matcher(sent)
            for phrase in self.phrase.values():
                phrase.sort(key=lambda x: x.start)
        return self.phrases
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
chunker = Chunker(nlp, matcher)
phrases = chunker.chunk("Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.\nI love pdf, it is wonderfull.")
print(phrases)
but it seems confused and gives me this output:
[ Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.
]
( the HDF5 library )
( the Python language )
( the NumPy package )
( Pytables )
( top )
( is built on )
( using )
[ I love pdf, it is wonderfull. ]
( is )
( of )
( built )
[{'NP': [Pytables, top, the HDF5 library, the Python language, the NumPy package], 'VP': [is built on, using], 'VVP': []}, {'NP': [built], 'VP': [is, of], 'VVP': []}]
The first element is correct, but the second is not: {'NP': [built], 'VP': [is, of], 'VVP': []}.
Is there a problem with calling the matcher several times on different texts?
Instead of matching each sentence separately, I now check the sentence ID in the callback function. It works but looks a bit gross:
class Chunker:
    def __init__(self, nlp, matcher):
        self.nlp = nlp
        self.matcher = matcher
        self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")

    def on_match_callback(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        string_id = self.nlp.vocab.strings[match_id]
        span = doc[start:end]
        sents = list(doc.sents)
        sent_id = sents.index(span.sent)
        print("(", span, ")")
        print("Sentence number: ", sent_id)
        self.phrases[sent_id][string_id].append(span)

    def chunk(self, text):
        doc = self.nlp(text)
        self.phrases = [{"NP": [], "VP": [], "VVP": []} for _ in doc.sents]
        self.matcher(doc)
        for phrases in self.phrases:
            for phrase in phrases.values():
                phrase.sort(key=lambda x: x.start)
        return self.phrases
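A possibly cleaner alternative (only a sketch, assuming the same NP_pattern / VP_pattern / VVP_pattern from the question) is to keep the original per-sentence design but feed the matcher a standalone Doc for each sentence via Span.as_doc(), so match offsets and callback state can never leak across sentences:

    def chunk(self, text):
        self.phrases = []
        doc = self.nlp(text)
        for sent in doc.sents:
            self.phrase = {"NP": [], "VP": [], "VVP": []}
            self.phrases.append(self.phrase)
            # as_doc() copies the sentence into its own Doc, so the matcher
            # and the callback only ever see one sentence at a time
            self.matcher(sent.as_doc())
            for spans in self.phrase.values():
                spans.sort(key=lambda x: x.start)
        return self.phrases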
Quick question (hopefully): is it possible to get the named entities of the tokens except for the ones with the CARDINAL label (the label ID is 397)? Here is my code:
spacy_model = spacy.load('en_core_web_lg')
f = open('temp.txt')
tokens = spacy_model(f.read())
named_entities = tokens.ents  # except the ones whose label is CARDINAL (397)
Is this possible? Any help would be greatly appreciated.
You can filter out the entities using list comprehension:
named_entities = [t for t in tokens.ents if t.label_ != 'CARDINAL']
Here is a test:
import spacy
nlp = spacy.load("en_core_web_sm")
tokens = nlp('The basket costs $10. I bought 6.')
print([(ent.text, ent.label_) for ent in tokens.ents])
# => [('10', 'MONEY'), ('6', 'CARDINAL')]
print([t for t in tokens.ents if t.label_ != 'CARDINAL'])
# => [10]
I'm trying to write Python code that does Aspect-Based Sentiment Analysis of product reviews using a dependency parser. I created an example review:
"The Sound Quality is great but the battery life is bad."
The output is: [['soundquality', ['great']], ['batterylife', ['bad']]]
I can properly get the aspect and its adjective for this sentence, but when I change the text to:
"The Sound Quality is not great but the battery life is not bad."
the output still stays the same. How can I add negation handling to my code? And are there ways to improve what I currently have?
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import stanfordnlp
stanfordnlp.download('en')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
txt = "The Sound Quality is not great but the battery life is not bad."
txt = txt.lower()
sentList = nltk.sent_tokenize(txt)
taggedList = []
for line in sentList:
    txt_list = nltk.word_tokenize(line)  # tokenize sentence
    taggedList = taggedList + nltk.pos_tag(txt_list)  # perform POS tagging
print(taggedList)
newwordList = []
flag = 0
for i in range(0, len(taggedList)-1):
    if(taggedList[i][1]=='NN' and taggedList[i+1][1]=='NN'):
        newwordList.append(taggedList[i][0]+taggedList[i+1][0])
        flag=1
    else:
        if(flag == 1):
            flag=0
            continue
        newwordList.append(taggedList[i][0])
        if(i==len(taggedList)-2):
            newwordList.append(taggedList[i+1][0])
finaltxt = ' '.join(word for word in newwordList)
print(finaltxt)
stop_words = set(stopwords.words('english'))
new_txt_list = nltk.word_tokenize(finaltxt)
wordsList = [w for w in new_txt_list if not w in stop_words]
taggedList = nltk.pos_tag(wordsList)
nlp = stanfordnlp.Pipeline()
doc = nlp(finaltxt)
dep_node = []
for dep_edge in doc.sentences[0].dependencies:
    dep_node.append([dep_edge[2].text, dep_edge[0].index, dep_edge[1]])
for i in range(0, len(dep_node)):
    if(int(dep_node[i][1]) != 0):
        dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]
print(dep_node)
featureList = []
categories = []
totalfeatureList = []
for i in taggedList:
    if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
        featureList.append(list(i))
        totalfeatureList.append(list(i))  # stores all the features for every sentence
        categories.append(i[0])
print(featureList)
print(categories)
fcluster = []
for i in featureList:
    filist = []
    for j in dep_node:
        if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
            if(j[0]==i[0]):
                filist.append(j[1])
            else:
                filist.append(j[0])
    fcluster.append([i[0], filist])
print(fcluster)
finalcluster = []
dic = {}
for i in featureList:
    dic[i[0]] = i[1]
for i in fcluster:
    if(dic[i[0]]=='NN'):
        finalcluster.append(i)
print(finalcluster)
You may wish to try spaCy. The following pattern will catch:
a noun phrase
followed by is or are
optionally followed by not
followed by an adjective
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_sm')
output = []
doc = nlp('The product is very good')
matcher = Matcher(nlp.vocab)
matcher.add("mood",None,[{"LOWER":{"IN":["is","are"]}},{"LOWER":{"IN":["no","not"]},"OP":"?"},{"LOWER":"very","OP":"?"},{"POS":"ADJ"}])
for nc in doc.noun_chunks:
    d = doc[nc.root.right_edge.i+1:nc.root.right_edge.i+1+3]
    matches = matcher(d)
    if matches:
        _, start, end = matches[0]
        output.append((nc.text, d[start+1:end].text))
print(output)
[('The product', 'very good')]
Alternatively, you may broaden the matching pattern with information from the dependency parser, which adds the notion of an adjectival phrase:
output = []
matcher = Matcher(nlp.vocab, validate=True)
matcher.add("mood",None,[{"LOWER":{"IN":["is","are"]}},{"LOWER":{"IN":["no","not"]},"OP":"?"},{"DEP":"advmod","OP":"?"},{"DEP":"acomp"}])
for nc in doc.noun_chunks:
    d = doc[nc.root.right_edge.i+1:nc.root.right_edge.i+1+3]
    matches = matcher(d)
    if matches:
        _, start, end = matches[0]
        output.append((nc.text, d[start+1:end].text))
print(output)
[('The product', 'very good')]
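As a quick, untested check of the negated review from the question (reusing nlp and the matcher defined just above), the optional "no"/"not" token in the pattern means the negation should be captured together with the adjective:

doc = nlp('The battery life is not bad.')
output = []
for nc in doc.noun_chunks:
    d = doc[nc.root.right_edge.i+1:nc.root.right_edge.i+1+3]
    matches = matcher(d)
    if matches:
        _, start, end = matches[0]
        output.append((nc.text, d[start+1:end].text))
print(output)
# expected, assuming the parse behaves as in the example above: [('The battery life', 'not bad')]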
I have a function based on nltk.pos_tag that filters collocations from a text, keeping only bigrams of an adjective (JJ) followed by a noun (NN).
import nltk
from nltk import word_tokenize
import pandas as pd

f1 = u'this is my random text'
tokens = word_tokenize(f1)
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram', 'freq']).sort_values(by='freq', ascending=False)
print(bigramFreqTable)
def rightTypes(ngram):
    first_type = ('JJ')
    second_type = ('NN')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in first_type and tags[1][1] in second_type:
        return True
    else:
        return False
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]
print(filtered_bi)
I would like to use a spaCy method instead of nltk.pos_tag. Below is example code from the spaCy documentation.
import spacy
from spacy.lang.en.examples import sentences
nlp = spacy.load('en_core_web_sm')
doc = nlp(sentences[0])
print(doc.text)
for token in doc:
    print(token.text, token.pos_)
I tried different solutions, for example tags = [(X.text, X.tag_) for Y in nlp(ngram).ents for X in Y], but I get errors... Could you please help me use spaCy instead of NLTK?
Using spaCy's Matcher, you can create custom rules you want to match against.
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern)
doc = nlp("Hello, world! Hello world!")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)
You can use a pattern such as:
[{"TAG": "JJ"}, {"TAG": "NN"}] (fine-grained Penn Treebank tags; with coarse part-of-speech tags it would be [{"POS": "ADJ"}, {"POS": "NOUN"}]) to meet your requirements.
I'm using the Stanford Named Entity Recognizer with Python to find the proper names in the novel "One Hundred Years of Solitude". Many of them consist of a first and last name, e.g. "Aureliano Buendía" or "Santa Sofía de la Piedad". These tokens always come out separated, e.g. "Aureliano" and "Buendía", because of the tokenizer I am using.
I would like to have them together as one token, so they can be tagged together as "PERSON" by the Stanford NER.
The code I wrote:
import nltk
from nltk.tag import StanfordNERTagger
from nltk import word_tokenize
from nltk import FreqDist
sentence1 = open('book1.txt').read()
sentence = sentence1.split()
path_to_model = "C:\Python34\stanford-ner-2015-04-20\classifiers\english.muc.7class.distsim.crf.ser"
path_to_jar = "C:\Python34\stanford-ner-2015-04-20\stanford-ner.jar"
st = StanfordNERTagger(model_filename=path_to_model, path_to_jar=path_to_jar)
taggedSentence = st.tag(sentence)
def findtags(tagged_text, tag_prefix):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in taggedSentence
                                   if tag.endswith(tag_prefix))
    return dict((tag, cfd[tag].most_common(1000)) for tag in cfd.conditions())

print(findtags('_', 'PERSON'))
The result looks like this:
{'PERSON': [('Aureliano', 397), ('José', 294), ('Arcadio', 286), ('Buendía', 251), ...
Does anybody have a solution? I would be more than grateful.
import nltk
from nltk.tag import StanfordNERTagger

sentence1 = open('book1.txt').read()
sentence = sentence1.split()

path_to_model = r"C:\Python34\stanford-ner-2015-04-20\classifiers\english.muc.7class.distsim.crf.ser"
path_to_jar = r"C:\Python34\stanford-ner-2015-04-20\stanford-ner.jar"
st = StanfordNERTagger(model_filename=path_to_model, path_to_jar=path_to_jar)

taggedSentence = st.tag(sentence)

data = 'book1.txt'  # the source file name; `data` was undefined in the original snippet
test = []
test_dict = {}
for element in range(len(taggedSentence)):
    a = ''
    if element < len(taggedSentence):
        # join a run of consecutive PERSON tokens into one name
        while element < len(taggedSentence) and taggedSentence[element][1] == 'PERSON':
            a += taggedSentence[element][0] + ' '
            taggedSentence.pop(element)
        if len(a) > 1:
            test.append(a.strip())
test_dict[data.split('.')[0]] = tuple(test)
print(test_dict)