Is there any way to implement skip-grams in the scikit-learn library?
I have manually generated a list of k-skip-n-grams and passed it as the vocabulary to CountVectorizer(). Unfortunately, its prediction performance is very poor: only 63% accuracy. However, I get an accuracy of 77-80% with the default CountVectorizer() using ngram_range=(min, max).
Is there a better way to implement skip-grams in scikit-learn?
Here is the relevant part of my code:
corpus = GetCorpus()  # this one gets the text from a file as a list
vocabulary = list(GetVocabulary(corpus, k, n))  # this one returns k-skip-n-grams

vec = CountVectorizer(
    tokenizer=lambda x: x.split(),
    ngram_range=(2, 2),
    stop_words=stopWords,
    vocabulary=vocabulary)
To vectorize text with skip-grams in scikit-learn, simply passing the skip-gram tokens as the vocabulary to CountVectorizer will not work. You need to modify the way tokens are processed, which can be done with a custom analyzer. Below is an example vectorizer that produces 1-skip-2-grams:
from toolz import itertoolz, compose
from toolz.curried import map as cmap, sliding_window, pluck
from sklearn.feature_extraction.text import CountVectorizer
class SkipGramVectorizer(CountVectorizer):
    def build_analyzer(self):
        preprocess = self.build_preprocessor()
        stop_words = self.get_stop_words()
        tokenize = self.build_tokenizer()
        return lambda doc: self._word_skip_grams(
            compose(tokenize, preprocess, self.decode)(doc),
            stop_words)

    def _word_skip_grams(self, tokens, stop_words=None):
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]
        return compose(cmap(' '.join), pluck([0, 2]), sliding_window(3))(tokens)
For instance, on this Wikipedia example,
text = ['the rain in Spain falls mainly on the plain']
vect = SkipGramVectorizer()
vect.fit(text)
vect.get_feature_names()
this vectorizer would yield the following tokens,
['falls on', 'in falls', 'mainly the', 'on plain',
'rain spain', 'spain mainly', 'the in']
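For completeness, here is a minimal sketch (assuming the class above and toolz are available) of turning the fitted vectorizer into a document-term matrix; transform is inherited unchanged from CountVectorizer:
X = vect.transform(text)
print(X.toarray())  # one row per document, one column per skip-gram feature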
I came up with my own implementation of a skip-gram vectorizer. It is inspired by this post. I also restricted skip-grams so that they do not cross sentence boundaries (using nltk.sent_tokenize), in order to limit the feature space. Here is my code:
import nltk
from itertools import combinations
from toolz import compose
from sklearn.feature_extraction.text import CountVectorizer
class SkipGramVectorizer(CountVectorizer):
    def __init__(self, k=1, **kwds):
        super(SkipGramVectorizer, self).__init__(**kwds)
        self.k = k

    def build_sent_analyzer(self, preprocess, stop_words, tokenize):
        return lambda sent: self._word_skip_grams(
            compose(tokenize, preprocess, self.decode)(sent),
            stop_words)

    def build_analyzer(self):
        preprocess = self.build_preprocessor()
        stop_words = self.get_stop_words()
        tokenize = self.build_tokenizer()
        sent_analyze = self.build_sent_analyzer(preprocess, stop_words, tokenize)
        return lambda doc: self._sent_skip_grams(doc, sent_analyze)

    def _sent_skip_grams(self, doc, sent_analyze):
        skip_grams = []
        for sent in nltk.sent_tokenize(doc):
            skip_grams.extend(sent_analyze(sent))
        return skip_grams

    def _word_skip_grams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of n-grams after stop words filtering"""
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]
        # handle token n-grams
        min_n, max_n = self.ngram_range
        k = self.k
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []
            n_original_tokens = len(original_tokens)
            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append
            space_join = " ".join
            for n in xrange(min_n,
                            min(max_n + 1, n_original_tokens + 1)):
                for i in xrange(n_original_tokens - n + 1):
                    # k-skip-n-grams
                    head = [original_tokens[i]]
                    for skip_tail in combinations(original_tokens[i+1:i+n+k], n-1):
                        tokens_append(space_join(head + list(skip_tail)))
        return tokens


def test(text, ngram_range, k):
    vectorizer = SkipGramVectorizer(ngram_range=ngram_range, k=k)
    vectorizer.fit_transform(text)
    print(vectorizer.get_feature_names())


def main():
    text = ['Insurgents killed in ongoing fighting.']
    # 2-skip-bi-grams
    test(text, (2, 2), 2)
    # 2-skip-tri-grams
    test(text, (3, 3), 2)

###############################################################################################
if __name__ == '__main__':
    main()
This would generate the following feature names:
[u'in fighting', u'in ongoing', u'insurgents in', u'insurgents killed', u'insurgents ongoing', u'killed fighting', u'killed in', u'killed ongoing', u'ongoing fighting']
[u'in ongoing fighting', u'insurgents in fighting', u'insurgents in ongoing', u'insurgents killed fighting', u'insurgents killed in', u'insurgents killed ongoing', u'insurgents ongoing fighting', u'killed in fighting', u'killed in ongoing', u'killed ongoing fighting']
Notice that I basically took the _word_ngrams function from the VectorizerMixin class and replaced the line
tokens_append(space_join(original_tokens[i: i + n]))
with the following:
head = [original_tokens[i]]
for skip_tail in combinations(original_tokens[i+1:i+n+k], n-1):
    tokens_append(space_join(head + list(skip_tail)))
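To make that replacement easier to follow, here is the same k-skip-n-gram enumeration as a small standalone sketch, independent of the vectorizer machinery; for the example sentence above with n=2 and k=2 it reproduces the nine 2-skip-bi-grams listed earlier:
from itertools import combinations

def skip_grams(tokens, n, k):
    grams = []
    for i in range(len(tokens) - n + 1):
        head = [tokens[i]]
        # pick the remaining n-1 tokens from the next n-1+k positions
        for tail in combinations(tokens[i + 1:i + n + k], n - 1):
            grams.append(' '.join(head + list(tail)))
    return grams

print(skip_grams('insurgents killed in ongoing fighting'.split(), n=2, k=2))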
Related
Using cosine similarity, I am trying to find semantic comparisons between words. I have posted the code below for reference. In the code I have added stopwords, which are words I don't want to be considered during the search. I open the text file against which I want the reference words (also given below) to be compared. I also add a limit of 3 characters to the search, meaning any word shorter than three characters is treated as a stop word. While running the code it gives me "Process finished with exit code 0" and I can't get any output from it. I would really appreciate some help. Thank you in advance.
import math
import re
stopwords = set (["is", "a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost",
"alonll", "with", "within", "without", "would", "yet", "you", "your",
"yours", "yourself", "yourselves", "the"])
with open("ref.txt", "r") as f:
lines = f.readlines()
def build_frequency_vector(content: str) -> dict[str, int]:
vector = {}
word_seq = re.split("[ ,;.!?]+", content)
for words in word_seq:
if words not in stopwords and len(words) >= 3:
words = words.lower()
if words in vector:
vector[words] = vector[words] + 1
else:
vector[words] = 1
return vector
refWords = ['spain', 'anchovy',
'france', 'internet', 'china', 'mexico', 'fish', 'industry', 'agriculture', 'fishery', 'tuna', 'transport',
'italy', 'web', 'communication', 'labour', 'fish', 'cod']
refWordsDict = {}
for refWord in refWords:
refWordsDict[refWord] = {}
for line in lines:
line = line.lower()
temp = build_frequency_vector(line)
if refWord not in temp:
continue
for word in temp:
if word not in stopwords and len(word) >= 3 and word != refWord:
refWordsDict[refWord][word] = refWordsDict[refWord].get(word, 0) + temp[word]
def product(v1: dict[str, int], v2: dict[str, int]) -> float:
sp = 0.0
for word in v1:
sp += v1[word] * v2.get(word, 0)
return sp
def cosineSimilarity(s1: str, s2: str) -> float :
d1 = build_frequency_vector(word1)
d2 = build_frequency_vector(word2)
return product(d1, d2) / (math.sqrt(product(d1, d1) * product(d2, d2)))
bests = {}
for word1 in refWords:
bestSimilarity = 0
for word2 in refWords:
if word1 != word2:
similarity: float = cosineSimilarity(refWordsDict[word1], refWordsDict[word2])
if similarity > bestSimilarity:
bestSimilarity = similarity
bests[word1] = (word2, bestSimilarity)
for item in bests:
print(item, "->", bests[item])
I am very new to Python and not able to find a solution.
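For reference, the cosine similarity the code above is aiming for can be checked by hand on two tiny frequency dictionaries (illustrative values only, not taken from ref.txt):
import math

def product(v1, v2):
    return sum(v1[w] * v2.get(w, 0) for w in v1)

v1 = {'fish': 2, 'industry': 1}   # toy co-occurrence counts
v2 = {'fish': 1, 'tuna': 3}
cosine = product(v1, v2) / math.sqrt(product(v1, v1) * product(v2, v2))
print(cosine)  # 2 / (sqrt(5) * sqrt(10)) ≈ 0.283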
For studying purposes, I've tried to implement this "lesson" using Python but "without" scikit-learn or anything similar.
My attempt is the following code:
import pandas, math
training_data = [
['A great game','Sports'],
['The election was over','Not sports'],
['Very clean match','Sports'],
['A clean but forgettable game','Sports'],
['It was a close election','Not sports']
]
text_to_predict = 'A very close game'
data_frame = pandas.DataFrame(training_data, columns=['data','label'])
data_frame = data_frame.applymap(lambda s:s.lower() if type(s) == str else s)
text_to_predict = text_to_predict.lower()
labels = data_frame.label.unique()
word_frequency = data_frame.data.str.split(expand=True).stack().value_counts()
unique_words_set = set()
unique_words = data_frame.data.str.split().apply(unique_words_set.update)
total_unique_words = len(unique_words_set)
word_frequency_per_labels = []
for l in labels:
    word_frequency_per_label = data_frame[data_frame.label == l].data.str.split(expand=True).stack().value_counts()
    for w, f in word_frequency_per_label.iteritems():
        word_frequency_per_labels.append([w,f,l])
word_frequency_per_labels_df = pandas.DataFrame(word_frequency_per_labels, columns=['word','frequency','label'])
laplace_smoothing = 1
results = []
for l in labels:
    p = []
    total_words_in_label = word_frequency_per_labels_df[word_frequency_per_labels_df.label == l].frequency.sum()
    for w in text_to_predict.split():
        x = (word_frequency_per_labels_df.query('word == @w and label == @l').frequency.to_list()[:1] or [0])[0]
        p.append((x + laplace_smoothing) / (total_words_in_label + total_unique_words))
    results.append([l,math.prod(p)])
print(results)
result = pandas.DataFrame(results, columns=['labels','posterior']).sort_values('posterior',ascending = False).labels.iloc[0]
print(result)
In the blog lesson their results are roughly 2.76e-05 for Sports and 0.572e-05 for Not sports.
But my results were:
[['sports', 4.607999999999999e-05], ['not sports', 1.4293831139825827e-05]]
So, what did I do wrong in my python implementation? How can I get the same results?
Thanks in advance
You haven't multiplied by the priors p(Sport) = 3/5 and p(Not Sport) = 2/5. So just updating your answers by these ratios will get you to the correct result. Everything else looks good.
So for example you implement p(a|Sports) x p(very|Sports) x p(close|Sports) x p(game|Sports) in your math.prod(p) calculation but this ignores the term p(Sport). So adding this in (and doing the same for the not sport condition) fixes things.
In code this can be achieved by:
prior = (data_frame.label == l).mean()
results.append([l,prior*math.prod(p)])
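As a quick check of the arithmetic, multiplying the numbers printed above by the priors 3/5 and 2/5 reproduces the expected figures:
print(4.607999999999999e-05 * 3 / 5)   # ~2.7648e-05 for Sports
print(1.4293831139825827e-05 * 2 / 5)  # ~5.7175e-06 for Not sports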
The answer by @nick is correct and should be awarded the bounty.
Here is an alternative implementation (from scratch, not using pandas) that also supports normalization of the probabilities and handles words that are not in the training set:
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict, Set
def tokenize(text: str):
    return [word.lower() for word in text.split()]

def normalize(result: Dict[str, float]):
    total = sum([v for v in result.values()])
    for k in result.keys():
        result[k] /= total

@dataclass
class Model:
    labels: Set[str] = field(default_factory=set)
    words: Set[str] = field(default_factory=set)
    prob_labels: Dict[str, float] = field(default_factory=lambda: defaultdict(float))  # P(label)
    prob_words: Dict[str, Dict[str, float]] = field(default_factory=lambda: defaultdict(lambda: defaultdict(float)))  # P(word | label) as prob_words[label][word]

    def predict(self, text: str, norm=True) -> Dict[str, float]:  # P(label | text) as model.predict(text)[label]
        result = {label: self.prob_labels[label] for label in self.labels}
        for word in tokenize(text):
            for label in self.labels:
                if word in self.words:
                    result[label] *= self.prob_words[label][word]
        if norm:
            normalize(result)
        return result

    def train(self, data):
        prob_words_denominator = defaultdict(int)
        for row in data:
            text = row[0]
            label = row[1].lower()
            self.labels.add(label)
            self.prob_labels[label] += 1.0
            for word in tokenize(text):
                self.words.add(word)
                self.prob_words[label][word] += 1.0
                prob_words_denominator[label] += 1.0
        for label in self.labels:
            self.prob_labels[label] /= len(data)
            for word in self.words:
                self.prob_words[label][word] = (self.prob_words[label][word] + 1.0) / (prob_words_denominator[label] + len(self.words))
training_data = [
['A great game','Sports'],
['The election was over','Not sports'],
['Very clean match','Sports'],
['A clean but forgettable game','Sports'],
['It was a close election','Not sports']
]
text_to_predict = 'A very close game'
model = Model()
model.train(training_data)
print(model.predict(text_to_predict, norm=False))
print(model.predict(text_to_predict))
print(model.predict("none of these words is in training data"))
output:
{'sports': 2.7647999999999997e-05, 'not sports': 5.7175324559303314e-06}
{'sports': 0.8286395560004286, 'not sports': 0.1713604439995714}
{'sports': 0.6, 'not sports': 0.4}
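The normalized probabilities are simply the raw scores divided by their sum, which is easy to verify with the numbers printed above:
raw = {'sports': 2.7647999999999997e-05, 'not sports': 5.7175324559303314e-06}
total = sum(raw.values())
print({k: v / total for k, v in raw.items()})  # ~{'sports': 0.8286, 'not sports': 0.1714}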
I'm trying to write a Python code that does Aspect Based Sentiment Analysis of product reviews using Dependency Parser. I created an example review:
"The Sound Quality is great but the battery life is bad."
The output is: [['soundquality', ['great']], ['batterylife', ['bad']]]
I can properly get the aspect and its adjective with this sentence, but when I change the text to:
"The Sound Quality is not great but the battery life is not bad."
The output still stays the same. How can I add a negation handling to my code? And are there ways to improve what I currently have?
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import stanfordnlp
stanfordnlp.download('en')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
txt = "The Sound Quality is not great but the battery life is not bad."
txt = txt.lower()
sentList = nltk.sent_tokenize(txt)
taggedList = []
for line in sentList:
    txt_list = nltk.word_tokenize(line)  # tokenize sentence
    taggedList = taggedList + nltk.pos_tag(txt_list)  # perform POS-Tagging
print(taggedList)
newwordList = []
flag = 0
for i in range(0,len(taggedList)-1):
    if(taggedList[i][1]=='NN' and taggedList[i+1][1]=='NN'):
        newwordList.append(taggedList[i][0]+taggedList[i+1][0])
        flag=1
    else:
        if(flag == 1):
            flag=0
            continue
        newwordList.append(taggedList[i][0])
        if(i==len(taggedList)-2):
            newwordList.append(taggedList[i+1][0])
finaltxt = ' '.join(word for word in newwordList)
print(finaltxt)
stop_words = set(stopwords.words('english'))
new_txt_list = nltk.word_tokenize(finaltxt)
wordsList = [w for w in new_txt_list if not w in stop_words]
taggedList = nltk.pos_tag(wordsList)
nlp = stanfordnlp.Pipeline()
doc = nlp(finaltxt)
dep_node = []
for dep_edge in doc.sentences[0].dependencies:
    dep_node.append([dep_edge[2].text, dep_edge[0].index, dep_edge[1]])
for i in range(0, len(dep_node)):
    if(int(dep_node[i][1]) != 0):
        dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]
print(dep_node)
featureList = []
categories = []
totalfeatureList = []
for i in taggedList:
    if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
        featureList.append(list(i))
        totalfeatureList.append(list(i))  # stores all the features for every sentence
        categories.append(i[0])
print(featureList)
print(categories)
fcluster = []
for i in featureList:
    filist = []
    for j in dep_node:
        if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
            if(j[0]==i[0]):
                filist.append(j[1])
            else:
                filist.append(j[0])
    fcluster.append([i[0], filist])
print(fcluster)
finalcluster = []
dic = {}
for i in featureList:
    dic[i[0]] = i[1]
for i in fcluster:
    if(dic[i[0]]=='NN'):
        finalcluster.append(i)
print(finalcluster)
You may wish to try spaCy. The following pattern will catch:
a noun phrase
followed by is or are
optionally followed by not
followed by an adjective
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_sm')
output = []
doc = nlp('The product is very good')
matcher = Matcher(nlp.vocab)
matcher.add("mood",None,[{"LOWER":{"IN":["is","are"]}},{"LOWER":{"IN":["no","not"]},"OP":"?"},{"LOWER":"very","OP":"?"},{"POS":"ADJ"}])
for nc in doc.noun_chunks:
    d = doc[nc.root.right_edge.i+1:nc.root.right_edge.i+1+3]
    matches = matcher(d)
    if matches:
        _, start, end = matches[0]
        output.append((nc.text, d[start+1:end].text))
print(output)
[('The product', 'very good')]
Alternatively, you may broaden the matching pattern with information from the dependency parser, adding a definition of an adjectival phrase:
output = []
matcher = Matcher(nlp.vocab, validate=True)
matcher.add("mood",None,[{"LOWER":{"IN":["is","are"]}},{"LOWER":{"IN":["no","not"]},"OP":"?"},{"DEP":"advmod","OP":"?"},{"DEP":"acomp"}])
for nc in doc.noun_chunks:
    d = doc[nc.root.right_edge.i+1:nc.root.right_edge.i+1+3]
    matches = matcher(d)
    if matches:
        _, start, end = matches[0]
        output.append((nc.text, d[start+1:end].text))
print(output)
[('The product', 'very good')]
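Regarding the negation handling asked about in the question: because the pattern marks the "no"/"not" token as optional, a negated sentence should be captured too. A quick sketch (the exact output depends on the spaCy model):
output = []
doc = nlp('The battery life is not good')
for nc in doc.noun_chunks:
    d = doc[nc.root.right_edge.i+1:nc.root.right_edge.i+1+3]
    matches = matcher(d)
    if matches:
        _, start, end = matches[0]
        output.append((nc.text, d[start+1:end].text))
print(output)  # expected to look like [('The battery life', 'not good')]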
I'm using the Stanford Named Entity Recognizer with Python to find the proper names in the novel "One Hundred Years of Solitude". Many of them are composed of a first and a last name, e.g. "Aureliano Buendía" or "Santa Sofía de la Piedad". These tokens are always separated, e.g. "Aureliano", "Buendía", because of the tokenizer I am using.
I would like to have them together as one token, so they can be tagged together as "PERSON" with Stanford NER.
The code I wrote:
import nltk
from nltk.tag import StanfordNERTagger
from nltk import word_tokenize
from nltk import FreqDist
sentence1 = open('book1.txt').read()
sentence = sentence1.split()
path_to_model = "C:\Python34\stanford-ner-2015-04-20\classifiers\english.muc.7class.distsim.crf.ser"
path_to_jar = "C:\Python34\stanford-ner-2015-04-20\stanford-ner.jar"
st = StanfordNERTagger(model_filename=path_to_model, path_to_jar=path_to_jar)
taggedSentence = st.tag(sentence)
def findtags(tagged_text, tag_prefix):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in taggedSentence
                                   if tag.endswith(tag_prefix))
    return dict((tag, cfd[tag].most_common(1000)) for tag in cfd.conditions())
print (findtags('_','PERSON'))
The result looks like this:
{'PERSON': [('Aureliano', 397), ('José', 294), ('Arcadio', 286), ('Buendía', 251), ...
Does anybody have a solution? I would be more than grateful.
import nltk
from nltk.tag import StanfordNERTagger
sentence1 = open('book1.txt').read()
sentence = sentence1.split()
path_to_model = "C:\Python34\stanford-ner-2015-04-20\classifiers\english.muc.7class.distsim.crf.ser"
path_to_jar = "C:\Python34\stanford-ner-2015-04-20\stanford-ner.jar"
st = StanfordNERTagger(model_filename=path_to_model, path_to_jar=path_to_jar)
taggedSentence = st.tag(sentence)
test = []
test_dict = {}
for element in range(len(taggedSentence)):
    a = ''
    if element < len(taggedSentence):
        while taggedSentence[element][1] == 'PERSON':
            a += taggedSentence[element][0] + ' '
            taggedSentence.pop(element)
        if len(a) > 1:
            test.append(a.strip())
test_dict[data.split('.')[0]] = tuple(test)
print(test_dict)
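An alternative sketch of the same grouping idea, assuming taggedSentence is a list of (word, tag) pairs as above: runs of consecutive PERSON-tagged tokens can be merged with itertools.groupby.
from itertools import groupby

# merge runs of consecutive PERSON-tagged tokens into single names
person_names = [
    ' '.join(word for word, tag in group)
    for is_person, group in groupby(taggedSentence, key=lambda wt: wt[1] == 'PERSON')
    if is_person
]
print(person_names)  # e.g. ['Aureliano Buendía', ...]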
I am working with Python to take a Facebook status, tell what the status is about, and determine its sentiment. Essentially I need to tell what the sentiment refers to. I have already successfully coded a sentiment analyzer, so the trouble is getting a POS tagger to compute what the sentiment is referring to.
If you have any suggestions from experience, I would be grateful. I've read some papers on computing aboutness from subject-object, NP-PP, and NP-NP relations, but haven't seen any good examples and haven't found many papers.
Lastly, if you have worked with POS taggers, what would be my best bet in Python as a non-computer scientist? I'm a physicist, so I can hack code together, but I don't want to reinvent the wheel if there is a package that has everything I'm going to need.
Thank you very much in advance!
This is what I found to work; I am going to edit it and use it with the NLTK POS tagger to see what results I can get.
import csv
import nltk
from nltk.corpus import brown
# http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
# This is our fast Part of Speech tagger
#############################################################################
brown_train = brown.tagged_sents(categories='news')
regexp_tagger = nltk.RegexpTagger(
[(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
(r'(-|:|;)$', ':'),
(r'\'*$', 'MD'),
(r'(The|the|A|a|An|an)$', 'AT'),
(r'.*able$', 'JJ'),
(r'^[A-Z].*$', 'NNP'),
(r'.*ness$', 'NN'),
(r'.*ly$', 'RB'),
(r'.*s$', 'NNS'),
(r'.*ing$', 'VBG'),
(r'.*ed$', 'VBD'),
(r'.*', 'NN')
])
unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)
#############################################################################
# This is our semi-CFG; Extend it according to your own needs
#############################################################################
cfg = {}
cfg["NNP+NNP"] = "NNP"
cfg["NN+NN"] = "NNI"
cfg["NNI+NN"] = "NNI"
cfg["JJ+JJ"] = "JJ"
cfg["JJ+NN"] = "NNI"
#############################################################################
class NPExtractor(object):
    def __init__(self, sentence):
        self.sentence = sentence

    # Split the sentence into single words/tokens
    def tokenize_sentence(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        return tokens

    # Normalize brown corpus' tags ("NN", "NN-PL", "NNS" > "NN")
    def normalize_tags(self, tagged):
        n_tagged = []
        for t in tagged:
            if t[1] == "NP-TL" or t[1] == "NP":
                n_tagged.append((t[0], "NNP"))
                continue
            if t[1].endswith("-TL"):
                n_tagged.append((t[0], t[1][:-3]))
                continue
            if t[1].endswith("S"):
                n_tagged.append((t[0], t[1][:-1]))
                continue
            n_tagged.append((t[0], t[1]))
        return n_tagged

    # Extract the main topics from the sentence
    def extract(self):
        tokens = self.tokenize_sentence(self.sentence)
        tags = self.normalize_tags(bigram_tagger.tag(tokens))
        merge = True
        while merge:
            merge = False
            for x in range(0, len(tags) - 1):
                t1 = tags[x]
                t2 = tags[x + 1]
                key = "%s+%s" % (t1[1], t2[1])
                value = cfg.get(key, '')
                if value:
                    merge = True
                    tags.pop(x)
                    tags.pop(x)
                    match = "%s %s" % (t1[0], t2[0])
                    pos = value
                    tags.insert(x, (match, pos))
                    break
        matches = []
        for t in tags:
            if t[1] == "NNP" or t[1] == "NNI":
                #if t[1] == "NNP" or t[1] == "NNI" or t[1] == "NN":
                matches.append(t[0])
        return matches
# Main method, just run "python np_extractor.py"
Summary="""
Verizon has not honored this appointment or notified me of the delay in an appropriate manner. It is now 1:20 PM and the only way I found out of a change is that I called their chat line and got a message saying my appointment is for 2 PM. My cell phone message says the original time as stated here.
"""
def main(Topic):
    facebookData = []
    readdata = csv.reader(open('fb_data1.csv', 'r'))
    for row in readdata:
        facebookData.append(row)
    relevant_sentence = []
    for status in facebookData:
        summary = status.split('.')
        for sentence in summary:
            np_extractor = NPExtractor(sentence)
            result = np_extractor.extract()
            if Topic in result:
                relevant_sentence.append(sentence)
                print sentence
                print "This sentence is about: %s" % ", ".join(result)
    return relevant_sentence

if __name__ == '__main__':
    result = main('Verizon')
Note that it will save only sentences that are relevant to the topic you define. So if I am analyzing statuses about cheese, I could use that as the topic, extract all of the sentences on cheese, and then run a sentiment analysis on those. If you have comments or suggestions on improving this, please let me know!
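For a quick standalone check of the extractor itself (a sketch; the exact output depends on the taggers built above), you could run it on a single sentence:
extractor = NPExtractor("Verizon has not honored this appointment or notified me of the delay.")
print(extractor.extract())  # expected to include 'Verizon' among the extracted topics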