I am trying to extract the base word (lemma) for an entire text, but the lemmatizer behaves differently for the same word depending on where it appears. Below is the code for reference:
from nltk.stem import PorterStemmer
import spacy
import pandas as pd
ps = PorterStemmer()
# spacy.cli.download("en")
nlp = spacy.load("en_core_web_sm")
words = [ "meeting","eating", "adjustable", "meeting", "eats", "eating", "eat", "ate", "rafting", "better", "good", "best", 'coming', "ability", "steal", "stolen", 'children']
word_joined = " ".join(words)
# print(word_joined)
doc = nlp(word_joined)
words_lemma = []
words_stemm = []
words_stemm = [ps.stem(w) for w in words]
words_lemma = [w.lemma_ for w in doc]
pd.DataFrame(list(zip(words, words_stemm, words_lemma)))
Is this related to the position of the word in the sentence, from a linguistic perspective?
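To illustrate what is happening, here is a minimal sketch (assuming the same en_core_web_sm model as above) that prints the POS tag spaCy infers for each token next to its lemma; the lemma spaCy picks depends on that context-derived tag, which is why the same word can come out differently in different positions:
import spacy

nlp = spacy.load("en_core_web_sm")
# inspect the POS tag spaCy assigns to each token; the lemma depends on it
for token in nlp("meeting eating adjustable meeting eats"):
    print(token.text, token.pos_, token.lemma_)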
I wanted to use the WordNet lemmatizer in Python, and I have learned that the default POS tag is NOUN and that it does not output the correct lemma for a verb unless the POS tag is explicitly specified as VERB.
My question is: what is the best way to perform the above lemmatization accurately?
I did the POS tagging using nltk.pos_tag, and I am lost on how to map the Treebank POS tags to WordNet-compatible POS tags. Please help:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

lmtzr = WordNetLemmatizer()
tagged = nltk.pos_tag(tokens)  # tokens: a list of word tokens
I get output tags such as NN, JJ, VB, RB. How do I change these to WordNet-compatible tags?
Also, do I have to train nltk.pos_tag() on a tagged corpus, or can I use it directly on my data?
First of all, you can use nltk.pos_tag() directly without training it.
The function loads a pretrained tagger from a file. You can see the file name with nltk.tag._POS_TAGGER (this attribute exists only in older NLTK versions):
>>> nltk.tag._POS_TAGGER
'taggers/maxent_treebank_pos_tagger/english.pickle'
As it was trained on the Treebank corpus, it also uses the Treebank tag set.
The following function maps Treebank tags to WordNet part-of-speech names:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''
You can then use the return value with the lemmatizer:
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

>>> lemmatizer.lemmatize('going', wordnet.VERB)
'go'
Check the return value before passing it to the lemmatizer, because an empty string raises a KeyError.
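For example, a minimal guard using the get_wordnet_pos function above might look like this:
word, treebank_tag = 'going', 'VBG'
wn_tag = get_wordnet_pos(treebank_tag)
if wn_tag:
    print(lemmatizer.lemmatize(word, wn_tag))  # 'go'
else:
    print(lemmatizer.lemmatize(word))  # fall back to the default noun POS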
Steps to convert: Document -> Sentences -> Tokens -> POS -> Lemmas
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# example text
text = 'What can I say about this place. The staff of these restaurants is nice and the eggplant is not bad'
class Splitter(object):
    """
    Split the document into sentences and tokenize each sentence.
    """
    def __init__(self):
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        out: [['What', 'can', 'I', 'say', 'about', 'this', 'place', '.'], ...]
        """
        # split into single sentences
        sentences = self.splitter.tokenize(text)
        # tokenize each sentence
        tokens = [self.tokenizer.tokenize(sent) for sent in sentences]
        return tokens
class LemmatizationWithPOSTagger(object):
    def __init__(self):
        pass

    def get_wordnet_pos(self, treebank_tag):
        """
        Return the WordNet POS (a, n, r, v) corresponding to a Treebank tag.
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            # the default POS for lemmatization is noun
            return wordnet.NOUN

    def pos_tag(self, tokens):
        # find the POS tag for each token: [('What', 'WP'), ('can', 'MD'), ('I', 'PRP'), ...]
        pos_tokens = [nltk.pos_tag(token) for token in tokens]
        # lemmatize using the POS tag and convert into a feature set of
        # (original word, lemmatized word, [POS tag]) tuples, e.g. [('What', 'What', ['WP']), ('can', 'can', ['MD']), ...]
        pos_tokens = [[(word, lemmatizer.lemmatize(word, self.get_wordnet_pos(pos_tag)), [pos_tag])
                       for (word, pos_tag) in pos] for pos in pos_tokens]
        return pos_tokens
lemmatizer = WordNetLemmatizer()
splitter = Splitter()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()
# step 1: split the document into sentences, then tokenize each sentence
tokens = splitter.split(text)
# step 2: lemmatization using the POS tagger
lemma_pos_token = lemmatization_using_pos_tagger.pos_tag(tokens)
print(lemma_pos_token)
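If you only need the lemmas, you can flatten the nested result, for example:
# lemma_pos_token is a list of sentences, each a list of (word, lemma, [pos]) tuples
lemmas = [lemma for sentence in lemma_pos_token for (word, lemma, pos) in sentence]
print(lemmas)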
As shown in the source code of nltk.corpus.reader.wordnet (http://www.nltk.org/_modules/nltk/corpus/reader/wordnet.html):
#{ Part-of-speech constants
ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
#}
POS_LIST = [NOUN, VERB, ADJ, ADV]
You can create a map using Python's defaultdict and take advantage of the fact that the lemmatizer's default tag is noun.
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
text = "Another way of achieving this task"
tokens = word_tokenize(text)
lmtzr = WordNetLemmatizer()
for token, tag in pos_tag(tokens):
    lemma = lmtzr.lemmatize(token, tag_map[tag[0]])
    print(token, "=>", lemma)
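The exact output depends on the tagger version, but it should look roughly like the following; note that 'achieving' is reduced correctly because its tag starts with 'V', while the other words fall back to the noun default:
Another => Another
way => way
of => of
achieving => achieve
this => this
task => task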
@Suzana_K's approach was working, but there are some cases that result in a KeyError, as @Clock Slave mentioned.
Convert Treebank tags to WordNet tags:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None  # makes the if statement below simpler
Now, we pass a pos argument to the lemmatize function only if we have a WordNet tag:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
tagged = nltk.pos_tag(tokens)

for word, tag in tagged:
    wntag = get_wordnet_pos(tag)
    if wntag is None:
        # do not supply a tag, so the lemmatizer uses its noun default
        lemma = lemmatizer.lemmatize(word)
    else:
        lemma = lemmatizer.lemmatize(word, pos=wntag)
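To actually use the result, you can collect the lemmas into a list, for example (a small extension of the loop above):
lemmas = []
for word, tag in tagged:
    wntag = get_wordnet_pos(tag)
    if wntag is None:
        lemmas.append(lemmatizer.lemmatize(word))
    else:
        lemmas.append(lemmatizer.lemmatize(word, pos=wntag))
print(lemmas)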
You can do it as follows:
import nltk
from nltk.corpus import wordnet

wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV
}

def pos_tag_wordnet(text):
    """
    Create POS tags in WordNet format.
    """
    pos_tagged_text = nltk.pos_tag(text)
    # map the POS tagger output to WordNet tags, defaulting to noun
    pos_tagged_text = [
        (word, wordnet_map.get(pos_tag[0])) if pos_tag[0] in wordnet_map
        else (word, wordnet.NOUN)
        for (word, pos_tag) in pos_tagged_text
    ]
    return pos_tagged_text
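A short usage sketch (the sentence here is just an illustration): tokenize, map the tags, and feed the (word, tag) pairs to the lemmatizer:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
tokens = word_tokenize("The striped bats are hanging on their feet")
tagged = pos_tag_wordnet(tokens)  # a list of (word, wordnet_tag) pairs
lemmas = [lemmatizer.lemmatize(word, tag) for word, tag in tagged]
print(lemmas)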
You can do this in one line:
wnpos = lambda e: ('a' if e[0].lower() == 'j' else e[0].lower()) if e[0].lower() in ['j', 'n', 'r', 'v'] else 'n'
Then use wnpos(nltk_pos) to get the POS to give to .lemmatize(). In your case, lmtzr.lemmatize(word=tagged[0][0], pos=wnpos(tagged[0][1])).
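Putting it together, a minimal end-to-end sketch (the token list is just an example):
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

wnpos = lambda e: ('a' if e[0].lower() == 'j' else e[0].lower()) if e[0].lower() in ['j', 'n', 'r', 'v'] else 'n'

lmtzr = WordNetLemmatizer()
tagged = nltk.pos_tag(['running', 'quickly'])
print([lmtzr.lemmatize(word, pos=wnpos(tag)) for word, tag in tagged])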
After searching the internet, I found this solution: go from a sentence to a "bag of words" by splitting, POS tagging, lemmatizing, and cleaning (removing punctuation and stop words).
Here's my code:
import nltk
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

punctuation = u",.?!()-_\"\'\\\n\r\t;:+*<>##§^$%&|/"
stop_words_eng = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tag_dict = {"J": wn.ADJ,
            "N": wn.NOUN,
            "V": wn.VERB,
            "R": wn.ADV}

def extract_wnpostag_from_postag(tag):
    # take the first letter of the tag;
    # the second parameter is a default value in case the key is missing from the dictionary
    return tag_dict.get(tag[0].upper(), None)

def lemmatize_tupla_word_postag(tupla):
    """
    Given a tuple of the form (wordString, posTagString), like ('guitar', 'NN'), return the lemmatized word.
    """
    tag = extract_wnpostag_from_postag(tupla[1])
    return lemmatizer.lemmatize(tupla[0], tag) if tag is not None else tupla[0]

def bag_of_words(sentence, stop_words=None):
    if stop_words is None:
        stop_words = stop_words_eng
    original_words = word_tokenize(sentence)
    tagged_words = nltk.pos_tag(original_words)  # returns a list of tuples (word, tagString), like ('And', 'CC')
    original_words = None
    lemmatized_words = [lemmatize_tupla_word_postag(ow) for ow in tagged_words]
    tagged_words = None
    cleaned_words = [w for w in lemmatized_words if (w not in punctuation) and (w not in stop_words)]
    lemmatized_words = None
    return cleaned_words
sentence = "Two electric guitar rocks players, and also a better bass player, are standing off to two sides reading corpora while walking"
print(sentence, "\n\n bag of words:\n", bag_of_words(sentence) )
I'm working on a project in which I extract tweets from Twitter and run sentiment analysis on specific keywords to draw conclusions. Unfortunately, I have come to a point where I am stumped. I have the following sentiment analysis code:
When I use blob = TextBlob(tweet[text]), I get the following error:
Traceback (most recent call last):
  File "C:/Users/Michael/python/Sentiment2.py", line 65, in
    blob = TextBlob(tweet[text])
NameError: name 'text' is not defined
import json
import re
import operator
from textblob import TextBlob
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import os, sys, codecs
import csv
import sys
from nltk import bigrams
emoticons_str = r"""
(?:
[:=;] # Eyes
[oO\-]? # Nose (optional)
[D\)\]\(\]/\\OpP] # Mouth
)"""
regex_str = [
emoticons_str,
r'<[^>]+>', # HTML tags
r'(?:#[\w_]+)', # #-mentions
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
r'http[s]?://(?:[a-z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
r'(?:[\w_]+)', # other words
r'(?:\S)' # anything else
]
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']
fname = 'python.json'
with open(fname, 'r') as f:
    lis = []
    neg = 0.0
    n = 0.0
    net = 0.0
    pos = 0.0
    p = 0.0
    count_all = Counter()
    cout = 0
    for line in f:
        try:
            tweet = json.loads(line)
        except:
            continue
        # Create a list with all the terms
        blob = TextBlob(tweet[text])
        cout += 1
        lis.append(blob.sentiment.polarity)
        #print blob.sentiment.subjectivity
        #print (os.listdir(tweet["text"]))
        if blob.sentiment.polarity < 0:
            sentiment = "negative"
            neg += blob.sentiment.polarity
            n += 1
        elif blob.sentiment.polarity == 0:
            sentiment = "neutral"
            net += 1
        else:
            sentiment = "positive"
            pos += blob.sentiment.polarity
            p += 1
    # output sentiment
    print("Total tweets"), len(lis)
    print("Positive"), float(p/cout)*100, "%"
    print("Negative"), float(n/cout)*100, "%"
    print("Neutral"), float(net/len(lis))*100, "%"
    #print lis
    # determine if sentiment is positive, negative, or neutral
    # output sentiment
    #print sentiment
Change this
# Create a list with all the terms
blob = TextBlob(tweet[text])
to
# Create a list with all the terms
blob = TextBlob(tweet['text'])
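As an aside (an assumption about the data rather than part of the original fix): some lines in a Twitter stream, such as delete notices, have no 'text' field at all, so a defensive variant inside the loop could be:
# skip JSON lines that have no 'text' field (e.g. delete notices)
text_value = tweet.get('text')
if text_value is None:
    continue
blob = TextBlob(text_value)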
I'm trying to implement the Naive Bayes algorithm for sentiment analysis of newspaper headlines. I'm using TextBlob for this purpose, and I'm finding it difficult to remove stop words such as 'a', 'the', 'in', etc. Below is a snippet of my code in Python:
from textblob.classifiers import NaiveBayesClassifier
from textblob import TextBlob
test = [
    ("11 bonded labourers saved from shoe firm", "pos"),
    ("Scientists greet Abdul Kalam after the successful launch of Agni on May 22, 1989", "pos"),
    ("Heavy Winter Snow Storm Lashes Out In Northeast US", "neg"),
    ("Apparent Strike On Gaza Tunnels Kills 2 Palestinians", "neg")
]

with open('input.json', 'r') as fp:
    cl = NaiveBayesClassifier(fp, format="json")

print(cl.classify("Oil ends year with biggest gain since 2009"))  # "pos"
print(cl.classify("25 dead in Baghdad blasts"))  # "neg"
You can first load the JSON and then create a list of (text, label) tuples with the replacement applied.
Demonstration:
Suppose the input.json file is something like this:
[
    {"text": "I love this sandwich.", "label": "pos"},
    {"text": "This is an amazing place!", "label": "pos"},
    {"text": "I do not like this restaurant", "label": "neg"}
]
Then you can use:
from textblob.classifiers import NaiveBayesClassifier
import json

train_list = []
with open('input.json', 'r') as fp:
    json_data = json.load(fp)
    for line in json_data:
        text = line['text']
        text = text.replace(" is ", " ")  # you can remove multiple stop words
        label = line['label']
        train_list.append((text, label))

cl = NaiveBayesClassifier(train_list)

from pprint import pprint
pprint(train_list)
output:
[(u'I love this sandwich.', u'pos'),
(u'This an amazing place!', u'pos'),
(u'I do not like this restaurant', u'neg')]
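If you want to strip all English stop words instead of hard-coding individual replacements, a hedged variant (using NLTK's stop word list, which is my assumption and not part of the original answer) could be:
import json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob.classifiers import NaiveBayesClassifier

stop_words = set(stopwords.words('english'))

def strip_stop_words(text):
    # keep only the tokens that are not in NLTK's English stop word list
    return " ".join(w for w in word_tokenize(text) if w.lower() not in stop_words)

train_list = []
with open('input.json', 'r') as fp:
    for line in json.load(fp):
        train_list.append((strip_stop_words(line['text']), line['label']))

cl = NaiveBayesClassifier(train_list)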
The following code removes stop words from the text.
Place all the stop words in a stopwords file, then read the words and store them in the stop_words variable.
# This function reads a file and returns its contents as an array
def readFileandReturnAnArray(fileName, readMode, isLower):
    myArray = []
    with open(fileName, readMode) as readHandle:
        for line in readHandle.readlines():
            lineRead = line
            if isLower:
                lineRead = lineRead.lower()
            myArray.append(lineRead.strip().lstrip())
    return myArray

stop_words = readFileandReturnAnArray("stopwords", "r", True)

def removeItemsInTweetContainedInAList(tweet_text, stop_words, splitBy):
    wordsArray = tweet_text.split(splitBy)
    StopWords = list(set(wordsArray).intersection(set(stop_words)))
    return_str = ""
    for word in wordsArray:
        if word not in StopWords:
            return_str += word + splitBy
    return return_str.strip().lstrip()

# Call the above method
tweet_text = removeItemsInTweetContainedInAList(tweet_text.strip().lstrip(), stop_words, " ")
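For a quick check, you can also pass a literal sentence and an inline stop word list instead of the stopwords file:
print(removeItemsInTweetContainedInAList("this is a test tweet", ["is", "a"], " "))
# output: this test tweet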
I have written some code to find the term frequency and document frequency of words contained in files stored at the location path. Each file goes through the function cleanDoc() to extract the words from the text files, and I want the term frequencies tabulated so that all words from all documents are considered in the count. Can anybody tell me how I should implement it? I am only using NLTK.
import collections
import os.path
import glob
import nltk

wdict = set()
path = "C://Python27//Corpus Files//*.*"

# this function cleans up a doc (removes stopwords etc.)
def cleanDoc(doc):
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 3 and token.isalpha()]
    final = [stemmer.stem(word) for word in clean]
    return final

for text in glob.glob(path):
    f = open(text)
    data = f.read()
    words = cleanDoc(data)
    wdict.update(words)
You can use the FreqDist object from nltk.probability to count these words. Later, you can navigate it using a dict-like key-value interface and methods (like freq.items() or freq['word']), and you can even plot the results using matplotlib.
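As a quick standalone illustration of that interface (separate from the full solution below):
from nltk.probability import FreqDist

freq = FreqDist(['cat', 'dog', 'cat', 'bird', 'cat'])
print(freq['cat'])          # 3
print(freq.most_common(2))  # [('cat', 3), ('dog', 1)]
# freq.plot(10)             # requires matplotlib; plots the most frequent terms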
import collections
import os.path
import glob
import nltk
from nltk.probability import FreqDist

term_frequency = {}
path = "C://Python27//Corpus Files//*.*"

# this function cleans up a doc (removes stopwords etc.)
def cleanDoc(doc):
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 3 and token.isalpha()]
    final = [stemmer.stem(word) for word in clean]
    return final

for text in glob.glob(path):
    f = open(text)
    data = f.read()
    words = cleanDoc(data)
    numbers_of_words = len(words)
    freq = FreqDist(words)
    # term_frequency is a dict whose structure is like:
    # {
    #     'path_to_file':
    #         {'term': 13.4, 'another_term': 15},
    #     'another_file':
    #         {'term2': 12, 'foo': 15}
    # }
    for term in freq.keys():
        if isinstance(term_frequency.get(text), dict):
            term_frequency[text][term] = freq[term] / numbers_of_words
        else:
            term_frequency[text] = {term: freq[term] / numbers_of_words}
Reference: https://nltk.googlecode.com/svn/trunk/doc/api/nltk.probability.FreqDist-class.html