import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
def word_feats(words):
    """Map an iterable of words to a bag-of-words feature dict.

    Each distinct word in *words* becomes a key whose value is ``True``;
    repeated words collapse into a single key.
    """
    return {word: True for word in words}
# Build one (features, label) pair per review file in each category.
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# Floor division keeps the cutoffs integral on Python 2 and Python 3 alike;
# a float cutoff cannot be used as a slice index.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4

# First three quarters of each class train the model, the rest evaluates it.
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats)
print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
But how can I classify a random word that might be in the corpus?
Doesn't work. Does it need some kind of object?
EDIT: Thanks to @unutbu's feedback, some digging here, and reading the comments on the original post, the following yields 'pos' or 'neg' for this code (this one's a 'pos'):
and this yields the evaluation of the word for 'pos' or 'neg'
The classifier.classify method does not operate on individual words per se, it classifies based on a dict of features. In this example, word_feats maps a sentence (a list of words) to a dict of features.
Here is another example (from the NLTK book) which uses the NaiveBayesClassifier. By comparing what is similar and different between that example, and the one you posted, you may get a better perspective of how it can be used.
I have a text which has many sentences. How can I use nltk.ngrams to process it?
This is my code:
# Tokenize the raw text into one flat token sequence.
# NOTE: the whole text is treated as a single sequence, so bigrams also span
# sentence boundaries.
sequence = nltk.tokenize.word_tokenize(raw)
# Use the package-qualified name: a bare `ngrams` is never imported here.
bigram = nltk.ngrams(sequence, 2)
# Bigram frequencies, the probability distribution built from them, and the
# total number of bigram occurrences.
freq_dist = nltk.FreqDist(bigram)
prob_dist = nltk.MLEProbDist(freq_dist)
number_of_bigrams = freq_dist.N()
However, the above code treats all sentences as one sequence. But the sentences are separate, and I guess the last word of one sentence is unrelated to the first word of the next. How can I create bigrams for such a text? I also need prob_dist and number_of_bigrams, which are based on the `freq_dist`.
There are similar questions like this What are ngram counts and how to implement using nltk? but they are mostly about a sequence of words.
You can use the new nltk.lm module. Here's an example, first get some data and tokenize it:
import os
import requests
import io #codecs
from nltk import word_tokenize, sent_tokenize
# Text version of https://kilgarriff.co.uk/Publications/2005-K-lineer.pdf
if os.path.isfile('language-never-random.txt'):
    # Reuse the local copy when it has already been downloaded.
    with io.open('language-never-random.txt', encoding='utf8') as fin:
        text = fin.read()
else:
    url = "https://gist.githubusercontent.com/alvations/53b01e4076573fea47c6057120bb017a/raw/b01ff96a5f76848450e648f35da6497ca9454e4a/language-never-random.txt"
    response = requests.get(url)
    # Fail loudly on an HTTP error instead of caching an error page as the corpus.
    response.raise_for_status()
    text = response.content.decode('utf8')
    # Cache the download so later runs take the branch above.
    with io.open('language-never-random.txt', 'w', encoding='utf8') as fout:
        fout.write(text)

# Tokenize the text: one list of lower-cased tokens per sentence.
tokenized_text = [list(map(str.lower, word_tokenize(sent)))
                  for sent in sent_tokenize(text)]
Then the language modelling:
# Preprocess the tokenized text for 3-gram language modelling.
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
# Highest n-gram order, shared by the preprocessing pipeline and the model.
n = 3
# The pipeline returns two values; both are handed to model.fit() below in the same order.
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)
model = MLE(n) # Let's train a 3-gram maximum likelihood estimation model.
model.fit(train_data, padded_sents)
To get the counts:
model.counts['language'] # i.e. Count('language'), the count of the single word
model.counts[['language']]['is'] # i.e. Count('is'|'language'); the context is given as a list
model.counts[['language', 'is']]['never'] # i.e. Count('never'|'language is')
To get the probabilities:
model.score('is', 'language'.split()) # P('is'|'language'); the context is passed as a list of tokens
model.score('never', 'language is'.split()) # P('never'|'language is')
There are some kinks on the Kaggle platform when loading the notebook, but at some point this notebook should give a good overview of the nltk.lm module: https://www.kaggle.com/alvations/n-gram-language-model-with-nltk
I am trying to fit a Word2Vec model. According to the documentation for Gensim's Word2Vec we do not need to call model.build_vocabulary before using it.
Yet it is asking me to do so. I have tried calling this function and it has not worked. I also fitted a Word2Vec model before without needing to call model.build_vocabulary.
Am I doing something wrong? Here is my code:
from gensim.models import Word2Vec
# Read the tab-delimited file into `dataset`, treating '\n' as the line terminator.
# NOTE(review): `pd` is not imported in this snippet — presumably `import pandas as pd`; confirm.
dataset = pd.read_table('genemap_copy.txt',delimiter='\t', lineterminator='\n')
def row_to_sentences(dataframe):
    """Turn each row of *dataframe* into a one-element list holding its text.

    The values of every column in a row are converted with ``str`` and
    concatenated, each preceded by a single space, so each sentence starts
    with a leading space.

    Args:
        dataframe: a pandas DataFrame; rows are visited via ``iterrows()``.

    Returns:
        A list with one ``[sentence]`` entry per visited row.
    """
    columns = dataframe.columns.values
    corpus = []
    for index, row in dataframe.iterrows():
        # Cap the corpus: stop once the row labelled 1000 is reached.
        # NOTE(review): the body of this `if` was lost in the flattened
        # source; `break` is the assumed intent — confirm.
        if index == 1000:
            break
        sentence = ''.join(' ' + str(row[column]) for column in columns)
        corpus.append([sentence])
    return corpus
corpus = row_to_sentences(dataset)
# Word2Vec consumes each sentence as a list of tokens. Keeping the whole row
# as one string would make every row a single "word" that occurs once and is
# then discarded by min_count, so split the lower-cased text on whitespace.
clean_corpus = [sentence[0].lower().split() for sentence in corpus]
# model = Word2Vec()
# model.build_vocab(clean_corpus)
model = Word2Vec(clean_corpus, size=100, window=5, min_count=5, workers=4)
Also I am using macOS Sierra.
I think my problem was having the parameter min_count=5 so it was not considering most of my words if they did not appear more than 5 times.
Try with LineSentence:
from gensim.models.word2vec import LineSentence
and then train your corpus with
# NOTE(review): LineSentence is documented as reading from a file path or
# file-like object with one sentence per line; clean_corpus is an in-memory
# list in the question — confirm this call works as intended.
model = Word2Vec(LineSentence(clean_corpus), size=100, window=5, min_count=5, workers=4)
Is it that you are appending a new list containing a single sentence each time? corpus.append([sentence]). You need to feed Word2Vec a series of sentences, but not necessarily sentences gathered by document. I'm also not clear on what is in your df but have you tokenised the sentences already?
My generator class I've used before for Word2Vec...
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess
class MySentences(object):
    """Iterable that yields one tokenized sentence at a time from documents.

    Because ``__iter__`` is a generator method, every new iteration starts
    again from the first document, so the object can be traversed repeatedly.
    """

    def __init__(self, docs):
        """Store *docs*, an iterable of document strings."""
        self.corpus = docs

    def __iter__(self):
        """Yield the ``simple_preprocess`` token list of each sentence."""
        for doc in self.corpus:
            # Split the document into sentences first, then tokenize each one.
            for sent in sent_tokenize(doc):
                yield simple_preprocess(sent)
# Wrap the documents of the 'text' column and train Word2Vec on the sentences they yield.
# NOTE(review): `df` and the `gensim` import are not defined in this snippet —
# presumably a DataFrame with a 'text' column and `import gensim`; confirm.
sentences = MySentences(df['text'].tolist())
model = gensim.models.Word2Vec(sentences, min_count=5, workers=8, size=300, sg=1)
I would like to analyze my first deep learning model using Python, and in order to do so I first have to split my corpus (8807 articles) into sentences. My corpus is built as follows:
## Libraries to download
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import json
import nltk
import re
import pandas
def _read_json_lines(path):
    """Load a file holding one JSON object per line into a DataFrame."""
    # The context manager closes the handle, which a bare open() never did.
    with open(path) as handle:
        return pandas.DataFrame([json.loads(line) for line in handle])


# One DataFrame per (source, year) file, concatenated below.
appended_data = []
for i in range(2005, 2016):
    # NOTE(review): the flattened source lost its indentation and the append
    # calls. It is assumed that only the SDM files are limited to years after
    # 2013 (as the guard suggests) and that every loaded frame is collected —
    # confirm.
    if i > 2013:
        appended_data.append(_read_json_lines('SDM_%d.json' % i))
    for prefix in ('Scot', 'APJ', 'TH500', 'DRSM'):
        appended_data.append(_read_json_lines('%s_%d.json' % (prefix, i)))

appended_data = pandas.concat(appended_data)
# The article bodies are the documents to be processed.
doc_set = appended_data.body
I am trying to use the function Word2Vec.load_word2vec_format from the library gensim.models but I have to first split my corpus (doc_set) into sentences.
from gensim.models import word2vec
# NOTE(review): only the lowercase `word2vec` module is imported just above,
# so the name `Word2Vec` is unbound here. load_word2vec_format also appears
# to be meant for reading previously saved vectors, not for training on raw
# text — confirm against the gensim documentation.
model = Word2Vec.load_word2vec_format(doc_set, binary=False)
Any recommendations?
So, Gensim's Word2Vec requires this format for its training input: sentences = [['first', 'sentence'], ['second', 'sentence']].
I assume your documents contain more than one sentence. You should first split by sentences, you can do that with nltk (you might need to download the model first). Then tokenize each sentence and put everything together in a list.
import itertools

# Punkt sentence splitter (the model may need to be downloaded first).
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
# doc_set was assigned from the `.body` column itself, so apply the splitter
# to it directly; going through `.body` again would fail.
sentenized = doc_set.apply(sent_detector.tokenize)
sentences = itertools.chain.from_iterable(sentenized.tolist())  # just to flatten
# One token list per sentence: the input format Word2Vec expects.
result = [nltk.word_tokenize(sent) for sent in sentences]
Unfortunately I am not good enough with Pandas to perform all the operations in a "pandastic" way.
Pay a lot of attention to the parameters of Word2Vec; picking them right can make a huge difference.
I try to train a corpus with my own documents. My documents are structured in the same way as the original movie_reviews corpus data, so 1K positive text files in folder 'pos' and 1K negative text files in folder 'neg'. Each textfile contains 25 lines of tweets, which are cleaned, as in: urls, usernames, capital letters, punctuation removed.
How can I adjust this code to use my own text data instead of the movie_reviews?
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from collections import defaultdict
import numpy as np
# Fraction of the labelled instances used for training; the remainder is
# held out for testing.
SPLIT = 0.8
def word_feats(words):
    """Return a feature dict that flags every word in *words* as present.

    Keys are the distinct words and every value is ``True``.
    """
    return {word: True for word in words}
posids = movie_reviews.fileids('pos')
negids = movie_reviews.fileids('neg')

# One (features, label) pair per file in each category.
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# Cut each class at its own length, so classes of unequal size are still
# split by the same SPLIT ratio. `cutoff` keeps its original meaning (the
# positive-class cutoff).
cutoff = int(len(posfeats) * SPLIT)
negcutoff = int(len(negfeats) * SPLIT)
trainfeats = negfeats[:negcutoff] + posfeats[:cutoff]
testfeats = negfeats[negcutoff:] + posfeats[cutoff:]

# Parenthesized single-argument prints behave the same on Python 2 and 3.
print('Train on %d instances\nTest on %d instances' % (len(trainfeats), len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats)
print('Accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
You can log in as a root user and change your directory path to this:
In this document you can find already existing movie_reviews corpora loaded using LazyCorpusLoader:
# Existing loader for the movie_reviews corpus: file ids ending in .txt that
# do not start with a dot are read, and cat_pattern takes the category from
# the leading neg/ or pos/ folder of each file id.
movie_reviews = LazyCorpusLoader(
'movie_reviews', CategorizedPlaintextCorpusReader,
r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*')
Then try adding some thing similar to this:
# Loader for the custom corpus, declared with the same reader class and file
# and category patterns as movie_reviews; only the corpus name differs.
My_Movie = LazyCorpusLoader(
'My_Movie', CategorizedPlaintextCorpusReader,
r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*')
Where My_Movie is the name which you have created for your movie reviews.
Once everything is done, save and exit.
Finally, place your corpus in the nltk directory where you can find the movie_reviews corpus.
Try performing this:
from nltk.corpus import My_Movie # Your own newly created corpus
Hope this will work.
I've been using the maxent classifier in Python and it's failing, and I don't understand why.
I'm using the movie reviews corpus.
(total noob)
import nltk.classify.util
from nltk.classify import MaxentClassifier
from nltk.corpus import movie_reviews
def word_feats(words):
    """Build a presence-only feature dict from an iterable of words.

    Every distinct word maps to ``True``.
    """
    return {word: True for word in words}
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# One (features, label) pair per review file in each category.
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# Floor division: slice indices must be integers, and `/` yields a float on
# Python 3.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

# Trained with the library's default algorithm and iteration settings.
classifier = MaxentClassifier.train(trainfeats)
This is the error (I know I'm doing this wrong please link to how Maxent works)
Warning (from warnings module):
File "C:\Python27\lib\site-packages\nltk\classify\maxent.py", line 1334
sum1 = numpy.sum(exp_nf_delta * A, axis=0)
RuntimeWarning: invalid value encountered in multiply
Warning (from warnings module):
File "C:\Python27\lib\site-packages\nltk\classify\maxent.py", line 1335
sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)
RuntimeWarning: invalid value encountered in multiply
Warning (from warnings module):
File "C:\Python27\lib\site-packages\nltk\classify\maxent.py", line 1341
deltas -= (ffreq_empirical - sum1) / -sum2
RuntimeWarning: invalid value encountered in divide
I changed and updated the code a bit.
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import MaxentClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from sklearn import cross_validation
from nltk.classify import MaxentClassifier
from nltk.corpus import movie_reviews
def word_feats(words):
    """Return ``{word: True}`` for every distinct word in *words*."""
    return {word: True for word in words}
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# One (features, label) pair per review file in each category.
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# Floor division: slice indices must be integers, and `/` yields a float on
# Python 3.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

#classifier = nltk.MaxentClassifier.train(trainfeats)
# Pick the first listed algorithm explicitly and cap training at three iterations.
algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
classifier = nltk.MaxentClassifier.train(trainfeats, algorithm, max_iter=3)
# Count every token across all reviews, then keep the 300 most frequent.
# most_common() ranks by count explicitly; slicing keys() relies on an
# ordering that is not guaranteed and does not work on Python 3, where
# keys() is a view.
all_words = nltk.FreqDist(movie_reviews.words())
top_words = {word for word, _count in all_words.most_common(300)}
def word_feats(words):
    """Return a presence feature dict restricted to the module-level ``top_words``.

    Words that are not in ``top_words`` are dropped, which keeps the feature
    space small.
    """
    return {word: True for word in words if word in top_words}
There's probably a fix for the numpy overflow issue but since this is just a movie review classifier for learning NLTK / text classification (and you probably don't want training to take a long time anyway), I'll provide a simple workaround: you can just restrict the words used in feature sets.
You can find the 300 most commonly used words in all reviews like this (you can obviously make that higher if you want),
# Frequency of every token over all reviews; keep the 300 most frequent.
# most_common() orders by count explicitly, whereas keys() carries no
# guaranteed frequency order and cannot be sliced on Python 3.
all_words = nltk.FreqDist(movie_reviews.words())
top_words = {word for word, _count in all_words.most_common(300)}
Then all you have to do is cross-reference top_words in your feature extractor for reviews. Also, just as a suggestion, it's more efficient to use dictionary comprehension rather than convert a list of tuples to a dict. So this might look like,
def word_feats(words):
    """Flag as ``True`` each word of *words* that is also in ``top_words``.

    ``top_words`` is read from module scope; any other word is ignored.
    """
    return {word: True for word in words if word in top_words}