Assign Topic from NNMF Topic Modelling - python

I have a list of text comments that are fed into a non-negative matrix factorization topic modelling program.
import pandas as pd
import numpy as np
# load the data
import csv
with open('C:\\...\\comments.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader)  # skip header
    df = [tuple(row) for row in reader]
# set the number of topics
total_topics = 3
# process the data
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords
data_text = pd.DataFrame(df,columns=['text'])
# remove stopwords and tokenize the text
custom_stops = ["stopword1", "stopword2", "stopword3"]
data_text['filtered_text'] = data_text['text'].apply(lambda x: remove_stopwords(x.lower()))
data_text['filtered_text'] = data_text['filtered_text'].apply(lambda x: str.split(x))
data_text['filtered_text'] = data_text['filtered_text'].apply(lambda x: [item for item in x if item.lower() not in custom_stops])
CORPUS = pd.DataFrame(data_text['filtered_text'])
# Remove empty strings
CORPUS.dropna(inplace=True)
# WordNetLemmatizer requires POS tags to know whether a word is a noun, verb, adjective, etc. By default it assumes a noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
# lemmatize the text
for index, entry in enumerate(CORPUS['filtered_text']):
    # Declare an empty list to store the words that pass the rules for this step
    Final_words = []
    # Initialize WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag provides the tag, i.e. whether the word is a noun (N), verb (V), or something else
    for word, tag in pos_tag(entry):
        # Check for stop words and keep only alphabetic tokens
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each row is stored in 'text_final'
    CORPUS.loc[index, 'text_final'] = str(Final_words)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def build_feature_matrix(documents, feature_type='frequency'):
    feature_type = feature_type.lower().strip()
    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=1, ngram_range=(1, 1))
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=1, ngram_range=(1, 1))
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1))
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")
    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix
# create a feature matrix
vectorizer, tfidf_matrix = build_feature_matrix(CORPUS['text_final'], feature_type='tfidf')
td_matrix = tfidf_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)
from sklearn.decomposition import NMF
nmf = NMF(n_components=total_topics, random_state=42, alpha=.1, l1_ratio=.5)
nmf.fit(tfidf_matrix)
def get_topics_terms_weights(weights, feature_names):
    feature_names = np.array(feature_names)
    sorted_indices = np.array([list(row[::-1])
                               for row
                               in np.argsort(np.abs(weights))])
    sorted_weights = np.array([list(wt[index])
                               for wt, index
                               in zip(weights, sorted_indices)])
    sorted_terms = np.array([list(feature_names[row])
                             for row
                             in sorted_indices])
    topics = [np.vstack((terms.T,
                         term_weights.T)).T
              for terms, term_weights
              in zip(sorted_terms, sorted_weights)]
    return topics
def print_topics_udf(topics, total_topics=1,
                     weight_threshold=0.0001,
                     display_weights=False,
                     num_terms=None):
    for index in range(total_topics):
        topic = topics[index]
        topic = [(term, float(wt))
                 for term, wt in topic]
        topic = [(word, round(wt, 2))
                 for word, wt in topic
                 if abs(wt) >= weight_threshold]
        if display_weights:
            print('Topic #' + str(index+1) + ' with weights')
            print(topic[:num_terms] if num_terms else topic)
        else:
            print('Topic #' + str(index+1) + ' without weights')
            tw = [term for term, wt in topic]
            print(tw[:num_terms] if num_terms else tw)
        print()
feature_names = vectorizer.get_feature_names()
weights = nmf.components_
topics = get_topics_terms_weights(weights, feature_names)
# print topics and weights
# print_topics_udf(topics=topics,total_topics=total_topics,num_terms=None,display_weights=False)
# print topics with weights
# print_topics_udf(topics=topics,total_topics=total_topics,num_terms=None,display_weights=True)
# display the topics
# this takes the top term from each group and assigns it as the topic theme
for index in range(0, total_topics):
    print("Topic", index+1, "=", topics[index][0][0])
The example output may be something like:
Topic 1 = problem
Topic 2 = software
Topic 3 = recommendation
How can I assign a specific topic to a specific comment from the file? e.g., the comment "My computer has an issue of turning off intermittently" would be mapped to Topic 1, "problem".

The answer is to transform the document term matrix to pull out the factorized document topic matrix:
W = nmf.fit_transform(tfidf_matrix)
where the TF-IDF matrix = W x H, with W the document-topic matrix and H the topic-term matrix. Slide 25 of the linked deck gives a good visualization of this technique:
http://derekgreene.com/slides/topic-modelling-with-scikitlearn.pdf
Thus, the highest value in the row of W for a given comment indicates its assigned topic. I iterated across the rows to assign these topics via
data_text['topic'] = ""
for row in range(len(data_text['text'])):
    data_text.loc[row, 'topic'] = topics[np.argmax(W[row])][0][0]
To extend the example in the question: if data_text['text'][1] is "My computer has an issue of turning off intermittently", the corresponding row W[1] may be [0.5412, 0.0201, 0.0]. Since the highest value is in the first column, this sentence is mapped to the first topic (i.e., the 'problem' topic). The text label for that topic is then written to data_text['topic'][1] via topics[np.argmax(W[1])][0][0].
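A loop-free variant (a minimal sketch, assuming W has one row per comment in data_text and that topics[k][0][0] holds the top term of topic k) is to take the argmax over the topic axis in a single call:
# Sketch: vectorized topic assignment, one argmax per comment row of W.
top_topic_idx = np.argmax(W, axis=1)
data_text['topic'] = [topics[k][0][0] for k in top_topic_idx]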

Related

Use trained model sentiment on the dataframe python

I have a dataframe
0 i only need uxy to hit 20 eod to make up for a...
1 oh this isn’t good
2 lads why is my account covered in more red ink...
3 i'm tempted to drop my last 800 into some stup...
4 the sell offs will continue until moral improves.
I want to apply NLP for each comment to identify which one is positive and which one is negative.
Here is what I have
import pandas as pd
import numpy as np
from nltk.corpus import movie_reviews
from random import shuffle
from nltk import FreqDist
from nltk.corpus import stopwords
import string
from nltk import NaiveBayesClassifier
from nltk import classify
from nltk.tokenize import word_tokenize
df = pd.read_csv("/home/yan/PycharmProjects/pythonProject/comments_binary.csv")
pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    pos_reviews.append(words)
neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    neg_reviews.append(words)
stopwords_english = stopwords.words('english')
def bag_of_words(words):
    words_clean = []
    for word in words:
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_clean.append(word)
    words_dictionary = dict([word, True] for word in words_clean)
    return words_dictionary
# positive reviews feature set
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_words(words), 'pos'))
# negative reviews feature set
neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_words(words), 'neg'))
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)
test_set = pos_reviews_set[:200] + neg_reviews_set[:200]
train_set = pos_reviews_set[200:] + neg_reviews_set[200:]
classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, test_set)
custom_review = "I am pretty sure that TSLA will hit 500 today after open"
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_words(custom_review_tokens)
print (classifier.classify(custom_review_set)) # Output: pos
I am confused about how to apply this to each row of text and create a separate column with 'pos' or 'neg' labels describing each comment.
I tried to create a function
def my_classification(x):
    return classifier.classify(x)
df["new_column"] = df["text"].apply(my_classification)
But it says AttributeError: 'str' object has no attribute 'copy'
I would highly appreciate your help
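One likely explanation (a sketch only, not tested against your data; the df["text"] column name comes from your apply() call) is that classifier.classify() expects a feature dictionary like the ones built by bag_of_words(), while df["text"].apply() hands it raw strings. Tokenizing each row and reusing bag_of_words() lines the inputs up with what the classifier saw during training:
# Sketch: build the same feature dicts used at training time for each row of text.
def my_classification(text):
    tokens = word_tokenize(text)           # raw string -> tokens
    features = bag_of_words(tokens)        # tokens -> {word: True} feature dict
    return classifier.classify(features)   # returns 'pos' or 'neg'

df["sentiment"] = df["text"].apply(my_classification)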

Inefficiency of topic modelling for text clustering

I tried doing text clustering using LDA, but it isn't giving me distinct clusters. Below is my code
#Import libraries
from gensim import corpora, models
import pandas as pd
from gensim.parsing.preprocessing import STOPWORDS
from itertools import chain
#stop words
stoplist = list(STOPWORDS)
new = ['education','certification','certificate','certified']
stoplist.extend(new)
stoplist.sort()
#read data
dat = pd.read_csv('D:\data_800k.csv',encoding='latin').Certi.tolist()
#remove stop words
texts = [[word for word in document.lower().split() if word not in stoplist] for document in dat]
#dictionary
dictionary = corpora.Dictionary(texts)
#corpus
corpus = [dictionary.doc2bow(text) for text in texts]
#train model
lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=25, workers=4,minimum_probability=0)
#print topics
lda.print_topics(num_topics=25, num_words=7)
#get corpus
lda_corpus = lda[corpus]
#calculate cutoff score
scores = list(chain(*[[score for topic_id, score in topic]
                      for topic in [doc for doc in lda_corpus]]))
#threshold
threshold = sum(scores)/len(scores)
threshold
0.039999999971137644
#cluster1
cluster1 = [j for i,j in zip(lda_corpus,dat) if i[0][1] > threshold]
#cluster2
cluster2 = [j for i,j in zip(lda_corpus,dat) if i[1][1] > threshold]
The problem is there are overlapping elements in cluster1, which tend to be present in cluster2 and so on.
I also tried increasing the threshold manually to 0.5, but it gives me the same issue.
That is just realistic.
Neither documents nor words are usually uniquely assignable to a single cluster.
If you were to manually label some data, you would also quickly find documents that cannot be clearly labeled as one cluster or the other. So it's good if the algorithm doesn't pretend there is a clean unique assignment.
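If a single label per document is nevertheless required, one pragmatic option (a minimal sketch against the lda_corpus variable above, not something this answer prescribes) is to take each document's highest-probability topic instead of thresholding every score:
# Sketch: hard assignment, one topic id per document.
hard_clusters = [max(doc, key=lambda pair: pair[1])[0] for doc in lda_corpus]
# hard_clusters[i] is the most probable topic for document i; ties and near-ties
# are hidden, which is exactly the nuance the soft scores preserve.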

CSV file with label

As suggested here Python Tf idf algorithm I use this code to get the frequency of words over a set of documents.
import pandas as pd
import csv
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import codecs
def tokenize(text):
    tokens = word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems
with codecs.open("book1.txt",'r','utf-8') as i1,\
codecs.open("book2.txt",'r','utf-8') as i2,\
codecs.open("book3.txt",'r','utf-8') as i3:
# your corpus
t1=i1.read().replace('\n',' ')
t2=i2.read().replace('\n',' ')
t3=i3.read().replace('\n',' ')
text = [t1,t2,t3]
# word tokenize and stem
text = [" ".join(tokenize(txt.lower())) for txt in text]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(text).todense()
# transform the matrix to a pandas df
matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names())
# sum over each document (axis=0)
top_words = matrix.sum(axis=0).sort_values(ascending=False)
top_words.to_csv('dict.csv', index=True, float_format="%f",encoding="utf-8")
With the last line, I create a CSV file that lists all the words and their frequencies. Is there a way to attach a label to them, to see whether a word belongs only to the third document or to all of them?
My goal is to delete from the CSV file all the words that appear only in the 3rd document (book3).
You can use the isin() method to filter the top_words of the third book out of the top_words of the entire corpus.
(For the example below I downloaded three random books from http://www.gutenberg.org/)
import codecs
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# import nltk
# nltk.download('punkt')
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
def tokenize(text):
    tokens = word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems
with codecs.open("56732-0.txt",'r','utf-8') as i1,\
codecs.open("56734-0.txt",'r','utf-8') as i2,\
codecs.open("56736-0.txt",'r','utf-8') as i3:
# your corpus
t1=i1.read().replace('\n',' ')
t2=i2.read().replace('\n',' ')
t3=i3.read().replace('\n',' ')
text = [t1,t2,t3]
# word tokenize and stem
text = [" ".join(tokenize(txt.lower())) for txt in text]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(text).todense()
# transform the matrix to a pandas df
matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names())
# sum over each document (axis=0)
top_words = matrix.sum(axis=0).sort_values(ascending=False)
# top_words for the 3rd book alone
text = [" ".join(tokenize(t3.lower()))]
matrix = vectorizer.fit_transform(text).todense()
matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names())
top_words3 = matrix.sum(axis=0).sort_values(ascending=False)
# Mask out words in t3
mask = ~top_words.index.isin(top_words3.index)
# Filter those words from top_words
top_words = top_words[mask]
top_words.to_csv('dict.csv', index=True, float_format="%f",encoding="utf-8")
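Note that this mask drops every word that occurs in book3, including words it shares with the other books. If the goal is strictly to remove the words that appear only in book3, one variant (a sketch reusing the same isin() idea; the vec12/vocab12 names are made up for illustration) is to keep any word that also shows up in book1 or book2:
# Sketch: vocabulary of books 1 and 2, then drop only book3-exclusive words.
vec12 = TfidfVectorizer()
vec12.fit([" ".join(tokenize(t1.lower())), " ".join(tokenize(t2.lower()))])
vocab12 = set(vec12.get_feature_names())
only_in_book3 = top_words3.index.difference(vocab12)
top_words = top_words[~top_words.index.isin(only_in_book3)]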

cosine-similarity between consecutive pairs using whole articles in JSON file

I would like to calculate the cosine similarity for the consecutive pairs of articles in a JSON file. So far I have managed to do it, but I just realized that when transforming each article to TF-IDF I am not using the terms from all articles available in the file, only those from each pair. Here is the code I am using, which produces the cosine-similarity coefficient for each consecutive pair of articles.
import json
import nltk
with open('SDM_2015.json') as f:
    data = [json.loads(line) for line in f]
## Loading the packages needed:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
## Defining our functions to filter the data
# Short for stemming each word (common root)
stemmer = nltk.stem.porter.PorterStemmer()
# Short for removing punctuation etc.
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
## First function that creates the tokens
def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]
## Function that, building on the first one, converts all words to lower case and removes punctuation via the map specified above
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
## Lastly, a super function is created that contains all the previous ones plus stopwords removal
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
## Calculation one by one of the cosine similatrity
def foo(x, y):
    tfidf = vectorizer.fit_transform([x, y])
    return ((tfidf * tfidf.T).A)[0, 1]
my_funcs = {}
for i in range(len(data) - 1):
    x = data[i]['body']
    y = data[i+1]['body']
    foo.func_name = "cosine_sim%d" % i
    my_funcs["cosine_sim%d" % i] = foo
    print(foo(x, y))
Any idea of how to develop the cosine-similarity using the whole terms of all articles available in the JSON file rather than only those of each pair?
Kind regards,
Andres
I think, based on our discussion above, you need to change the foo function and everything below. See the code below. Note that I haven't actually run this, since I don't have your data and no sample lines are provided.
## Loading the packages needed:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
import json
from sklearn.metrics.pairwise import cosine_similarity
with open('SDM_2015.json') as f:
    data = [json.loads(line) for line in f]
## Defining our functions to filter the data
# Short for stemming each word (common root)
stemmer = nltk.stem.porter.PorterStemmer()
# Short for removing punctuation etc.
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
## First function that creates the tokens
def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]
## Function that, building on the first one, converts all words to lower case and removes punctuation via the map specified above
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
## tfidf over the whole corpus of article bodies
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
tfidf_data = vectorizer.fit_transform([d['body'] for d in data])
# cosine similarities between every pair of articles
similarity_matrix = cosine_similarity(tfidf_data)
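Since the original goal was the similarity of consecutive pairs, those values sit just off the diagonal of the matrix (a short usage sketch, assuming similarity_matrix and data are built as above):
# Sketch: entry (i, i+1) compares article i with article i+1.
consecutive_sims = [similarity_matrix[i, i + 1] for i in range(len(data) - 1)]
for i, sim in enumerate(consecutive_sims):
    print("cosine_sim%d = %.4f" % (i, sim))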

NLTK: Naive Bayes - where/how to add in ngrams?

I am doing a classification task on tweets (3 labels= pos, neg, neutral), for which I'm using Naive Bayes in NLTK. I'd like to add in ngrams (bigrams) as well. I have tried adding them to the code, but I don't seem to get where to fit them right in. At the moment it seems as if I'm "breaking" the code, no matter where I add in the bigrams. Could anybody please help me out, or redirect me to a tutorial?
My code for unigrams follows. If you need any information on how the datasets look, I'd be happy to provide it.
import nltk
import csv
import random
import nltk.classify.util, nltk.metrics
import codecs
import re, math, collections, itertools
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.util import ngrams
from nltk import bigrams
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
stemmer = SnowballStemmer("english", ignore_stopwords = True)
stopset = set(stopwords.words('english'))
stopset.add('username')
stopset.add('url')
stopset.add('percentage')
stopset.add('number')
stopset.add('at_user')
stopset.add('AT_USER')
stopset.add('URL')
stopset.add('percentagenumber')
inpTweets = []
##with open('sanders.csv', 'r', 'utf-8') as f: #input sanders
##    reader = csv.reader(f, delimiter = ';')
##    for row in reader:
##        inpTweets.append((row))
reader = codecs.open('...sanders.csv', 'r', encoding='utf-8-sig') #input classified tweets
for line in reader:
    line = line.rstrip()
    row = line.split(';')
    inpTweets.append((row))
def processTweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    tweet = re.sub('#[^\s]+', 'AT_USER', tweet)
    tweet = re.sub('[\s]+', ' ', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    tweet = tweet.strip('\'"')
    return tweet
def replaceTwoOrMore(s):
    # look for 2 or more repetitions of a character and replace with the character itself
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
def preprocessing(doc):
    tokens = tokenizer.tokenize(doc)
    bla = []
    for x in tokens:
        if len(x) > 2:
            if x not in stopset:
                val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", x)
                if val is not None:
                    x = replaceTwoOrMore(x)
                    x = processTweet(x)
                    x = x.strip('\'"?,.')
                    x = stemmer.stem(x).lower()
                    bla.append(x)
    return bla
xyz = []
for lijn in inpTweets:
    xyz.append((preprocessing(lijn[0]), lijn[1]))
random.shuffle(xyz)
featureList = []
k = 0
while k in range(0, len(xyz)):
    featureList.extend(xyz[k][0])
    k = k + 1
fd = nltk.FreqDist(featureList)
featureList = list(fd.keys())[2000:]
def document_features(doc):
    features = {}
    document_words = set(doc)
    for word in featureList:
        features['contains(%s)' % word] = (word in document_words)
    return features
featuresets = nltk.classify.util.apply_features(document_features, xyz)
training_set, test_set = featuresets[2000:], featuresets[:2000]
classifier = nltk.NaiveBayesClassifier.train(training_set)
Your code uses the 2000 most common words as the classification features. Just select the bigrams you want to use, and convert them to features in document_features(). A feature like "contains (the dog)" will work just like "contains (dog)".
An interesting approach is using a sequential backoff tagger, which allows you to chain taggers together: in this way you could train an n-gram tagger and a Naive Bayes classifier and chain them together.
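As a concrete illustration of the first suggestion (a sketch only; it reuses the xyz, featureList, and document_features names from the question, while the 2000-bigram cutoff and the bigramList helper are assumptions), you can collect the most frequent bigrams from the preprocessed training tokens and check for them in document_features() exactly like the unigrams:
# Sketch: add the most frequent training bigrams as extra boolean features.
bigramList = []
for tokens, label in xyz:
    bigramList.extend(bigrams(tokens))                 # e.g. ('the', 'dog')
bigramList = [bg for bg, count in nltk.FreqDist(bigramList).most_common(2000)]

def document_features(doc):
    features = {}
    document_words = set(doc)
    document_bigrams = set(bigrams(doc))
    for word in featureList:
        features['contains(%s)' % word] = (word in document_words)
    for bg in bigramList:
        features['contains(%s %s)' % bg] = (bg in document_bigrams)
    return features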
