Inefficiency of topic modelling for text clustering - python

I tried doing text clustering using LDA, but it isn't giving me distinct clusters. Below is my code
#Import libraries
from gensim import corpora, models
import pandas as pd
from gensim.parsing.preprocessing import STOPWORDS
from itertools import chain
#stop words
stoplist = list(STOPWORDS)
new = ['education','certification','certificate','certified']
stoplist.extend(new)
stoplist.sort()
#read data
dat = pd.read_csv('D:\data_800k.csv',encoding='latin').Certi.tolist()
#remove stop words
texts = [[word for word in document.lower().split() if word not in stoplist] for document in dat]
#dictionary
dictionary = corpora.Dictionary(texts)
#corpus
corpus = [dictionary.doc2bow(text) for text in texts]
#train model
lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=25, workers=4,minimum_probability=0)
#print topics
lda.print_topics(num_topics=25, num_words=7)
#get corpus
lda_corpus = lda[corpus]
#calculate cutoff score
scores = list(chain(*[[score for topic_id,score in topic] \
for topic in [doc for doc in lda_corpus]]))
#threshold
threshold = sum(scores)/len(scores)
threshold
0.039999999971137644
#cluster1
cluster1 = [j for i,j in zip(lda_corpus,dat) if i[0][1] > threshold]
#cluster2
cluster2 = [j for i,j in zip(lda_corpus,dat) if i[1][1] > threshold]
The problem is that there are overlapping elements in cluster1 which also tend to be present in cluster2, and so on.
I also tried increasing the threshold manually to 0.5, but it gives me the same issue.

That is just realistic.
Neither documents nor words are usually uniquely assignable to a single cluster.
If you were to manually label some data, you would also quickly find documents that cannot be clearly labeled as one or the other. So it's good if the algorithm doesn't pretend there is a clean unique assignment.
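If a disjoint partition is still needed downstream, a minimal sketch (reusing lda_corpus and dat from the question) is to hard-assign each document to its single most probable topic instead of thresholding every topic independently:
from collections import defaultdict
# hard assignment: each document goes only to the cluster of its most probable topic
clusters = defaultdict(list)
for doc_topics, text in zip(lda_corpus, dat):
    best_topic = max(doc_topics, key=lambda pair: pair[1])[0]
    clusters[best_topic].append(text)
# clusters[k] now holds the documents whose dominant topic is k; the clusters are disjoint,
# even though the underlying topic distributions still overlap.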

Related

Assign Topic from NNMF Topic Modelling

I have a list of text comments that are fed into a non-negative matrix factorization topic modelling program.
import pandas as pd
import numpy as np
# load the data
import csv
with open('C:\\...\\comments.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader) # skip header
    df = [tuple(row) for row in reader]
# set the number of topics
total_topics = 3
# process the data
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords
data_text = pd.DataFrame(df,columns=['text'])
# remove stopwords and tokenize the text
custom_stops = ["stopword1", "stopword2", "stopword3"]
data_text['filtered_text'] = data_text['text'].apply(lambda x: remove_stopwords(x.lower()))
data_text['filtered_text'] = data_text['filtered_text'].apply(lambda x: str.split(x))
data_text['filtered_text'] = data_text['filtered_text'].apply(lambda x: [item for item in x if item.lower() not in custom_stops])
CORPUS = pd.DataFrame(data_text['filtered_text'])
# Remove empty strings
CORPUS.dropna(inplace=True)
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
# lemmatize the text
for index, entry in enumerate(CORPUS['filtered_text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    CORPUS.loc[index, 'text_final'] = str(Final_words)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def build_feature_matrix(documents, feature_type='frequency'):
    feature_type = feature_type.lower().strip()
    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=1, ngram_range=(1, 1))
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=1, ngram_range=(1, 1))
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1))
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")
    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix
# create a feature matrix
vectorizer, tfidf_matrix = build_feature_matrix(CORPUS['text_final'], feature_type='tfidf')
td_matrix = tfidf_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)
from sklearn.decomposition import NMF
nmf = NMF(n_components=total_topics, random_state=42, alpha=.1, l1_ratio=.5)
nmf.fit(tfidf_matrix)
def get_topics_terms_weights(weights, feature_names):
    feature_names = np.array(feature_names)
    sorted_indices = np.array([list(row[::-1])
                               for row
                               in np.argsort(np.abs(weights))])
    sorted_weights = np.array([list(wt[index])
                               for wt, index
                               in zip(weights, sorted_indices)])
    sorted_terms = np.array([list(feature_names[row])
                             for row
                             in sorted_indices])
    topics = [np.vstack((terms.T,
                         term_weights.T)).T
              for terms, term_weights
              in zip(sorted_terms, sorted_weights)]
    return topics
def print_topics_udf(topics, total_topics=1,
                     weight_threshold=0.0001,
                     display_weights=False,
                     num_terms=None):
    for index in range(total_topics):
        topic = topics[index]
        topic = [(term, float(wt))
                 for term, wt in topic]
        topic = [(word, round(wt, 2))
                 for word, wt in topic
                 if abs(wt) >= weight_threshold]
        if display_weights:
            print('Topic #' + str(index+1) + ' with weights')
            print(topic[:num_terms] if num_terms else topic)
        else:
            print('Topic #' + str(index+1) + ' without weights')
            tw = [term for term, wt in topic]
            print(tw[:num_terms] if num_terms else tw)
        print()
feature_names = vectorizer.get_feature_names()
weights = nmf.components_
topics = get_topics_terms_weights(weights, feature_names)
# print topics and weights
# print_topics_udf(topics=topics,total_topics=total_topics,num_terms=None,display_weights=False)
# print topics with weights
# print_topics_udf(topics=topics,total_topics=total_topics,num_terms=None,display_weights=True)
# display the topics
# this takes the top term from each group and assigns it as the topic theme
for index in range(0, total_topics):
    print("Topic", index+1, "=", topics[index][0][0])
The example output may be something like:
Topic 1 = problem
Topic 2 = software
Topic 3 = recommendation
How can I assign a specific comment from the file a specific topic? e.g., the comment "My computer has an issue of turning off intermittently" would be mapped to Topic 1 "problem"
The answer is to transform the document term matrix to pull out the factorized document topic matrix:
W = nmf.fit_transform(tfidf_matrix)
where the tf-idf matrix = W x H, with W the document-topic matrix and H the topic-term matrix. Slide 25 of the link gives a good visualization of this technique:
http://derekgreene.com/slides/topic-modelling-with-scikitlearn.pdf
Thus, the highest value in W for the respective comment row corresponds to the assigned topic. I iterated across the rows to assign these topics via
data_text['topic'] = ""
for row in range(len(data_text['text'])):
    data_text['topic'][row] = topics[np.argmax(W[row])][0][0]
To extend the example in the question, if the row at index 1, data_text['text'][1], is "My computer has an issue of turning off intermittently", the corresponding row W[1] may be [0.5412, 0.0201, 0.0]. Since the highest value is in the first column, this sentence should be mapped to the first topic (i.e., the 'problem' topic). The text label of this topic is written to data_text['topic'][1] via topics[np.argmax(W[row])][0][0].
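For reference, the same assignment can be written without the explicit row loop; a minimal sketch, assuming W, topics and data_text as defined above:
import numpy as np
# argmax over each row of the document-topic matrix W gives the dominant topic per comment
dominant = np.argmax(W, axis=1)
data_text['topic'] = [topics[i][0][0] for i in dominant]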

getting top words from the tf-idf sparse matrix (highest tf-idf value)

I have a list of size 208 (208 arrays of sentences), that looks like:
all_words = [["this is a sentence ... "] , [" another one hello bob this is alice ... "] , ["..."] ...]
I want to get the words with the highest tf-idf values.
I created a tf-idf matrix:
from sklearn.feature_extraction.text import TfidfVectorizer
tokenize = lambda doc: doc.split(" ")
sklearn_tfidf = TfidfVectorizer(norm='l2', tokenizer=tokenize, ngram_range=(1,2))
tfidf_matrix = sklearn_tfidf.fit_transform(all_words)
sentences = sklearn_tfidf.get_feature_names()
dense_tfidf = tfidf_matrix.todense()
Now I don't know how to get the words with the highest tf-idf values.
Each column of dense_tfidf represents a word or a 2-word phrase (the matrix is 208x5481).
When I summed each column, it didn't really help; I got the same result as a simple top-words list (I guess because it's essentially the same as a plain word count).
How can I get the words with the highest tf-idf value? Or how can I normalize it wisely?
I had a similar issue but found this at https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f; just change the X and y inputs based on your dataframe. The code from the blog is below. Sklearn's docs also helped me: http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html
from sklearn.feature_selection import chi2
import numpy as np
N = 2
for Product, category_id in sorted(category_to_id.items()):
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Product))
    print(" . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print(" . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

UnpicklingError was unhandled by user code : invalid load key,'%'

I have been following the tutorial at https://radimrehurek.com/gensim/tut2.html. I have come across the error UnpicklingError was unhandled by user code: invalid load key, '%'. How do I clear that error? I looked at other related questions and included the klepto package, but the error still persists. I am using anaconda2. This is the code:
import logging
import xml.etree.cElementTree
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
import os
import klepto
from gensim import corpora
documents = ["Human machine interface for lab abc computer applications",
"A survey of user opinion of computer system response time",
"The EPS user interface management system",
"System and human system engineering testing of EPS",
"Relation of user perceived response time to error measurement",
"The generation of random binary unordered trees",
"The intersection graph of paths in trees",
"Graph minors IV Widths of trees and well quasi ordering",
"Graph minors A survey"]
# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
for document in documents]
# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1] for text in texts]
from pprint import pprint # pretty-printer
pprint(texts)
dictionary = corpora.Dictionary(texts)
dictionary.save_as_text('/tmp/deerwester.dict') # store the dictionary, for future reference
print(dictionary)
print(dictionary.token2id)
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec) # the word "interaction" does not appear in the dictionary and is ignored
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.dict', corpus) # store to disk, for later use
for c in corpus:
    print(c)
class MyCorpus(object):
    def __iter__(self):
        for line in open('/datasets/mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())
corpus_memory_friendly = MyCorpus() # doesn't load the corpus into memory!
print(corpus_memory_friendly)
for vector in corpus_memory_friendly: # load one vector into memory at a time
    print(vector)
from six import iteritems
# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('/datasets/mycorpus.txt'))
# remove stop words and words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
# remove stop words and words that appear only once
dictionary.filter_tokens(stop_ids + once_ids)
# remove gaps in id sequence after words that were removed
dictionary.compactify()
print(dictionary)
# create a toy corpus of 2 documents, as a plain Python list
corpus = [[(1, 0.5)], []] # make one document empty, for the heck of it
corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)
corpus = corpora.MmCorpus('/tmp/corpus.mm')
print(corpus)
# one way of printing a corpus: load it entirely into memory
print(list(corpus)) # calling list() will convert any sequence to a plain Python list
# another way of doing it: print one document at a time, making use of the streaming interface
for doc in corpus:
    print(doc)
corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
import gensim
import numpy as np
numpy_matrix = np.random.randint(10, size=[5,2])
corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
numpy_matrix_dense = gensim.matutils.corpus2dense(corpus, num_terms=10)
import scipy.sparse
scipy_sparse_matrix = scipy.sparse.random(5,2)
corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)
from gensim import corpora, models, similarities
if (os.path.exists("/tmp/deerwester.dict")):
    dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
    corpus = corpora.MmCorpus('/tmp/deerwester.mm')
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow]) # step 2 -- use the model to transform vectors
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
lsi.print_topics(2)
for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    print(doc)
lsi.save('/tmp/model.lsi') # same for tfidf, lda, ...
lsi = models.LsiModel.load('/tmp/model.lsi')
model = models.TfidfModel(corpus, normalize=True)
model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
model.add_documents(another_tfidf_corpus) # now LSI has been trained on tfidf_corpus + another_tfidf_corpus
lsi_vec = model[tfidf_vec] # convert some new document into the LSI space, without affecting the model
model.add_documents(more_documents) # tfidf_corpus + another_tfidf_corpus + more_documents
lsi_vec = model[tfidf_vec]
model = models.RpModel(tfidf_corpus, num_topics=500)
model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)
model = models.HdpModel(corpus, id2word=dictionary)
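A likely cause, judging from the paths above (an assumption, not a confirmed diagnosis): corpora.MmCorpus.serialize('/tmp/deerwester.dict', corpus) overwrites the dictionary file with a Matrix Market file whose header starts with '%', so the later corpora.Dictionary.load('/tmp/deerwester.dict') tries to unpickle that file and fails with invalid load key, '%'. A minimal sketch of a fix, reusing dictionary and corpus from the code above, is to keep the two files apart and save the dictionary with save() (pickled) instead of save_as_text():
from gensim import corpora
# save the dictionary pickled so Dictionary.load() can read it back
dictionary.save('/tmp/deerwester.dict')
# serialize the corpus to its own Matrix Market file, not the dictionary path
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
# later:
dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
corpus = corpora.MmCorpus('/tmp/deerwester.mm')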

NLTK Corpus Term-by-Document matrix

I am going to use CountVectorizer with a large corpus which I retrieve from Gutenberg (or any data set from nltk).
There are ebooks in this corpus. I want to gather all the sentences in those books into the same list. Something like this:
listsentences=["SENTENCE#1" ,"SENTENCE#2" ,"SENTENCE#3" ...]
I am stuck on how to create the sentence list.
Any help is massively appreciated!
This is what my code looks like:
import nltk
import pandas as pd
from nltk.corpus import gutenberg
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
text=nltk.corpus.gutenberg.fileids()
gutenberg.fileids()
emma=gutenberg.sents()
vectorizer=CountVectorizer(min_df = 1, stop_words = 'english')
dtm= vectorizer.fit_transform(emma)
pd.DataFrame(dtm.toarray(),columns=vectorizer.get_feature_names()).head(10)
vectorizer.get_feature_names()
lsa = TruncatedSVD(3, algorithm = 'arpack')
dtm_lsa = lsa.fit_transform(dtm)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
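One point worth noting (an assumption about the intended listsentences): gutenberg.sents() yields each sentence as a list of tokens, while CountVectorizer expects one raw string per document, so the tokens can be joined back into strings first; a minimal sketch, reusing vectorizer from the code above:
from nltk.corpus import gutenberg
# join each tokenized sentence back into a single string so that
# CountVectorizer can treat every sentence as one document
listsentences = [" ".join(sent) for sent in gutenberg.sents()]
print(listsentences[:3])
dtm = vectorizer.fit_transform(listsentences)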

How to predict the topic of a new query using a trained LDA model using gensim?

I have trained a corpus for LDA topic modelling using gensim.
Going through the tutorial on the gensim website (this is not the whole code):
question = 'Changelog generation from Github issues?';
temp = question.lower()
for i in range(len(punctuation_string)):
    temp = temp.replace(punctuation_string[i], '')
words = re.findall(r'\w+', temp, flags = re.UNICODE | re.LOCALE)
important_words = []
important_words = filter(lambda x: x not in stoplist, words)
print important_words
dictionary = corpora.Dictionary.load('questions.dict')
ques_vec = []
ques_vec = dictionary.doc2bow(important_words)
print dictionary
print ques_vec
print lda[ques_vec]
This is the output that I get:
['changelog', 'generation', 'github', 'issues']
Dictionary(15791 unique tokens)
[(514, 1), (3625, 1), (3626, 1), (3627, 1)]
[(4, 0.20400000000000032), (11, 0.20400000000000032), (19, 0.20263215848547525), (29, 0.20536784151452539)]
I don't know how the last output is going to help me find the possible topic for the question !!!
Please help!
I have written a function in python that gives the possible topic for a new query:
def getTopicForQuery(question):
    temp = question.lower()
    for i in range(len(punctuation_string)):
        temp = temp.replace(punctuation_string[i], '')
    words = re.findall(r'\w+', temp, flags = re.UNICODE | re.LOCALE)
    important_words = []
    important_words = filter(lambda x: x not in stoplist, words)
    dictionary = corpora.Dictionary.load('questions.dict')
    ques_vec = []
    ques_vec = dictionary.doc2bow(important_words)
    topic_vec = []
    topic_vec = lda[ques_vec]
    word_count_array = numpy.empty((len(topic_vec), 2), dtype = numpy.object)
    for i in range(len(topic_vec)):
        word_count_array[i, 0] = topic_vec[i][0]
        word_count_array[i, 1] = topic_vec[i][1]
    idx = numpy.argsort(word_count_array[:, 1])
    idx = idx[::-1]
    word_count_array = word_count_array[idx]
    final = []
    final = lda.print_topic(word_count_array[0, 0], 1)
    question_topic = final.split('*') ## as format is like "probability * topic"
    return question_topic[1]
Before going through this, do refer to this link!
In the initial part of the code, the query is pre-processed so that it is stripped of stop words and unnecessary punctuation.
Then, the dictionary that was built from our own database is loaded.
We then convert the tokens of the new query to bag-of-words, and the topic probability distribution of the query is calculated by topic_vec = lda[ques_vec], where lda is the trained model as explained in the link referred to above.
The distribution is then sorted w.r.t. the probabilities of the topics. The topic with the highest probability is then displayed by question_topic[1].
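A hypothetical usage example, assuming punctuation_string, stoplist, the saved questions.dict and the trained lda model from the question are already in scope:
# should return the single top word of the query's most probable topic
print(getTopicForQuery('Changelog generation from Github issues?'))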
Assuming we just need the topic with the highest probability, the following code snippet may be helpful:
def findTopic(testObj, dictionary):
    text_corpus = []
    '''
    For each query (document in the test file), tokenize the
    query, create a feature vector just like how it was done while training
    and create text_corpus
    '''
    for query in testObj:
        temp_doc = tokenize(query.strip())
        current_doc = []
        for word in range(len(temp_doc)):
            if temp_doc[word][0] not in stoplist and temp_doc[word][1] == 'NN':
                current_doc.append(temp_doc[word][0])
        text_corpus.append(current_doc)
    '''
    For each feature vector text, lda[doc_bow] gives the topic
    distribution, which can be sorted in descending order to print the
    very first topic
    '''
    for text in text_corpus:
        doc_bow = dictionary.doc2bow(text)
        print(text)
        topics = sorted(lda[doc_bow], key=lambda x: x[1], reverse=True)
        print(topics)
        print(topics[0][0])
The tokenize function removes punctuation and domain-specific characters and gives the list of tokens. Here the dictionary created during training is passed as a parameter of the function, but it can also be loaded from a file.
Basically, Anjmesh Pandey suggested good example code. However, the first word with the highest probability in a topic may not solely represent the topic, because in some cases several clustered topics share their most common words, even at the top of their rankings. Therefore it is enough to return the index of the topic that is most likely to be close to the query:
topic_id = sorted(lda[ques_vec], key=lambda pair: -pair[1])[0][0]
The transformation of ques_vec gives you the per-topic scores, and you can then try to understand what the unlabeled topic is about by checking the words that contribute most to it:
latent_topic_words = [word for word, score in lda.show_topic(topic_id)]
The show_topic() method returns a list of tuples sorted by the score of each word contributing to the topic, in descending order, and we can roughly understand the latent topic by checking those words and their weights.
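A short hypothetical usage of the two lines above, assuming lda and ques_vec exist as in the first answer:
topic_id = sorted(lda[ques_vec], key=lambda pair: -pair[1])[0][0]
for word, weight in lda.show_topic(topic_id, topn=10):
    # print the ten words that contribute most to the query's dominant topic
    print(word, round(weight, 4))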
