I would like to classify comments using an NLP approach (tf-idf).
I managed to cluster them, but now I want to visualize the clusters graphically (histogram, scatter plot, ...).
import collections
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import string
data = pd.read_excel(r'C:\Users\cra\One\intern\Book2.xlsx')
def word_tokenizer(text):
    # tokenizes and stems the text
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(t) for t in tokens
              if t not in stopwords.words('english')]
    return tokens
# tf-idf converts the text data to vectors
def cluster_sentences(sentences, nb_of_clusters=5):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                       stop_words=stopwords.words('english'),  # remove stopwords
                                       max_df=0.95, min_df=0.05,
                                       lowercase=True)
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    kmeans = KMeans(n_clusters=nb_of_clusters)
    kmeans.fit(tfidf_matrix)
    clusters = collections.defaultdict(list)
    for i, label in enumerate(kmeans.labels_):
        clusters[label].append(i)
    return dict(clusters)
if __name__ == "__main__":
    sentences = data.Comment
    nclusters = 20
    # dictionary mapping each cluster to the indices of its comments in the dataframe
    clusters = cluster_sentences(sentences, nclusters)
    for cluster in range(nclusters):
        print("cluster ", cluster, ":")
        for i, sentence in enumerate(clusters[cluster]):
            print("\tsentence ", i, ": ", sentences[sentence])
For example, here is part of the result I get:
cluster 6 :
sentence 0 : 26 RIH DP std
sentence 1 : 32 RIH DP std
sentence 2 : 68 RIH Liner with DP std in hole
sentence 3 : 105 RIH DP std
sentence 4 : 118 RIH std no of DP in hole
sentence 5 : 154 RIH DP std
Could you help me, please? Thank you.
You will need to use t-SNE to visualize the clusters - this article on visualizing and clustering US Laws using tf-idf can get you started.
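For a rough idea of what that can look like with your code, here is a minimal sketch (an assumption-laden example, not a drop-in solution: it recomputes the tf-idf matrix and KMeans labels at the top level, next to the sentences and nclusters variables from your __main__ block):
from sklearn.manifold import TSNE

# Recompute the tf-idf matrix and cluster labels at the top level for plotting.
tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                   stop_words=stopwords.words('english'),
                                   max_df=0.95, min_df=0.05, lowercase=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
labels = KMeans(n_clusters=nclusters).fit_predict(tfidf_matrix)

# t-SNE projects the high-dimensional tf-idf vectors down to 2D for a scatter plot.
coords = TSNE(n_components=2, random_state=0).fit_transform(tfidf_matrix.toarray())

plt.scatter(coords[:, 0], coords[:, 1], c=labels, cmap='tab20', s=15)
plt.title("t-SNE projection of tf-idf vectors, colored by KMeans cluster")
plt.show()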
I am trying to cluster text words.
Suppose I have a list of texts:
text=["WhatsApp extends 'confusing' update deadline",
"India begins world's biggest Covid vaccine drive",
"Nepali climbers make history with K2 winter summit"]
I applied TF-IDF to this data:
vec = TfidfVectorizer()
feat = vec.fit_transform(text)
After that, I applied KMeans:
kmeans = KMeans(n_clusters=num).fit(feat)
The thing I am confused about is how to get clusters of words, such as:
cluster 0
WhatsApp, update, biggest
cluster 1
history, biggest, world's
etc.
You can use the get_feature_names() method from the TfidfVectorizer class with the predictions from KMeans to inspect the words in each cluster.
Here's a minimal example with two clusters and the three sentences you provided:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
text = ["WhatsApp extends 'confusing' update deadline",
"India begins world's biggest Covid vaccine drive",
"Nepali climbers make history with K2 winter summit"]
vec = TfidfVectorizer()
feat = vec.fit_transform(text)
kmeans = KMeans(2).fit(feat)
pred = kmeans.predict(feat)
for i in range(2):
    print(f"Cluster #{i}:")
    words = []
    for sentence in np.array(text)[pred == i]:
        words += [fn for fn in vec.get_feature_names() if fn in sentence]
    print(words)
Result:
Cluster #0:
['confusing', 'deadline', 'extends', 'update', 'begins', 'biggest', 'drive', 'vaccine', 'world']
Cluster #1:
['climbers', 'history', 'make', 'summit', 'winter', 'with']
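One small caveat: in recent scikit-learn releases (1.0 and later), get_feature_names() has been replaced by get_feature_names_out(), so the loop would become roughly:
feature_names = vec.get_feature_names_out()
for i in range(2):
    print(f"Cluster #{i}:")
    words = []
    for sentence in np.array(text)[pred == i]:
        words += [fn for fn in feature_names if fn in sentence]
    print(words)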
I am using the biterm.cbtm library to train a topic model on about 2500 short posts.
When BTM finishes, I get the following 10 topics, along with the topic coherence value as shown in this picture: https://ibb.co/Kqy992H
I am trying to understand what those negative coherence values mean and why they are so low. I read a lot of related research and couldn't find one paper that explains the range of the coherence value. Also, most of the papers were about the LDA coherence value, as BTM is not well documented.
Does anyone know the range/meaning of the coherence value I am getting?
Why is the coherence between -76 and -111?
You can see my code below:
import json
import logging
import pickle
import re
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
from numpy import array
from sklearn.feature_extraction.text import CountVectorizer
from biterm.cbtm import oBTM
from biterm.utility import vec_to_biterms, topic_summuary  # helper functions
from gensim import corpora, models
from gensim.corpora.dictionary import Dictionary
from gensim.models import Phrases
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

warnings.filterwarnings('ignore')  # ignore all warnings that arise here to enhance clarity
def docs_preprocessor(docs):
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        # Strip URLs.
        docs[idx] = re.sub(r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*', '', docs[idx])
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        if len(docs[idx]) < 50:
            continue
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.
    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isdigit()] for doc in docs]
    # Remove short words (three characters or fewer).
    docs = [[token for token in doc if len(token) > 3] for doc in docs]
    # Lemmatize all words in documents.
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
    return docs
colnames = ['post']
with open('cleantext.p', 'rb') as handle:
    dict = pickle.load(handle)
dict['text'] = list(filter(None.__ne__, dict['text']))
print("Total posts: " + str(len(dict['text'])))
p_df = pd.DataFrame.from_dict(dict)#, skiprows = lambda x: logic(x))
docs = array(p_df['text'])
print("ALL DOCUMENTS: " + str(len(docs)))
docs = docs_preprocessor(docs)
outfile = open("posts.txt", "w+")
total_docs = 0
for sentence in docs:
    if len(sentence) < 3:
        continue
    else:
        total_docs += 1
        for word in sentence:
            result = ''.join([i for i in word if not i.isdigit()])
            outfile.write(result + " ")
        outfile.write("\n")
outfile.close()
print("Total docs: " + str(total_docs))
print("Reading sentences. . .")
texts = open('posts.txt', 'r').read().splitlines()
clear_text = ""
for item in texts:
    clear_text = clear_text + " " + item
vec = CountVectorizer(stop_words='english')
print("Building Vectors. . .")
X = vec.fit_transform(texts).toarray()
print("Building Vocabulary. . .")
vocab = np.array(vec.get_feature_names())
biterms = vec_to_biterms(X)
print("BTM modelling. . .")
btm = oBTM(num_topics=10, V=vocab)
print("\n\n Train Online BTM ..")
btm.fit(biterms, iterations=100)
topics = btm.transform(biterms)
print("\n\n Topic coherence ..")
topic_summuary(btm.phi_wz.T, X, vocab, 10)
#I am getting a weird error about pyLDAvis here. Why?
print("\n\n Visualize Topics ..")
vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
pyLDAvis.save_html(vis, 'btm.html')
I think we can refer to the interpretation of coherence for LDA, since the formula should be the same. :)
You may take a look at the interpretation here:
Negative Values: Evaluate Gensim LDA with Topic Coherence
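To get a feel for the scale, here is a small sketch (independent of BTM, and assuming BTM's metric follows the same UMass-style formula, as the linked post suggests) that computes u_mass coherence on a toy corpus with gensim's CoherenceModel. u_mass averages log conditional co-occurrence probabilities, so it is typically negative, and values closer to 0 generally indicate more coherent topics:
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

# Toy tokenized posts and two hand-picked "topics", purely to illustrate the scale.
texts = [["vaccine", "covid", "drive", "india"],
         ["whatsapp", "update", "deadline", "policy"],
         ["covid", "vaccine", "dose", "india"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

topics = [["covid", "vaccine", "india"],
          ["whatsapp", "update", "deadline"]]
cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')
print(cm.get_coherence())  # a negative number; closer to 0 generally means more coherent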
I have a dataset that contains a set of article papers. I merged the metadata and the JSON files and created a dataframe. Here is my code:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(merged_df['Title'][39100])
print(X.shape)
query = "How to prevent covid19"
query_vec = vectorize.transform([query])
result = cosine_similarity(X,query_vec).reshape((-1,))
for i in result.argsort()[-10:][::-1]:
    print(merged_df.iloc['Title'][i,0], "--", merged_df.iloc['Title'][i,1])
I want to compute the TF-IDF of the titles to handle the query, which should help me find some relevant papers.
Why does it say name "merged_df" is not defined?
Within your code, merged_df is nowhere defined. The dataframe is never created and is therefore undefined; you need to build it (by loading and merging your metadata and JSON files) before the vectorizer can use it.
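As a rough sketch of the fix (the file name metadata.csv and the exact loading step are hypothetical; substitute however you actually built your dataframe), define merged_df before fitting the vectorizer, and fit on the whole Title column rather than a single row:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

merged_df = pd.read_csv("metadata.csv")              # hypothetical source; build merged_df first
merged_df = merged_df[merged_df['Title'].notnull()]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(merged_df['Title'])     # fit on all titles

query = "How to prevent covid19"
query_vec = vectorizer.transform([query])            # note: vectorizer, not vectorize
result = cosine_similarity(X, query_vec).reshape(-1)

for i in result.argsort()[-10:][::-1]:               # ten most similar titles
    print(merged_df['Title'].iloc[i])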
I'm trying to create a text classifier to determine whether an abstract indicates an access-to-care research project. I am importing from a dataset that has two fields: Abstract and Accessclass. Abstract is a 500-word description of the project, and Accessclass is 0 for not access-related and 1 for access-related. I'm still in the development stage; however, when I looked at the unigrams and bigrams for the 0 and 1 labels, they were the same, despite the very distinctly different tones of the texts. Is there something I'm missing in my code? For example, am I accidentally double-adding the negatives or positives? Any help is appreciated.
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
df = pd.read_excel("accessclasses.xlsx")
df.head()
from io import StringIO
col = ['accessclass', 'abstract']
df = df[col]
df = df[pd.notnull(df['abstract'])]
df.columns = ['accessclass', 'abstract']
df['category_id'] = df['accessclass'].factorize()[0]
category_id_df = df[['accessclass', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'accessclass']].values)
df.head()
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=4, norm='l2', encoding='latin-1',
                        ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.abstract).toarray()
labels = df.category_id
print(features.shape)
from sklearn.feature_selection import chi2
import numpy as np
N = 2
for accessclass, category_id in sorted(category_to_id.items()):
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(accessclass))
    print(" . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print(" . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))
I think the problem in your code is setting min_df to a relatively large value like 4 on such a small dataset. According to the data you posted, the most common words are stopwords, which will be removed by the TfidfVectorizer. Here they are:
to : 19
and : 11
a : 6
the : 6
are : 6
of : 6
for : 5
is : 4
in : 4
will : 4
access : 4
I : 4
times : 4
healthcare : 3
more : 3
have : 3
with : 3
...
And these are just the unigrams; the bigram counts will be far lower.
You can solve that by either one of these two options:
Setting the stop_words argument to None, i.e. stop_words=None
Setting min_df to a value lower than 4, for example 1 or 2.
I recommend the second option, as the first will return stopwords among the correlated terms, which isn't helpful at all. I tried min_df=1 and here is the result:
. Most correlated unigrams:
. times
. access
. Most correlated bigrams:
. enjoyed watching
. wait times
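For reference, the only change to your snippet would be the min_df value, roughly:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1, norm='l2', encoding='latin-1',
                        ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.abstract).toarray()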
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from gensim import corpora, models
import gensim
import os
from os import path
from time import sleep
import matplotlib.pyplot as plt
import random
from wordcloud import WordCloud, STOPWORDS
tokenizer = RegexpTokenizer(r'\w+')
en_stop = set(get_stop_words('en'))
with open(os.path.join('c:\users\kaila\jobdescription.txt')) as f:
    Reader = f.read()
Reader = Reader.replace("will", " ")
Reader = Reader.replace("please", " ")
texts = unicode(Reader, errors='replace')
tdm = []
raw = texts.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [i for i in tokens if not i in en_stop]
tdm.append(stopped_tokens)
dictionary = corpora.Dictionary(tdm)
corpus = [dictionary.doc2bow(i) for i in tdm]
sleep(3)
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=8, id2word = dictionary)
topics = ldamodel.print_topics(num_topics=8, num_words=200)
for i in topics:
    print(i)
    wordcloud = WordCloud().generate(i)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
The issue is with the word cloud: I cannot get a word cloud for each of the 8 topics. I want an output that gives 8 word clouds, one for each of the 8 topics.
If anyone can help me with this issue, it would be great.
Assuming you have trained a gensim LDA model, you can simply create a word cloud for each topic with the following code:
# lda is assumed to be the variable holding the LdaModel object
import matplotlib.pyplot as plt

for t in range(lda.num_topics):
    plt.figure()
    # note: newer wordcloud releases expect a dict, i.e. fit_words(dict(lda.show_topic(t, 200)))
    plt.imshow(WordCloud().fit_words(lda.show_topic(t, 200)))
    plt.axis("off")
    plt.title("Topic #" + str(t))
    plt.show()
I will highlight a few mistakes in your code so you can better follow what I have written above.
WordCloud().generate(something) expects something to be raw text. It will tokenize it, lowercase it and remove stop words, and then compute the word cloud. You need the word sizes to match their probabilities in a topic (I assume).
lda.print_topics(8, 200) returns a textual representation of the topics, as in prob1*"token1" + prob2*"token2" + ...; you need lda.show_topic(topic, num_words) to get the words with their corresponding probabilities as tuples. Then you need WordCloud().fit_words() to generate the word cloud.
The following code is your code with the above visualization. I would also like to point out that you are inferring topics from a single document, which is very uncommon and probably not what you wanted.
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from gensim import corpora, models
import gensim
import os
from os import path
from time import sleep
import matplotlib.pyplot as plt
import random
from wordcloud import WordCloud, STOPWORDS
tokenizer = RegexpTokenizer(r'\w+')
en_stop = set(get_stop_words('en'))
with open(os.path.join('c:\users\kaila\jobdescription.txt')) as f:
    Reader = f.read()
Reader = Reader.replace("will", " ")
Reader = Reader.replace("please", " ")
texts = unicode(Reader, errors='replace')
tdm = []
raw = texts.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [i for i in tokens if not i in en_stop]
tdm.append(stopped_tokens)
dictionary = corpora.Dictionary(tdm)
corpus = [dictionary.doc2bow(i) for i in tdm]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=8, id2word = dictionary)
for t in range(ldamodel.num_topics):
    plt.figure()
    plt.imshow(WordCloud().fit_words(ldamodel.show_topic(t, 200)))
    plt.axis("off")
    plt.title("Topic #" + str(t))
    plt.show()
Although it is from a different library, you can see topic visualizations with corresponding code for what the result will be (Disclaimer: I am one of the authors of that library).
The following worked for me:
First, create an LDA model and define the clusters/topics as discussed in Topic Clustering - make sure the minimum_probability is 0.
Next, determine the LDA corpus using lda_corpus = lda[corpus]
Now identify the documents from the data belonging to each topic as a list; the example below has two topics. df is my raw data, which has a column texts.
cluster1 = [j for i,j in zip(lda_corpus,df.texts) if i[0][1] > .2]
cluster2 = [j for i,j in zip(lda_corpus,df.texts) if i[1][1] > .2]
Obtain the word cloud for each cluster. You can include as many stop words as you want. Make sure to clean the data in each cluster (remove stopwords, apply stemming, etc.); I am skipping those steps here, so each cluster is assumed to contain cleaned texts/documents.
wordcloud = WordCloud(relative_scaling=1.0, stopwords={"xxx", "yyy"}).generate(' '.join(cluster1))
Finally, plot the word cloud using matplotlib:
plt.imshow(wordcloud)
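Putting those steps together, a short sketch for both clusters (using the cluster1 and cluster2 lists built above, with placeholder stop words) might look like this:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

for idx, cluster in enumerate([cluster1, cluster2]):
    # "xxx"/"yyy" are placeholders; add whatever terms you want to drop.
    wc = WordCloud(relative_scaling=1.0, stopwords={"xxx", "yyy"}).generate(' '.join(cluster))
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.title("Cluster #" + str(idx))
plt.show()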