cosine-similarity between consecutive pairs using whole articles in JSON file - python

I would like to calculate the cosine similarity for the consecutive pairs of articles in a JSON file. So far I manage to do it but.... I just realize that when transforming the tfidf of each article I am not using the terms from all articles available in the file but only those from each pair. Here is the code that I am using which provides the cosine-similarity coefficient of each consecutive pair of articles.
import json
import nltk
with open('SDM_2015.json') as f:
data = [json.loads(line) for line in f]
## Loading the packages needed:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
## Defining our functions to filter the data
# Short for stemming each word (common root)
stemmer = nltk.stem.porter.PorterStemmer()
# Short for removing puctuations etc
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
## First function that creates the tokens
def stem_tokens(tokens):
return [stemmer.stem(item) for item in tokens]
## Function that incorporating the first function, converts all words into lower letters and removes puctuations maps (previously specified)
def normalize(text):
return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
## Lastly, a super function is created that contains all the previous ones plus stopwords removal
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
## Calculation one by one of the cosine similatrity
def foo(x, y):
tfidf = vectorizer.fit_transform([x, y])
return ((tfidf * tfidf.T).A)[0,1]
my_funcs = {}
for i in range(len(data) - 1):
x = data[i]['body']
y = data[i+1]['body']
foo.func_name = "cosine_sim%d" % i
my_funcs["cosine_sim%d" % i] = foo
print(foo(x,y))
Any idea of how to develop the cosine-similarity using the whole terms of all articles available in the JSON file rather than only those of each pair?
Kind regards,
Andres

I think, based on our discussion above, you need to change the foo function and everything below. See the code below. Note that I haven't actually run this, since I don't have your data and no sample lines are provided.
## Loading the packages needed:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
import json
from sklearn.metrics.pairwise import cosine_similarity
with open('SDM_2015.json') as f:
data = [json.loads(line) for line in f]
## Defining our functions to filter the data
# Short for stemming each word (common root)
stemmer = nltk.stem.porter.PorterStemmer()
# Short for removing puctuations etc
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
## First function that creates the tokens
def stem_tokens(tokens):
return [stemmer.stem(item) for item in tokens]
## Function that incorporating the first function, converts all words into lower letters and removes puctuations maps (previously specified)
def normalize(text):
return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
## tfidf
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
tfidf_data = vectorizer.fit_transform(data)
#cosine dists
similarity matrix = cosine_similarity(tfidf_data)

Related

TFIDFVectorizer making concatenated word tokens

I am using the Cranfield Dataset to make an Indexer and Query Processor. For that purpose I am using TFIDFVectorizer to tokenize the data. But after using TFIDFVectorizer when I check the vocabulary,there were lot of tokens formed using a concatenation of two words.
I am using the following code to achieve it:
import re
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
#reading the data
with open('cran.all', 'r') as f:
content_string=""
content = [line.replace('\n','') for line in f]
content = content_string.join(content)
doc=re.split('.I\s[0-9]{1,4}',content)
f.close()
#some data cleaning
doc = [line.replace('.T',' ').replace('.B',' ').replace('.A',' ').replace('.W',' ') for line in doc]
del doc[0]
doc= [ re.sub('[^A-Za-z]+', ' ', lines) for lines in doc]
vectorizer = TfidfVectorizer(analyzer ='word', ngram_range=(1,1), stop_words=text.ENGLISH_STOP_WORDS,lowercase=True)
X = vectorizer.fit_transform(doc)
print(vectorizer.vocabulary_)
I have attached below a few examples I obtain when I print vocabulary:
'freevibration': 7222, 'slendersharp': 15197, 'frequentlyapproximated': 7249, 'notapplicable': 11347, 'rateof': 13727, 'itsvalue': 9443, 'speedflow': 15516, 'movingwith': 11001, 'speedsolution': 15531, 'centerof': 3314, 'hypersoniclow': 8230, 'neice': 11145, 'rutkowski': 14444, 'chann': 3381, 'layerapproximations': 9828, 'probsteinhave': 13353, 'thishypersonic': 17752
When I use with small data, it does not happen. How to prevent this from happening?
This happens because there are two words are commonly used together .It seems that the concatenated words are resulting from the n-gram generation in the TfidfVectorizer. When you set ngram_range=(1,1), the vectorizer only considers single words. However, when you increase the ngram_range, the vectorizer considers n-grams of words
You can use regular expression to avoid this.
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words=text.ENGLISH_STOP_WORDS, lowercase=True)
X = vectorizer.fit_transform(doc)
# Remove n-grams that have two words concatenated
pattern = r'\b\w+\w\b'
vectorizer.vocabulary_ = {key: val for key, val in vectorizer.vocabulary_.items() if re.match(pattern, key)}
pattern \b\w+\w\b matches n-grams that have two words concatenated, such as freevibration The resulting vocabulary_ dictionary will not contain these n-grams
My guess would be that the issue is caused by this line:
content = [line.replace('\n','') for line in f]
When replacing line breaks, the last word of line 1 is concatenated with the first word of line 2. And of course this happens for every line, so you get a lot of these. The solution is super simple: instead of replacing line break with nothing (i.e. just removing them), replace them with a whitespace:
content = [line.replace('\n',' ') for line in f]
---
(note the space between '')

Assign Topic from NNMF Topic Modelling

I have a list of text comments that are fed into a non-negative matrix factorization topic modelling program.
import pandas as pd
import numpy as np
# load the data
import csv
with open('C:\\...\\comments.csv', newline='') as f:
reader = csv.reader(f)
next(reader) # skip header
df = [tuple(row) for row in reader]
# set the number of topics
total_topics = 3
# process the data
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords
data_text = pd.DataFrame(df,columns=['text'])
# remove stopwords and tokenize the text
custom_stops = ["stopword1", "stopword2", "stopword3"]
data_text['filtered_text'] = data_text['text'].apply(lambda x: remove_stopwords(x.lower()))
data_text['filtered_text'] = data_text['filtered_text'].apply(lambda x: str.split(x))
data_text['filtered_text'] = data_text['filtered_text'].apply(lambda x: [item for item in x if item.lower() not in custom_stops])
CORPUS = pd.DataFrame(data_text['filtered_text'])
# Remove empty strings
CORPUS.dropna(inplace=True)
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
# lemmatize the text
for index,entry in enumerate(CORPUS['filtered_text']):
# Declaring Empty List to store the words that follow the rules for this step
Final_words = []
# Initializing WordNetLemmatizer()
word_Lemmatized = WordNetLemmatizer()
# pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
for word, tag in pos_tag(entry):
# Below condition is to check for Stop words and consider only alphabets
if word not in stopwords.words('english') and word.isalpha():
word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
Final_words.append(word_Final)
# The final processed set of words for each iteration will be stored in 'text_final'
CORPUS.loc[index,'text_final'] = str(Final_words)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def build_feature_matrix(documents, feature_type='frequency'):
feature_type = feature_type.lower().strip()
if feature_type == 'binary':
vectorizer = CountVectorizer(binary=True, min_df=1,ngram_range=(1, 1))
elif feature_type == 'frequency':
vectorizer = CountVectorizer(binary=False, min_df=1,ngram_range=(1, 1))
elif feature_type == 'tfidf':
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1))
else:
raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")
feature_matrix = vectorizer.fit_transform(documents).astype(float)
return vectorizer, feature_matrix
# create a feature matrix
vectorizer, tfidf_matrix = build_feature_matrix(CORPUS['text_final'], feature_type='tfidf')
td_matrix = tfidf_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)
from sklearn.decomposition import NMF
nmf = NMF(n_components=total_topics, random_state=42, alpha=.1, l1_ratio=.5)
nmf.fit(tfidf_matrix)
def get_topics_terms_weights(weights, feature_names):
feature_names = np.array(feature_names)
sorted_indices = np.array([list(row[::-1])
for row
in np.argsort(np.abs(weights))])
sorted_weights = np.array([list(wt[index])
for wt, index
in zip(weights,sorted_indices)])
sorted_terms = np.array([list(feature_names[row])
for row
in sorted_indices])
topics = [np.vstack((terms.T,
term_weights.T)).T
for terms, term_weights
in zip(sorted_terms, sorted_weights)]
return topics
def print_topics_udf(topics, total_topics=1,
weight_threshold=0.0001,
display_weights=False,
num_terms=None):
for index in range(total_topics):
topic = topics[index]
topic = [(term, float(wt))
for term, wt in topic]
topic = [(word, round(wt,2))
for word, wt in topic
if abs(wt) >= weight_threshold]
if display_weights:
print( 'Topic #' +str(index+1)+' with weights')
print (topic[:num_terms] if num_terms else topic)
else:
print ('Topic #'+str(index+1)+' without weights')
tw = [term for term, wt in topic]
print (tw[:num_terms] if num_terms else tw)
print()
feature_names = vectorizer.get_feature_names()
weights = nmf.components_
topics = get_topics_terms_weights(weights, feature_names)
# print topics and weights
# print_topics_udf(topics=topics,total_topics=total_topics,num_terms=None,display_weights=False)
# print topics with weights
# print_topics_udf(topics=topics,total_topics=total_topics,num_terms=None,display_weights=True)
# display the topics
# this takes the top term from each group and assigns it as the topic theme
for index in range(0,total_topics):
print("Topic",index+1,"=",topics[index][0][0])
The example output may be something like:
Topic 1 = problem
Topic 2 = software
Topic 3 = recommendation
How can I assign a specific comment from the file a specific topic? e.g., the comment "My computer has an issue of turning off intermittently" would be mapped to Topic 1 "problem"
The answer is to transform the document term matrix to pull out the factorized document topic matrix:
W = nmf.fit_transform(tfidf_matrix)
where the tfidf matrix = W x H, where W is the document-topic matrix and H is the topic-term matrix. Slide 25 of the link gives a good visualization of this technique:
http://derekgreene.com/slides/topic-modelling-with-scikitlearn.pdf
Thus, the highest value in W for the respective comment row correlates the assigned topic. I iterated across the rows to assign this topics via
data_text['topic'] = ""
for row in range(len(data_text['text'])):
data_text['topic'][row] = topics[np.argmax(W[row])][0][0]
To extend the example in the question, if the [1] index row value of data_text['text'][1] is "My computer has an issue of turning off intermittently" the W[1][0][0] matrix array may be [0.5412, 0.0201, 0.0]. Since the highest value is in the first column, this sentence should be mapped to the first topic (i.e., 'problem' topic). The text assignment of this topic is assigned to data_text['topic'][1] value via topics[np.argmax(W[row])][0][0]

Tokenization and lemmatization for TF-IDF use for bunch of txt files using NLTK library

Doing the text analysis of italian text (tokenization, lemmalization) for future use of TF-IDF technics and constructing clusters based on that. For preprocessing I use NLTK and for one text file everything is working fine:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
it_stop_words = nltk.corpus.stopwords.words('italian')
lmtzr = WordNetLemmatizer()
with open('3003.txt', 'r' , encoding="latin-1") as myfile:
data=myfile.read()
word_tokenized_list = nltk.tokenize.word_tokenize(data)
word_tokenized_no_punct = [str.lower(x) for x in word_tokenized_list if x not in string.punctuation]
word_tokenized_no_punct_no_sw = [x for x in word_tokenized_no_punct if x not in it_stop_words]
word_tokenized_no_punct_no_sw_no_apostrophe = [x.split("'") for x in word_tokenized_no_punct_no_sw]
word_tokenized_no_punct_no_sw_no_apostrophe = [y for x in word_tokenized_no_punct_no_sw_no_apostrophe for y in x]
word_tokenize_list_no_punct_lc_no_stowords_lemmatized = [lmtzr.lemmatize(x) for x in word_tokenized_no_punct_no_sw_no_apostrophe]
But the question is that I need to perform the following to bunch of .txt files in the folder. For that I'm trying to use possibilities of PlaintextCorpusReader():
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
corpusdir = 'reports/'
newcorpus = PlaintextCorpusReader(corpusdir, '.txt')
Basically I can not just apply newcorpus into the previous functions because it's an object and not a string. So my questions are:
How should the functions look like (or how should I change the existing ones for a distinct file) for doing tokenization and lemmatization for a corpus of files (using PlaintextCorpusReader())
How would the TF-IDF approach (standard sklearn approach of vectorizer = TfidfVectorizer() will look like in PlaintextCorpusReader()
Many Thanks!
I think your question can be answered by reading: this question, this another one and [TfidfVectorizer docs][3]. For completeness, I wrapped the answers below:
First, you want to get the files ids, by the first question you can get them as follows:
ids = newcorpus.fileids()
Then, based on the second quetion you can retrieve documents' words, sentences or paragraphs:
doc_words = []
doc_sents = []
doc_paras = []
for id_ in ids:
# Get words
doc_words.append(newcorpus.words(id_))
# Get sentences
doc_sents.append(newcorpus.sents(id_))
# Get paragraph
doc_paras.append(newcorpus.paras(id_))
Now, on the ith position of doc_words, doc_sents and doc_paras you have all words, sentences and paragraphs (respectively) for every document in the corpus.
For tf-idf you probably just want the words. Since TfidfVectorizer.fit's method gets an iterable which yields str, unicode or file objects, you need to either transform your documents (array of tokenized words) into a single string, or use a similar approach to this one. The latter solution uses a dummy tokenizer to deal directly with arrays of words.
You can also pass your own tokenizer to TfidVectorizer and use PlaintextCorpusReader simply for file reading.

UnpicklingError was unhandled by user code : invalid load key,'%'

I have referred the website https://radimrehurek.com/gensim/tut2.html. I have come across the error UnpicklingError was unhandled by user code : invalid load key,'%'. How do I clear that error? I had referred the other queries and included the klepto package but still that error persists. I am using anacoanda2. This is the code:-
import logging
import xml.etree.cElementTree
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
import os
import klepto
from gensim import corpora
documents = ["Human machine interface for lab abc computer applications",
"A survey of user opinion of computer system response time",
"The EPS user interface management system",
"System and human system engineering testing of EPS",
"Relation of user perceived response time to error measurement",
"The generation of random binary unordered trees",
"The intersection graph of paths in trees",
"Graph minors IV Widths of trees and well quasi ordering",
"Graph minors A survey"]
# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
for document in documents]
# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
for token in text:
frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1] for text in texts]
from pprint import pprint # pretty-printer
pprint(texts)
dictionary = corpora.Dictionary(texts)
dictionary.save_as_text('/tmp/deerwester.dict') # store the dictionary, for future reference
print(dictionary)
print(dictionary.token2id)
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec) # the word "interaction" does not appear in the dictionary and is ignored
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.dict', corpus) # store to disk, for later use
for c in corpus:
print(c)
class MyCorpus(object):
def __iter__(self):
for line in open('/datasets/mycorpus.txt'):
# assume there's one document per line, tokens separated by whitespace
yield dictionary.doc2bow(line.lower().split())
corpus_memory_friendly = MyCorpus() # doesn't load the corpus into memory!
print(corpus_memory_friendly)
for vector in corpus_memory_friendly: # load one vector into memory at a time
print(vector)
from six import iteritems
# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('/datasets/mycorpus.txt'))
# remove stop words and words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
# remove stop words and words that appear only once
dictionary.filter_tokens(stop_ids + once_ids)
# remove gaps in id sequence after words that were removed
dictionary.compactify()
print(dictionary)
# create a toy corpus of 2 documents, as a plain Python list
corpus = [[(1, 0.5)], []] # make one document empty, for the heck of it
corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)
corpus = corpora.MmCorpus('/tmp/corpus.mm')
print(corpus)
# one way of printing a corpus: load it entirely into memory
print(list(corpus)) # calling list() will convert any sequence to a plain Python list
# another way of doing it: print one document at a time, making use of the streaming interface
for doc in corpus:
print(doc)
corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
import gensim
import numpy as np
numpy_matrix = np.random.randint(10, size=[5,2])
corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
numpy_matrix_dense = gensim.matutils.corpus2dense(corpus, num_terms=10)
import scipy.sparse
scipy_sparse_matrix = scipy.sparse.random(5,2)
corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)
from gensim import corpora, models, similarities
if (os.path.exists("/tmp/deerwester.dict")):
dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
corpus = corpora.MmCorpus('/tmp/deerwester.mm')
print("Used files generated from first tutorial")
else:
print("Please run first tutorial to generate data set")
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow]) # step 2 -- use the model to transform vectors
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
print(doc)
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
lsi.print_topics(2)
for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
print(doc)
lsi.save('/tmp/model.lsi') # same for tfidf, lda, ...
lsi = models.LsiModel.load('/tmp/model.lsi')
model = models.TfidfModel(corpus, normalize=True)
model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
model.add_documents(another_tfidf_corpus) # now LSI has been trained on tfidf_corpus + another_tfidf_corpus
lsi_vec = model[tfidf_vec] # convert some new document into the LSI space, without affecting the model
model.add_documents(more_documents) # tfidf_corpus + another_tfidf_corpus + more_documents
lsi_vec = model[tfidf_vec]
model = models.RpModel(tfidf_corpus, num_topics=500)
model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)
model = models.HdpModel(corpus, id2word=dictionary)

Spam filter using Python

I`m trying to make a simple spam filter using python 2.7 and scikit-learn. So, I have a set of letters for train and a set of letters for test. Firstly, I want to vectorize training set and fit logistic regression using it, then vectorize each letter in test set and put them into classifier separately.
import codecs
import json
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
def classify(mail, vectorizer, logreg):
vect_mail = vectorizer.transform(mail)
res = logreg.predict(vect_mail)
return res
def make_output(test_dir, vectorizer, logreg):
with codecs.open('test.txt', 'w', 'utf-8') as out:
for f in os.listdir(test_dir):
mail = json.load(open(os.path.join(test_dir, f)), 'utf-8')
result = classify(mail['body'].encode('ascii','ignore'), vectorizer, logreg)
out.write(u'%s\t%s\n' % (f, result))
def read_train(train_dir):
for f in os.listdir(train_dir):
with open(os.path.join(train_dir, f), 'r') as fo:
mail = json.load(fo, 'utf-8')
yield mail
if __name__ == '__main__':
train_mails = list(read_train('spam_data/train'))
corpus = list()
is_spam = list()
for mail in train_mails:
corpus.append(mail['body'].encode('ascii','ignore'))
is_spam.append(mail['is_spam'])
vectorizer = CountVectorizer()
cnt_vect = vectorizer.fit_transform(corpus)
logreg = linear_model.LogisticRegression()
logreg.fit(cnt_vect, is_spam)
make_output('spam_data/test', vectorizer, logreg)
But res = logreg.predict(vect_mail) returns a list, not one meaning. So, I guess, predictor interprets vect_mail like sample of documents of one word, not like a document with many words. How should I rewrite this code?
According to the sklearn's documentation, CountVectorizer.transform accepts not a single document to transform, but an iterable of documents. Since a string in Python is an iterable of its characters, transform generates as many "documents" as there are characters in the string.
In order to fix this issue, pass a single-element list to the transform:
vect_mail = vectorizer.transform([mail])

Categories

Resources