I'm running the following Python script on a large dataset (around 100,000 items). Currently the execution is unacceptably slow; it would probably take at least a month to finish (no exaggeration). Obviously I would like it to run faster.
I've added a comment below to highlight where I think the bottleneck is. I have written my own database functions, which are imported.
Any help is appreciated!
# -*- coding: utf-8 -*-
import database
from gensim import corpora, models, similarities, matutils
from gensim.models.ldamulticore import LdaMulticore
import pandas as pd
from sklearn import preprocessing
def getTopFiveSimilarAuthors(author, authors, ldamodel, dictionary):
    vec_bow = dictionary.doc2bow([researcher['full_proposal_text']])
    vec_lda = ldamodel[vec_bow]
    # normalization
    try:
        vec_lda = preprocessing.normalize(vec_lda)
    except:
        pass
    similar_authors = []
    for index, other_author in authors.iterrows():
        if(other_author['id'] != author['id']):
            other_vec_bow = dictionary.doc2bow([other_author['full_proposal_text']])
            other_vec_lda = ldamodel[other_vec_bow]
            # normalization
            try:
                other_vec_lda = preprocessing.normalize(vec_lda)
            except:
                pass
            sim = matutils.cossim(vec_lda, other_vec_lda)
            similar_authors.append({'id': other_author['id'], 'cosim': sim})
    similar_authors = sorted(similar_authors, key=lambda k: k['cosim'], reverse=True)
    return similar_authors[:5]

def get_top_five_similar(author, authors, ldamodel, dictionary):
    top_five_similar_authors = getTopFiveSimilarAuthors(author, authors, ldamodel, dictionary)
    database.insert_top_five_similar_authors(author['id'], top_five_similar_authors, cursor)

connection = database.connect()

authors = []
authors = pd.read_sql("SELECT id, full_text FROM author WHERE full_text IS NOT NULL;", connection)

# create the dictionary
dictionary = corpora.Dictionary([authors["full_text"].tolist()])

# create the corpus/ldamodel
author_text = []
for text in author_text['full_text'].tolist():
    word_list = []
    for word in text:
        word_list.append(word)
    author_text.append(word_list)

corpus = [dictionary.doc2bow(text) for text in author_text]
ldamodel = LdaMulticore(corpus, num_topics=50, id2word=dictionary, workers=30)

# BOTTLENECK: the script hangs after this point.
authors.apply(lambda x: get_top_five_similar(x, authors, ldamodel, dictionary), axis=1)
I noticed these problems in your code, but I'm not sure they are the reason for the slow execution.
This loop here is useless; it will never run:
for text in author_text['full_text'].tolist():
    word_list = []
    for word in text:
        word_list.append(word)
    author_text.append(word_list)
Also, there is no need to loop over the words of a text: it is enough to call split() on it and you get a list of words, built by looping over the authors.
Try to write it like this.
First:
all_authors_text = []
for _, author in authors.iterrows():
    all_authors_text.append(author['full_text'].split())
And after that, create the dictionary:
dictionary = corpora.Dictionary(all_authors_text)
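With the texts tokenized this way, the corpus and the LDA model can then be built from the same list, mirroring the two calls you already have in your script:

corpus = [dictionary.doc2bow(text) for text in all_authors_text]
ldamodel = LdaMulticore(corpus, num_topics=50, id2word=dictionary, workers=30)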
I'm trying to rank the task sentences of an occupation using BM25. I'm following this tutorial, but I get confused at this part: "Ranking of documents: Now that we've created our document indexes, we can give it queries and see which documents are the most relevant." I want the queries to be every sentence that I have in my corpus column. How can I do that?
!pip install rank_bm25
import pandas as pd
from rank_bm25 import BM25Okapi
import string
corpus = pd.read_excel(r'/content/Job-occ.xlsx')
tokenized_corpus = [doc.split(" ") for doc in corpus['task']]
tokenized_corpus = []
for doc in corpus['task']:
    print(doc)
    doc_tokens = doc.split()
    tokenized_corpus.append(doc_tokens)
bm25 = BM25Okapi(tokenized_corpus)
Here is my data.
Basically you just need to iterate over your list of documents, for example like this:
import pandas as pd
from rank_bm25 import BM25Okapi
import string
def argsort(seq, reverse):
    # http://stackoverflow.com/questions/3071415/efficient-method-to-calculate-the-rank-vector-of-a-list-in-python
    return sorted(range(len(seq)), key=seq.__getitem__, reverse=reverse)

corpus = pd.read_excel(r'Job-occ.xlsx')
tokenized_corpus = [doc.split(" ") for doc in corpus['task']]
bm25 = BM25Okapi(tokenized_corpus)

for doc_as_query in tokenized_corpus:
    scores = bm25.get_scores(doc_as_query)
    # show top 3 (optional):
    print('\nQuery:', ' '.join(doc_as_query))
    top_similar = argsort(scores, True)
    for i in range(3):
        print(' top', i+1, ':', corpus['task'][top_similar[i]])
I added sorting and printing the top 3 similar documents, in case you want this (that's why I added the function argsort).
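One thing to keep in mind: each query sentence is itself part of the indexed corpus, so its best match will usually be itself. If you want to skip that, you can filter out the query's own index, for example (a small variation reusing the same argsort helper):

for query_idx, doc_as_query in enumerate(tokenized_corpus):
    scores = bm25.get_scores(doc_as_query)
    # drop the query's own position before taking the top 3
    ranked = [i for i in argsort(scores, True) if i != query_idx]
    print('\nQuery:', ' '.join(doc_as_query))
    for rank, i in enumerate(ranked[:3], start=1):
        print(' top', rank, ':', corpus['task'][i])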
I have two .txt files: one that contains 200,000 words and a second that contains 100 keywords (one per line). I want to calculate the cosine similarity between each of the 100 keywords and each of my 200,000 words, and display for every keyword the 50 words with the highest score.
Here's what I did; note that BertClient is what I'm using to extract vectors:
from sklearn.metrics.pairwise import cosine_similarity
from bert_serving.client import BertClient
bc = BertClient()
# Process words
with open("./words.txt", "r", encoding='utf8') as textfile:
words = textfile.read().split()
with open("./100_keywords.txt", "r", encoding='utf8') as keyword_file:
for keyword in keyword_file:
vector_key = bc.encode([keyword])
for w in words:
vector_word = bc.encode([w])
cosine_lib = cosine_similarity(vector_key,vector_word)
print (cosine_lib)
This keeps running and never stops. Any idea how I can correct this?
I know nothing of Bert...but there's something fishy with the import and run. I don't think you have it installed correctly or something. I tried to pip install it and just run this:
from sklearn.metrics.pairwise import cosine_similarity
from bert_serving.client import BertClient
bc = BertClient()
print ('done importing')
and it never finished. Take a look at the docs for bert and see if something else needs to be done.
Regarding your code, it is generally better to do ALL of the reading first, then the processing, so import both lists first, separately, and check a few values with something like:
# check first five
print(words[:5])
Also, you need to look at a different way to do your comparisons instead of the nested loops. You realize now that you are converting each word in words EVERY TIME for each keyword, which is not necessary and probably really slow. I would recommend you either use a dictionary to pair the word with the encoding or make a list of tuples with the (word, encoding) in it if you are more comfortable with that.
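For example, if bert_serving is working, you could encode each list once up front and compare all pairs in a single call. This is only a rough sketch: it assumes bc.encode() accepts a list of strings and returns one vector per string, and it ignores the memory cost of encoding 200,000 words in one go (you may need to batch that).

from sklearn.metrics.pairwise import cosine_similarity
from bert_serving.client import BertClient

bc = BertClient()

with open("./words.txt", "r", encoding='utf8') as textfile:
    words = textfile.read().split()
with open("./100_keywords.txt", "r", encoding='utf8') as keyword_file:
    keywords = keyword_file.read().splitlines()

word_vecs = bc.encode(words)          # one row per word
keyword_vecs = bc.encode(keywords)    # one row per keyword

# rows = keywords, columns = words
sims = cosine_similarity(keyword_vecs, word_vecs)

# top 50 words for each keyword
for kw, row in zip(keywords, sims):
    top = sorted(range(len(words)), key=row.__getitem__, reverse=True)[:50]
    print(kw, [words[i] for i in top])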
Comment me back if that doesn't make sense after you get Bert up and running.
--Edit--
Here is a chunk of code that works similarly to what you want to do. There are a lot of options for how you can hold results, etc. depending on your needs, but this should get you started with "fake bert".
from operator import itemgetter

# fake bert ... just return something like length
def bert(word):
    return len(word)

# a fake compare function that will compare "bert" conversions
def bert_compare(x, y):
    return abs(x - y)

# Process words
with open("./word_data_file.txt", "r", encoding='utf8') as textfile:
    words = textfile.read().split()

# Process keywords
with open("./keywords.txt", "r", encoding='utf8') as keyword_file:
    keywords = keyword_file.read().split()

# encode the words and put result in dictionary
encoded_words = {}
for word in words:
    encoded_words[word] = bert(word)

encoded_keywords = {}
for word in keywords:
    encoded_keywords[word] = bert(word)

# let's use our bert conversions to find which keyword is most similar in
# length to the word
for word in encoded_words.keys():
    result = []  # make a new result set for each pass
    for kword in encoded_keywords.keys():
        similarity = bert_compare(encoded_words.get(word), encoded_keywords.get(kword))
        # stuff the answer into a tuple that can be sorted
        result.append((word, kword, similarity))
    result.sort(key=itemgetter(2))
    print(f'the keyword with the closest size to {result[0][0]} is {result[0][1]}')
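Once the real BertClient is up, the two fake helpers above could be swapped for something along these lines (untested sketch; bert_compare still returns a distance, so the ascending sort keeps picking the closest match):

def bert(word):
    return bc.encode([word])[0]  # one embedding vector per word

def bert_compare(x, y):
    # cosine distance: smaller still means "more similar"
    return 1 - cosine_similarity([x], [y])[0][0]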
I am trying to build a Phrases model over a big corpus, but I keep running into a memory error.
First I tried to feed my entire corpus in as one big generator.
Then I tried to save the model after each document:
import codecs
import gensim
import os
import random
import string
import sys
def gencorp(file_path):
    with codecs.open(file_path, 'rb', encoding="utf8") as doc:
        for sentence in doc:
            yield sentence.split()

out_corpus_dir = "C:/Users/Administrator/Desktop/word2vec/1billionwords_corpus_preprocessed/"

file_nb = 0
bi_detector = gensim.models.Phrases()
for file in os.listdir(out_corpus_dir):
    file_nb += 1
    file_path = out_corpus_dir + file
    bi_detector.add_vocab(gencorp(file_path))
    bi_detector.save("generic_EN_bigrams_v%i" % (file_nb/10))
    bi_detector = gensim.models.Phrases.load("generic_EN_bigrams_v%i" % (file_nb/10))

bi_detector.save("generic_EN_bigrams")
But none of these solutions work. However, generic_EN_bigrams_v0 is generated and saved.
So I am wondering if I can train a Phrases model per document and then find a way to merge them after.
Thank you for any insight :)
According to gensim's documentation, adding sentences should simply work, and you shouldn't have a memory problem since it only updates the statistics. Therefore, this minor modification to your code should make it work, i.e. you don't need to recreate the bi_detector object.
import codecs
import gensim
import os
import random
import string
import sys
def gencorp(file_path):
    with codecs.open(file_path, 'rb', encoding="utf8") as doc:
        for sentence in doc:
            yield sentence.split()

out_corpus_dir = "C:/Users/Administrator/Desktop/word2vec/1billionwords_corpus_preprocessed/"

file_nb = 0
bi_detector = gensim.models.Phrases()
for file in os.listdir(out_corpus_dir):
    file_nb += 1
    file_path = out_corpus_dir + file
    bi_detector.add_vocab(gencorp(file_path))
    # The following two lines are not required.
    # bi_detector.save("generic_EN_bigrams_v%i" % (file_nb/10))
    # bi_detector = gensim.models.Phrases.load("generic_EN_bigrams_v%i" % (file_nb/10))

bi_detector.save("generic_EN_bigrams")
I would like to calculate the cosine similarity for the consecutive pairs of articles in a JSON file. So far I have managed to do it, but I just realized that when transforming each article to tf-idf I am not using the terms from all articles available in the file, only those from each pair. Here is the code I am using, which provides the cosine-similarity coefficient of each consecutive pair of articles.
import json
import nltk

with open('SDM_2015.json') as f:
    data = [json.loads(line) for line in f]

## Loading the packages needed:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer

## Defining our functions to filter the data
# Short for stemming each word (common root)
stemmer = nltk.stem.porter.PorterStemmer()
# Short for removing punctuation etc.
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

## First function that creates the tokens
def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

## Function that, incorporating the first function, converts all words to lower case and removes the punctuation map (previously specified)
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

## Lastly, a super function is created that contains all the previous ones plus stopword removal
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

## Calculation one by one of the cosine similarity
def foo(x, y):
    tfidf = vectorizer.fit_transform([x, y])
    return ((tfidf * tfidf.T).A)[0, 1]

my_funcs = {}
for i in range(len(data) - 1):
    x = data[i]['body']
    y = data[i+1]['body']
    foo.func_name = "cosine_sim%d" % i
    my_funcs["cosine_sim%d" % i] = foo
    print(foo(x, y))
Any idea how to compute the cosine similarity using the terms of all articles available in the JSON file rather than only those of each pair?
Kind regards,
Andres
I think, based on our discussion above, you need to change the foo function and everything below. See the code below. Note that I haven't actually run this, since I don't have your data and no sample lines are provided.
## Loading the packages needed:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
import json
from sklearn.metrics.pairwise import cosine_similarity

with open('SDM_2015.json') as f:
    data = [json.loads(line) for line in f]

## Defining our functions to filter the data
# Short for stemming each word (common root)
stemmer = nltk.stem.porter.PorterStemmer()
# Short for removing punctuation etc.
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

## First function that creates the tokens
def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

## Function that, incorporating the first function, converts all words to lower case and removes the punctuation map (previously specified)
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

## tfidf fitted on the bodies of all articles at once
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
tfidf_data = vectorizer.fit_transform([d['body'] for d in data])

# cosine similarities between every pair of articles
similarity_matrix = cosine_similarity(tfidf_data)
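If you still want the coefficient for each consecutive pair of articles, as in your original loop, it now sits on the first off-diagonal of that matrix:

consecutive_sims = [similarity_matrix[i, i + 1] for i in range(len(data) - 1)]
for i, sim in enumerate(consecutive_sims):
    print("cosine_sim%d: %.4f" % (i, sim))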
I am using a map-reduce implementation called mincemeat.py. It contains a map function and a reduce function. First off, I will explain what I am trying to accomplish. I am doing a Coursera course on big data that has a programming assignment. The task is that there are hundreds of files containing data of the form paperid:::author1::author2::author3:::papertitle.
We have to go through all the files and report, for a particular author, the word he has used the most. So I wrote the following code for it.
import re
import glob
import mincemeat
from collections import Counter
text_files = glob.glob('test/*')
def file_contents(file_name):
    f = open(file_name)
    try:
        return f.read()
    finally:
        f.close()

datasource = dict((file_name, file_contents(file_name)) for file_name in text_files)

def mapfn(key, value):
    for line in value.splitlines():
        wordsinsentence = line.split(":::")
        authors = wordsinsentence[1].split("::")
        # print authors
        words = str(wordsinsentence[2])
        words = re.sub(r'([^\s\w-])+', '', words)
        # re.sub(r'[^a-zA-Z0-9: ]', '', words)
        words = words.split(" ")
        for author in authors:
            for word in words:
                word = word.replace("-", " ")
                word = word.lower()
                yield author, word

def reducefn(key, value):
    return Counter(value)
s = mincemeat.Server()
s.datasource = datasource
s.mapfn = mapfn
s.reducefn = reducefn
results = s.run_server(password="changeme")
# print results
i = open('outfile','w')
i.write(str(results))
i.close()
My problem now is that the reduce function has to receive the author name and all the words he has used in his titles, for all authors. So I expected output like:
{authorname: Counter({'word1': countofword1, 'word2': countofword2, 'word3': countofword3, ...})}
But what I get is
authorname: (authorname, Counter({'word1': countofword1,'word2':countofword2}))
Can someone tell me why it is happening like that? I don't need help solving the assignment; I need to know why it is happening like that!
I ran your code and I see it is working as expected. The output looks like {authorname: Counter({'word1': countofword1, 'word2': countofword2, 'word3': countofword3, ...})}.
That said, remove the code from here, as it violates the Coursera Code of Honor.
Check your value data structure in reducefn before passing it to Counter:
def reducefn(key, value):
    print(value)
    return Counter(value)