I have used this code for category detection:
import numpy as np
# Words -> category
categories = {word: key for key, words in data.items() for word in words}
# Load the whole embedding matrix
embeddings_index = {}
with open('glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        embed = np.array(values[1:], dtype=np.float32)
        embeddings_index[word] = embed
print('Loaded %s word vectors.' % len(embeddings_index))
# Embeddings for available words
data_embeddings = {key: value for key, value in embeddings_index.items() if key in categories.keys()}
# Processing the query
def process(query):
    query_embed = embeddings_index[query]
    scores = {}
    for word, embed in data_embeddings.items():
        category = categories[word]
        dist = query_embed.dot(embed)
        dist /= len(data[category])
        scores[category] = scores.get(category, 0) + dist
    return scores
# Testing
print(process('pizza'))
Output:
{'service': 6.385544379552205, 'ambiance': 3.5752111077308655, 'Food': 12.912149047851562}
Is there a way to get only the highest-scoring category, like Food?
def process(query):
    query_embed = embeddings_index[query]
    scores = {}
    for word, embed in data_embeddings.items():
        category = categories[word]
        dist = query_embed.dot(embed)
        dist /= len(data[category])
        scores[category] = scores.get(category, 0) + dist
    return max(scores, key=scores.get)
You can use max() for this. It returns the key with the maximum value.
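If you also want to see the winning score alongside the category name, a small variation is possible (a sketch that works on the scores dict built inside process(), before taking the max):
# A minimal sketch: assumes `scores` is the category -> score dict built above.
best_category, best_score = max(scores.items(), key=lambda item: item[1])
print(best_category, best_score)  # e.g. Food 12.91...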
I am trying to adapt TF-IDF to my data (using the code by Dr. W.J.B. Mattingly: https://github.com/wjbmattingly/topic_modeling_textbook/blob/main/lessons/02_tf_idf_official.py), which consists of descriptions of startups from the Startup Blink website.
I cannot figure out how to properly extract the individual words: right now the output is a single string with all the words run together, like below (you will also notice lots of empty lists inside):
[['qualitygeotechnicalinvestigationtestinggeotechnicalreportspreconditiondevelopmentideasnewprojectimplementationintensivefieldlaboratorytestingsnecessaryobtaininputdatasoillayerscapacitysettlementcategorizationqualitymaterials']
s = requests.Session()
df = pd.DataFrame()
for p in tqdm(range(2000)):
    r = s.get(f'https://www.startupblink.com/api/entities?entity=startups&page={p}')
    d = pd.json_normalize(r.json()['page'])
    df = pd.concat([df, d], axis=0, ignore_index=True)
df.to_csv('World_startups.csv')
# selecting only ESG related startups
esg = df[df['subindustry_name'].isin(['Energy', 'Energy & Environment-Other', 'Smart Cities', 'Smart Home', 'Public Transportation', 'Sustainability',
                                      'Transportation-Other', 'Waste Management'])]
esg = esg[['title', 'description', 'subindustry_name']]
description = esg.description.tolist()
#description = description.remove(np.nan)
def remove_stopwords(text, stops):
    words = text.split()
    final = []
    for word in words:
        if word not in stops:
            final.append(word)
    final = "".join(final)
    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join([i for i in final if not i.isdigit()])
    while "  " in final:
        final = final.replace("  ", " ")
    return final
def clean_docs(docs):
    stops = stopwords.words('english')
    final = []
    for doc in docs:
        clean_doc = remove_stopwords(doc, stops)
        final.append(clean_doc)
    return final

cleaned_docs = clean_docs(description)
vectorizer = TfidfVectorizer(lowercase=True,
                             max_features=100,
                             # max_df=.9,  # percentage
                             # min_df=2,   # number of
                             ngram_range=(1, 3),
                             stop_words='english')  # up to trigrams
vectors = vectorizer.fit_transform(cleaned_docs)
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
# Printing all unique dense values to mid-check
densearray = numpy.array(denselist)
print(numpy.unique(densearray))
all_keywords = []
for d in denselist:
    x = 0
    keywords = []
    for word in d:
        if word > 0:
            keywords.append(feature_names[x])
        x = x + 1
    all_keywords.append(keywords)
all_keywords[7]
print(len(all_keywords))
# the list contains lots of empty lists inside - will remove them
all_keywords = [ele for ele in all_keywords if ele != []]
print('')
print(len(all_keywords))
print(all_keywords[7])
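The words most likely get glued together because remove_stopwords joins the kept tokens with an empty string (final = "".join(final)). A minimal sketch of a fix, keeping the rest of the pipeline as posted, is to join with a space instead:
def remove_stopwords(text, stops):
    words = text.split()
    final = [word for word in words if word not in stops]
    final = " ".join(final)  # join with spaces so the tokens stay separated
    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join([i for i in final if not i.isdigit()])
    while "  " in final:
        final = final.replace("  ", " ")
    return final
With space-separated text, TfidfVectorizer can tokenize the documents normally; any remaining empty keyword lists are then typically documents whose words all fall outside the max_features=100 vocabulary.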
For a given corpus of tokenized texts, I want to perform word weighing with several weighing techniques. To do so, I created the following class:
class Weighing:
    def __init__(self, input_file, word_weighing):
        self.input_file_ = input_file #List in which each element is a list of tokens
        self.word_weighing_ = word_weighing
        self.num_documents = len(self.input_file_)

        #Set with all unique words from the corpus
        self.vocabulary = set()
        for text in self.input_file_:
            self.vocabulary.update(text)
        self.vocabulary_size = len(self.vocabulary)

        #Create dictionary that returns index for a token or token for an index of the corpus' vocabulary
        self.word_to_index = dict()
        self.index_to_word = dict()
        for i, word in enumerate(self.vocabulary):
            self.word_to_index[word] = i
            self.index_to_word[i] = word

        #Create sparse Document-Term Matrix (DTM)
        self.sparse_dtm = dok_matrix((self.num_documents, self.vocabulary_size), dtype=np.float32)
        for doc_index, document in enumerate(self.input_file_):
            document_counter = Counter(document)
            for word in set(document):
                self.sparse_dtm[doc_index, self.word_to_index[word]] = document_counter[word] # Update element

        #Get word count for all documents to calculate sparse_p_ij
        self.sum_words = Counter()
        for doc in self.input_file_:
            self.sum_words.update(Counter(doc))

        #Create probability of word i in document j. Format: sparse matrix
        def create_sparse_p_ij(self):
            sparse_p_ij = dok_matrix((self.num_documents, self.vocabulary_size), dtype=np.float32)
            for j in range(self.num_documents):
                row_counts = self.sparse_dtm.getrow(j).toarray()[0]
                word_index = row_counts.nonzero()[0]
                non_zero_row_counts = row_counts[row_counts != 0]
                for i, count in enumerate(non_zero_row_counts):
                    word = self.index_to_word[word_index[i]]
                    prob_ij = count/self.sum_words[word]
                    sparse_p_ij[j,word_index[i]] = prob_ij
            return sparse_p_ij

        #Create a binary sparse dtm. Format: sparse matrix
        def create_sparse_binary_dtm(self):
            binary_sparse_dtm = dok_matrix((self.num_documents, self.vocabulary_size), dtype=np.float32)
            for doc_index, document in enumerate(self.input_file_):
                document_counter = dict.fromkeys(document, 1)
                for word in set(document):
                    binary_sparse_dtm[doc_index, self.word_to_index[word]] = document_counter[word] # Update element
            return binary_sparse_dtm

        #2) Calculate Global Term weighting (4 methods: entropy, IDF, Probabilistic IDF, Normal)
        def calc_entropy(self):
            sparse_p_ij = self.create_sparse_p_ij()
            summed_word_probabilities = sparse_p_ij.sum(0).tolist()[0]
            return np.array([1+((word_probability * np.log2(word_probability))/np.log2(self.num_documents)) for word_probability in summed_word_probabilities])

        def calc_idf(self):
            summed_words = self.sparse_dtm.sum(0).tolist()[0]
            return np.array([np.log2(self.num_documents/word_count) for word_count in summed_words])

        def calc_normal(self):
            summed_words = self.sparse_dtm.sum(0).tolist()[0]
            return np.array([1/(math.sqrt(word_count**2)) for word_count in summed_words])

        def calc_probidf(self):
            binary_sparse_dtm = self.create_sparse_binary_dtm()
            summed_binary_words_list = binary_sparse_dtm.sum(0).tolist()[0]
            return np.array([np.log2((self.num_documents - binary_word_count)/binary_word_count) for binary_word_count in summed_binary_words_list])

        if self.word_weighing_ == 1:
            gtw = self.calc_entropy()
        elif self.word_weighing_ == 2:
            gtw = self.calc_idf()
        elif self.word_weighing_ == 3:
            gtw = self.calc_normal()
        elif self.word_weighing_ == 4:
            gtw = self.calc_probidf()
Now, when I run:
model = Weighing(input_file = data_list,
                 word_weighing = 1)
where data_list is a list of lists of tokenized words, I get the following error:
Traceback (most recent call last):
  File "<ipython-input-621-b0a9caec82d4>", line 4, in <module>
    word_weighing = 1)
  File "<ipython-input-617-6f3fdcecd170>", line 90, in __init__
    gtw = self.calc_entropy()
AttributeError: 'Weighing' object has no attribute 'calc_entropy'
I looked at a few other similar SO links[1,2,3,4], but none of these seem applicable here.
What can I do to overcome this error?
EDIT:
I've updated the code to:
class Weighing:
    def __init__(self, input_file, word_weighing):
        self.input_file_ = input_file #List in which each element is a list of tokens
        self.word_weighing_ = word_weighing
        self.num_documents = len(self.input_file_)

        #Set with all unique words from the corpus
        self.vocabulary = set()
        for text in self.input_file_:
            self.vocabulary.update(text)
        self.vocabulary_size = len(self.vocabulary)

        #Create dictionary that returns index for a token or token for an index of the corpus' vocabulary
        self.word_to_index = dict()
        self.index_to_word = dict()
        for i, word in enumerate(self.vocabulary):
            self.word_to_index[word] = i
            self.index_to_word[i] = word

        #Create sparse Document-Term Matrix (DTM)
        self.sparse_dtm = dok_matrix((self.num_documents, self.vocabulary_size), dtype=np.float32)
        for doc_index, document in enumerate(self.input_file_):
            document_counter = Counter(document)
            for word in set(document):
                self.sparse_dtm[doc_index, self.word_to_index[word]] = document_counter[word] # Update element

        if self.word_weighing_ == 1:
            self.gtw = self.calc_entropy()
        elif self.word_weighing_ == 2:
            self.gtw = self.calc_idf()
        elif self.word_weighing_ == 3:
            self.gtw = self.calc_normal()
        elif self.word_weighing_ == 4:
            self.gtw = self.calc_probidf()

        #Get word count for all documents to calculate sparse_p_ij
        self.sum_words = Counter()
        for doc in self.input_file_:
            self.sum_words.update(Counter(doc))

        #Create probability of word i in document j. Format: sparse matrix
        def create_sparse_p_ij(self):
            sparse_p_ij = dok_matrix((self.num_documents, self.vocabulary_size), dtype=np.float32)
            for j in range(self.num_documents):
                row_counts = self.sparse_dtm.getrow(j).toarray()[0]
                word_index = row_counts.nonzero()[0]
                non_zero_row_counts = row_counts[row_counts != 0]
                for i, count in enumerate(non_zero_row_counts):
                    word = self.index_to_word[word_index[i]]
                    prob_ij = count/self.sum_words[word]
                    sparse_p_ij[j,word_index[i]] = prob_ij
            return sparse_p_ij

        #Create a binary sparse dtm. Format: sparse matrix
        def create_sparse_binary_dtm(self):
            binary_sparse_dtm = dok_matrix((self.num_documents, self.vocabulary_size), dtype=np.float32)
            for doc_index, document in enumerate(self.input_file_):
                document_counter = dict.fromkeys(document, 1)
                for word in set(document):
                    binary_sparse_dtm[doc_index, self.word_to_index[word]] = document_counter[word] # Update element
            return binary_sparse_dtm

        #2) Calculate Global Term weighting (4 methods: entropy, IDF, Probabilistic IDF, Normal)
        def calc_entropy(self):
            sparse_p_ij = self.create_sparse_p_ij()
            summed_word_probabilities = sparse_p_ij.sum(0).tolist()[0]
            return np.array([1+((word_probability * np.log2(word_probability))/np.log2(self.num_documents)) for word_probability in summed_word_probabilities])

        def calc_idf(self):
            summed_words = self.sparse_dtm.sum(0).tolist()[0]
            return np.array([np.log2(self.num_documents/word_count) for word_count in summed_words])

        def calc_normal(self):
            summed_words = self.sparse_dtm.sum(0).tolist()[0]
            return np.array([1/(math.sqrt(word_count**2)) for word_count in summed_words])

        def calc_probidf(self):
            binary_sparse_dtm = self.create_sparse_binary_dtm()
            summed_binary_words_list = binary_sparse_dtm.sum(0).tolist()[0]
            return np.array([np.log2((self.num_documents - binary_word_count)/binary_word_count) for binary_word_count in summed_binary_words_list])
However, I still get the error:
AttributeError: 'Weighing' object has no attribute 'calc_entropy'
It seems I am calling a function before it has been defined. How can I change my code so that calc_entropy is defined before I set self.gtw?
It seems to be an indentation problem: you define your methods, such as calc_entropy(), inside your __init__() function rather than at class level.
It should be:
class Weighing:
    def __init__(self):
        # your init

    def calc_entropy(self):
        # your method
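Applied to the posted class, the fix is only a matter of indentation: keep the method definitions at class level and run the dispatch at the end of __init__, after self.sum_words has been built (calc_entropy() needs it via create_sparse_p_ij()). A rough sketch, with the method bodies left exactly as posted:
class Weighing:
    def __init__(self, input_file, word_weighing):
        self.input_file_ = input_file
        self.word_weighing_ = word_weighing
        # ... build vocabulary, word_to_index, sparse_dtm and sum_words as before ...

        # Dispatch last: the methods below are class-level attributes now,
        # so self.calc_entropy() resolves correctly during __init__.
        if self.word_weighing_ == 1:
            self.gtw = self.calc_entropy()
        elif self.word_weighing_ == 2:
            self.gtw = self.calc_idf()
        elif self.word_weighing_ == 3:
            self.gtw = self.calc_normal()
        elif self.word_weighing_ == 4:
            self.gtw = self.calc_probidf()

    def create_sparse_p_ij(self):
        ...  # body unchanged

    def create_sparse_binary_dtm(self):
        ...  # body unchanged

    def calc_entropy(self):
        ...  # body unchanged

    def calc_idf(self):
        ...  # body unchanged

    def calc_normal(self):
        ...  # body unchanged

    def calc_probidf(self):
        ...  # body unchanged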
I am working on an information retrieval project where I have to process ~1.5 GB of text data and create a dictionary (word, document frequency) and posting list (document id, term frequency). According to the professor, it should take around 10-15 minutes, but my code has been running for more than 8 hours now! I tried a smaller dataset (~35 MB) and it took 5 hours to process.
I am a newbie in Python, and I think it is taking so long because I have created many dictionaries and lists in my code. I tried to use generators, but I am not sure how to work with them.
file = open(filename, 'rt')
text = file.read()
file.close()
p = r'<P ID=\d+>.*?</P>'
tag = RegexpTokenizer(p)
passage = tag.tokenize(text)
doc_re = re.compile(r"<P ID=(\d+)>")
def process_data(docu):
    tokens = RegexpTokenizer(r'\w+')
    lower_tokens = [word.lower() for word in tokens.tokenize(docu)]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in lower_tokens]
    alpha = [word for word in stripped if word.isalpha()]
    stopwordlist = stopwords.words('english')
    stopped = [w for w in alpha if not w in stopwordlist]
    return stopped

data = {}
for doc in passage:
    group_docID = doc_re.match(doc)
    docID = group_docID.group(1)
    tokens = process_data(doc)
    data[docID] = list(set(tokens))
vocab = [item for i in data.values() for item in i]
total_vocab = list(set(vocab))
total_vocab.sort()
print('Document Size = ', len(data))
print('Collection Size = ', len(vocab))
print('Vocabulary Size= ', len(total_vocab))
inv_index = {}
for x in total_vocab:
    for y, z in data.items():
        if x in z:
            wordfreq = z.count(x)
            inv_index.setdefault(x, []).append((int(y), wordfreq))
flattend = [item for tag in inv_index.values() for item in tag]
posting = [item for tag in flattend for item in tag ]
doc_freq=[]
for k, v in inv_index.items():
    freq1 = len([item for item in v if item])
    doc_freq.append(freq1)

# offset value of each vocabulary/words
offset = []
offset1 = 0
for i in range(len(doc_freq)):
    if i > 0:
        offset1 = offset1 + (doc_freq[i-1]*2)
    offset.append(offset1)
# create dictionary of words, document frequency and offset
dictionary = {}
for i in range(len(total_vocab)):
    dictionary[total_vocab[i]] = (doc_freq[i], offset[i])
#dictionary of word, inverse document frequency
idf = {}
for i in range(len(dictionary)):
    a = np.log2(len(data)/doc_freq[i])
    idf[total_vocab[i]] = a

with open('dictionary.json', 'w') as f:
    json.dump(dictionary, f)
with open('idf.json', 'w') as f:
    json.dump(idf, f)
binary_file = open('binary_file.txt', 'wb')
for i in range(0, len(posting)):
    binary_int = (posting[i]).to_bytes(4, byteorder='big')
    binary_file.write(binary_int)
binary_file.close()
Could someone please help me rewrite this code so that it is more computationally and time efficient?
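Not a full rewrite, but two things stand out: stopwords.words('english') is rebuilt for every document and checked with a list membership test, and the for x in total_vocab: for y, z in data.items() nest rescans every document once per vocabulary word. A sketch of a single-pass build (assuming process_data is changed to take a precomputed stop-word set, and reusing passage and doc_re from above):
from collections import Counter, defaultdict

stop_set = set(stopwords.words('english'))  # build once; set lookups are O(1)

def process_data(docu, stop_set):
    tokens = RegexpTokenizer(r'\w+').tokenize(docu)
    table = str.maketrans('', '', string.punctuation)
    words = [w.lower().translate(table) for w in tokens]
    return [w for w in words if w.isalpha() and w not in stop_set]

inv_index = defaultdict(list)  # word -> [(doc_id, term_frequency), ...]
num_docs = 0
for doc in passage:
    doc_id = int(doc_re.match(doc).group(1))
    counts = Counter(process_data(doc, stop_set))  # term frequencies for this document
    for word, freq in counts.items():
        inv_index[word].append((doc_id, freq))
    num_docs += 1

total_vocab = sorted(inv_index)
doc_freq = [len(inv_index[w]) for w in total_vocab]
idf = {w: np.log2(num_docs / df) for w, df in zip(total_vocab, doc_freq)}
Each document is tokenized and counted exactly once, so the index build becomes roughly linear in the size of the corpus instead of proportional to vocabulary size times number of documents.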
I want to create a very basic Q&A chatbot. Given a list of questions and answers that I use as my dataset, I want to train it to return relevant answers, depending on a hard-coded question (different every time). First I tokenize and clean up the text, then use cosine similarity, but it gives me an error, which is (I guess) a pickle issue.
UPDATED
import csv
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
import pickle
import os.path
import re, math
tokenizer = RegexpTokenizer(r'\w+')
stopwords = stopwords.words('english')
extra_stopwords = stopwords + ['I', 'can']
WORD = re.compile(r'\w+')
def get_clean_data():
    clean_data_set = {
        'questions' : {},
        'answers' : {}
    }

    reader = csv.reader(open('data.csv', 'r', encoding="utf-8"))
    tags = []
    counter = 0
    for r in reader:
        question = str(r[0].encode('utf-8'))
        answer = str(r[1].encode('utf-8'))
        _, tags_question = get_tags(question)
        _, tags_answer = get_tags(answer)
        clean_data_set['answers'][answer] = tags_answer + tags_question
        clean_data_set['questions'][question] = text_to_vector(question)
        counter += 1
        # hardcode the number :)
        print (counter, ' out of 746')

    # pickle.dump(clean_data_set, open('dump.dict', 'wb'))
    with open('dump.dict', 'wb') as my_dump_file:
        pickle.dump(clean_data_set, my_dump_file)
def get_tags(text, use_set = True):
    tokens = tokenizer.tokenize(text)
    # remove stop words from tokens
    # make it lower case
    filtered_words = [word.lower() for word in tokens if word not in extra_stopwords]
    # return non duplicate values by default
    if use_set == True:
        filterd_words = list(set(filtered_words))
    return Counter(filtered_words), filtered_words
# simple cosine similarity measure
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

# question_set is the data we had
def get_cosine_value(question, question_set):
    question_vector = text_to_vector(question)
    cosine = get_cosine(question_vector, question_set)
    return cosine
def answer_question(question, top = 5):
    with open('dump.dict', 'rb') as my_dump_file:
        data_set = pickle.load(my_dump_file)
        # data_set = pickle.load(open('dump.dict', 'rb'))
        pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

    _, question_tags = get_tags(question)
    ranking_dict = {}
    similar_questions_rank = {}

    for entry in data_set['answers']:
        tags = data_set['answers'][entry]
        # rank is the intersection between the list of tags from the question
        # and the list of tags associated to answers
        rank = len(set(question_tags).intersection(tags))
        ranking_dict[entry] = rank

    for entry in data_set['questions']:
        cosine_similarity = get_cosine_value(question, data_set['questions'][entry])
        similar_questions_rank[entry] = cosine_similarity

    sorted_similarity_dict = sorted(similar_questions_rank.items(), key=lambda x: x[1], reverse=True)
    sorted_ranking_dict = sorted(ranking_dict.items(), key=lambda x: x[1], reverse=True)

    # sort them by rank
    for item in sorted_ranking_dict[0:top-1]:
        print ('Rank: ', item[1])
        print ('Answer: ', item[0])
        print ('\n\n')

    # sort them by rank
    for item in sorted_similarity_dict[0:top-1]:
        print ('Rank: ', item[1])
        print ('Question: ', item[0])
#get_clean_data()
question = 'why all these errors?'
answer_question(question)
This is the updated error message:
Traceback (most recent call last):
  File "C:\Users\joasa\Desktop\si\main.py", line 133, in <module>
    answer_question(question)
  File "C:\Users\joasa\Desktop\si\main.py", line 94, in answer_question
    data_set = pickle.load(my_dump_file)
EOFError: Ran out of input
[Finished in 1.4s]
Can someone help please? I have no idea what to do. Thanks in advance
I think it comes from this line in your get_clean_data function:
pickle.dump(clean_data_set, open('dump.dict', 'w'))
Here you open the file for writing but never close it, so when you try to read it back, nothing signals that the end of the file has been reached. To avoid issues like this, use a context manager block:
with open('dump.dict', 'wb') as my_dump_file:
    pickle.dump(clean_data_set, my_dump_file)
That way, whichever way you exit the with block, you are guaranteed to close your file.
You should also do the same when loading your pickle dump in answer_question:
with open('dump.dict', 'rb') as my_dump_file:
    data_set = pickle.load(my_dump_file)
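One more thing worth checking (an assumption based on the posted script, not something the traceback proves): get_clean_data() is commented out at the bottom, so dump.dict may simply be empty or stale, which would also produce EOFError: Ran out of input. A small guard before querying could be:
import os

# Rebuild the pickle if it is missing or empty before trying to load it.
if not os.path.exists('dump.dict') or os.path.getsize('dump.dict') == 0:
    get_clean_data()

question = 'why all these errors?'
answer_question(question)
The stray pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL) line inside answer_question also refers to names that are never defined and can probably be removed.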
I'm a beginner in machine learning and I'm trying to implement my first Naive Bayes classifier by myself for better understanding. I have the dataset from http://archive.ics.uci.edu/ml/datasets/Adult (American census data; the classes are '<=50K' and '>50K').
Here is my python code:
#!/usr/bin/python
import sys
import csv
words_stats = {} # {'word': {'class1': cnt, 'class2': cnt'}}
words_cnt = 0
targets_stats = {} # {'class1': 3234, 'class2': 884} how many words in each class
class_stats = {} # {'class1': 7896, 'class2': 3034} how many lines in each class
items_cnt = 0
def train(dataset, targets):
    global words_stats, words_cnt, targets_stats, items_cnt, class_stats

    num = len(dataset)
    for item in xrange(num):
        class_stats[targets[item]] = class_stats.get(targets[item], 0) + 1
        for i in xrange(len(dataset[item])):
            word = dataset[item][i]
            if not words_stats.has_key(word):
                words_stats[word] = {}
            tgt = targets[item]
            cnt = words_stats[word].get(tgt, 0)
            words_stats[word][tgt] = cnt + 1
            targets_stats[tgt] = targets_stats.get(tgt, 0) + 1
            words_cnt += 1
    items_cnt = num
def classify(doc, tgt_set):
    global words_stats, words_cnt, targets_stats, items_cnt

    probs = {} #the probability itself P(c|W) = P(W|c) * P(c) / P(W)
    pc = {}    #probability of the class in document set P(c)
    pwc = {}   #probability of the word set in particular class. P(W|c)
    pw = 1     #probability of the word set in document set

    for word in doc:
        if word not in words_stats:
            continue #dirty, very dirty
        pw = pw * float(sum(words_stats[word].values())) / words_cnt

    for tgt in tgt_set:
        pc[tgt] = class_stats[tgt] / float(items_cnt)
        for word in doc:
            if word not in words_stats:
                continue #dirty, very dirty
            tgt_wrd_cnt = words_stats[word].get(tgt, 0)
            pwc[tgt] = pwc.get(tgt, 1) * float(tgt_wrd_cnt) / targets_stats[tgt]
        probs[tgt] = (pwc[tgt] * pc[tgt]) / pw

    l = sorted(probs.items(), key = lambda i: i[1], reverse=True)
    print probs
    return l[0][0]
def check_results(dataset, targets):
    num = len(dataset)
    tgt_set = set(targets)
    correct = 0
    incorrect = 0
    for item in xrange(num):
        res = classify(dataset[item], tgt_set)
        if res == targets[item]:
            correct = correct + 1
        else:
            incorrect = incorrect + 1
    print 'correct:', float(correct) / num, ' incorrect:', float(incorrect) / num
def load_data(fil):
    data = []
    tgts = []
    reader = csv.reader(fil)
    for line in reader:
        d = [x.strip() for x in line]
        if '?' in d:
            continue
        if not len(d):
            continue
        data.append(d[:-1])
        tgts.append(d[-1:][0])
    return data, tgts
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print './program train_data.txt test_data.txt'
        sys.exit(1)

    filename = sys.argv[1]
    fil = open(filename, 'r')
    data, tgt = load_data(fil)
    train(data, tgt)

    test_file = open(sys.argv[2], 'r')
    test_data, test_tgt = load_data(test_file)
    check_results(test_data, tgt)
It gives ~61% correct results. When I print the probabilities, I get the following:
{'<=50K': 0.07371606889800396, '>50K': 15.325378327213354}
But for a correct classifier I would expect the two probabilities to sum to 1.
At first I thought the problem was float underflow and tried to do all the calculations in logarithms, but the results were similar.
I understand that omitting some words will affect accuracy, but the probabilities are so far off.
What am I doing wrong, or what am I not understanding?
For your convenience, I've uploaded the dataset and Python script here:
https://dl.dropboxusercontent.com/u/36180992/adult.tar.gz
Thank you for your help.
Naive Bayes doesn't compute a probability directly; rather, it computes a "raw score" that is compared against the other labels' scores in order to classify an instance. This score can easily be converted to a "probability" in the range [0, 1]:
total = sum(probs.itervalues())
for label, score in probs.iteritems():
    probs[label] = score / total
However, keep in mind this still doesn't represent a true probability, as mentioned in this answer:
naive Bayes tends to predict probabilities that are almost always either very close to zero or very close to one.
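If you do move the computation into log space (as the question mentions trying), the same normalization idea still works: shift by the maximum log score before exponentiating to avoid underflow, then divide by the total. A small sketch with hypothetical log scores (the names and numbers are illustrative, not from the original code):
import math

# Hypothetical log-space scores for each label.
log_probs = {'<=50K': -310.2, '>50K': -305.7}

# Subtract the max log score before exponentiating to avoid underflow,
# then normalize so the values sum to 1.
max_log = max(log_probs.values())
exp_scores = {label: math.exp(score - max_log) for label, score in log_probs.items()}
total = sum(exp_scores.values())
posteriors = {label: score / total for label, score in exp_scores.items()}
print(posteriors)  # the two values now sum to 1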