I'm trying to use BabelNet instead of WordNet because I have special compound words from the computer science domain. My code is meant to find similarities for a list of topics (lst).
code:
from babelnetpy.babelnet import BabelNet

bn = BabelNet("")

lst = ['artificial_intelligence', 'real_time', 'Big_data', 'Healthcare', 'Fuzzy_logic']

def get_bn_main_sense(synsetid):
    """
    get main sense
    """
    return bn.getSynsets(synsetid)[0].mainSense

get_bn_main_sense('bn:03890554n')

def get_synset(a_list):
    synset_list = []
    for word in a_list:
        a = bn.getSynset_Ids(word, "en")[:1]  # the slice ensures each word gets its first synset only
        synset_list.append(a)
    return synset_list

lst_synsets = get_synset(lst)

def bn_syns(given_list):
    synset_bn = []
    for word in given_list:
        synset_bn.append("bn.[%s]" % word)
    return synset_bn

lst_s_bn = bn_syns(lst_synsets)

def lower(list):
    new_list = []
    for word in list:
        new_list.append(word.lower())
    return new_list

lower_is = lower(lst_s_bn)

def clean(a_list):
    new_list = []
    for word in a_list:
        b = word.replace("[", "").replace("]", "").replace("bn.synset", "").replace("(", "").replace(")", "").replace("'", "")
        new_list.append(b)
    return new_list

clean_is = clean(lower_is)

# id of the synset you want to retrieve
artificial_intelligence_Ids = bn.getSynset_Ids("artificial_intelligence", "en")
artificial_intelligence = artificial_intelligence_Ids[0]
real_time_Ids = bn.getSynset_Ids("real_time", "en")
real_time = real_time_Ids[0]
Big_data_Ids = bn.getSynset_Ids("Big_data", "en")
Big_data = Big_data_Ids[0]
Healthcare_Ids = bn.getSynset_Ids("Healthcare", "en")
Healthcare = Healthcare_Ids[0]
Fuzzy_logic_Ids = bn.getSynset_Ids("Fuzzy_logic", "en")
Fuzzy_logic = Fuzzy_logic_Ids[0]

# is_variables
artificial_Intelligence_syn = get_bn_main_sense('bn:03890554n')
real_time_syn = get_bn_main_sense('bn:01258457n')
Big_data_syn = get_bn_main_sense('bn:02063206n')
Healthcare_syn = get_bn_main_sense('bn:00043361n')
Fuzzy_logic_syn = get_bn_main_sense('bn:15130487n')
is_variables = [artificial_Intelligence_syn, real_time_syn, Big_data_syn, Healthcare_syn, Fuzzy_logic_syn]

# wup similarity
def similarity(list1, list2):
    sim_dict = {}
    for syn in list1:
        for sin in list2:
            sim = syn.wup_similarity(sin)
            if sim >= 0.5:
                sim_dict.update({(syn, sin): sim})
    return sim_dict

b_s = similarity(is_variables, is_variables)
I get the error "AttributeError: 'str' object has no attribute 'wup_similarity'" when I run this code to find semantic similarities for this list (lst). Any suggestions or hints are highly appreciated.
The values returned by get_bn_main_sense are plain strings, and str has no method named wup_similarity. mainSense is of type str:
>>> type(bn.getSynsets('bn:03890554n')[0].mainSense)
<class 'str'>
I couldn't find any object or class in the babelnetpy library that provides this method or algorithm. The NLTK library does have this method, but it works on WordNet synsets. Perhaps you have confused the two libraries.
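For what it's worth, here is a minimal sketch of how wup_similarity is used on NLTK's WordNet Synset objects (the terms are WordNet entries chosen only for illustration; this is not a BabelNet API):

from nltk.corpus import wordnet as wn

# wup_similarity is a method of WordNet Synset objects, not of plain strings
ai = wn.synsets('artificial_intelligence')[0]
logic = wn.synsets('logic')[0]
print(ai.wup_similarity(logic))  # a float in (0, 1], or None if the synsets share no path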
Related
For study purposes, I've tried to implement this lesson in Python, but without scikit-learn or anything similar.
My attempt is the following code:
import pandas, math

training_data = [
    ['A great game','Sports'],
    ['The election was over','Not sports'],
    ['Very clean match','Sports'],
    ['A clean but forgettable game','Sports'],
    ['It was a close election','Not sports']
]
text_to_predict = 'A very close game'

data_frame = pandas.DataFrame(training_data, columns=['data','label'])
data_frame = data_frame.applymap(lambda s: s.lower() if type(s) == str else s)
text_to_predict = text_to_predict.lower()

labels = data_frame.label.unique()

word_frequency = data_frame.data.str.split(expand=True).stack().value_counts()

unique_words_set = set()
unique_words = data_frame.data.str.split().apply(unique_words_set.update)
total_unique_words = len(unique_words_set)

word_frequency_per_labels = []
for l in labels:
    word_frequency_per_label = data_frame[data_frame.label == l].data.str.split(expand=True).stack().value_counts()
    for w, f in word_frequency_per_label.iteritems():
        word_frequency_per_labels.append([w, f, l])

word_frequency_per_labels_df = pandas.DataFrame(word_frequency_per_labels, columns=['word','frequency','label'])

laplace_smoothing = 1
results = []
for l in labels:
    p = []
    total_words_in_label = word_frequency_per_labels_df[word_frequency_per_labels_df.label == l].frequency.sum()
    for w in text_to_predict.split():
        x = (word_frequency_per_labels_df.query('word == @w and label == @l').frequency.to_list()[:1] or [0])[0]
        p.append((x + laplace_smoothing) / (total_words_in_label + total_unique_words))
    results.append([l, math.prod(p)])

print(results)

result = pandas.DataFrame(results, columns=['labels','posterior']).sort_values('posterior', ascending=False).labels.iloc[0]
print(result)
In the blog lesson their results are:
But my results were:
[['sports', 4.607999999999999e-05], ['not sports', 1.4293831139825827e-05]]
So, what did I do wrong in my Python implementation? How can I get the same results?
Thanks in advance.
You haven't multiplied by the priors p(Sport) = 3/5 and p(Not Sport) = 2/5. So just updating your answers by these ratios will get you to the correct result. Everything else looks good.
For example, your math.prod(p) calculation implements p(a|Sports) x p(very|Sports) x p(close|Sports) x p(game|Sports), but this ignores the term p(Sports). Adding it in (and doing the same for the "not sports" condition) fixes things.
In code this can be achieved by:
prior = (data_frame.label == l).mean()
results.append([l,prior*math.prod(p)])
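For reference, this is roughly how those two lines slot into the loop from the question (a sketch of the corrected loop, not a full rewrite):

for l in labels:
    p = []
    total_words_in_label = word_frequency_per_labels_df[word_frequency_per_labels_df.label == l].frequency.sum()
    for w in text_to_predict.split():
        x = (word_frequency_per_labels_df.query('word == @w and label == @l').frequency.to_list()[:1] or [0])[0]
        p.append((x + laplace_smoothing) / (total_words_in_label + total_unique_words))
    prior = (data_frame.label == l).mean()  # P(label): 3/5 for 'sports', 2/5 for 'not sports'
    results.append([l, prior * math.prod(p)])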
The answer by nick is correct and should be awarded the bounty.
Here is an alternative implementation (from scratch, not using pandas) that also supports normalization of the probabilities and handles words not in the training set:
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict, Set

def tokenize(text: str):
    return [word.lower() for word in text.split()]

def normalize(result: Dict[str, float]):
    total = sum([v for v in result.values()])
    for k in result.keys():
        result[k] /= total

@dataclass
class Model:
    labels: Set[str] = field(default_factory=set)
    words: Set[str] = field(default_factory=set)
    prob_labels: Dict[str, float] = field(default_factory=lambda: defaultdict(float))  # P(label)
    prob_words: Dict[str, Dict[str, float]] = field(default_factory=lambda: defaultdict(lambda: defaultdict(float)))  # P(word | label) as prob_words[label][word]

    def predict(self, text: str, norm=True) -> Dict[str, float]:  # P(label | text) as model.predict(text)[label]
        result = {label: self.prob_labels[label] for label in self.labels}
        for word in tokenize(text):
            for label in self.labels:
                if word in self.words:
                    result[label] *= self.prob_words[label][word]
        if norm:
            normalize(result)
        return result

    def train(self, data):
        prob_words_denominator = defaultdict(int)
        for row in data:
            text = row[0]
            label = row[1].lower()
            self.labels.add(label)
            self.prob_labels[label] += 1.0
            for word in tokenize(text):
                self.words.add(word)
                self.prob_words[label][word] += 1.0
                prob_words_denominator[label] += 1.0
        for label in self.labels:
            self.prob_labels[label] /= len(data)
            for word in self.words:
                self.prob_words[label][word] = (self.prob_words[label][word] + 1.0) / (prob_words_denominator[label] + len(self.words))

training_data = [
    ['A great game','Sports'],
    ['The election was over','Not sports'],
    ['Very clean match','Sports'],
    ['A clean but forgettable game','Sports'],
    ['It was a close election','Not sports']
]
text_to_predict = 'A very close game'

model = Model()
model.train(training_data)
print(model.predict(text_to_predict, norm=False))
print(model.predict(text_to_predict))
print(model.predict("none of these words is in training data"))
output:
{'sports': 2.7647999999999997e-05, 'not sports': 5.7175324559303314e-06}
{'sports': 0.8286395560004286, 'not sports': 0.1713604439995714}
{'sports': 0.6, 'not sports': 0.4}
I have created a program which reads lines from a file into a list. When I run the code below, I see that there is a list with elements inside it.
dogs_list_player = []
dogs_list_computer = []
with open("dogs.txt") as f:
    for i in range(Y):
        dogs_list_player.append(f.readline().splitlines())
        print(dogs_list_player)
    for i in range(Z):
        dogs_list_computer.append(f.readline().splitlines())
        print(dogs_list_computer)
The result is:
[['Tim']]
[['Tim'], ['Bob']]
[['Tim'], ['Bob'], ['Jess']]
[['Tim'], ['Bob'], ['Jess'], ['Bess']]
[['Tim'], ['Bob'], ['Jess'], ['Bess'], ['Tess']]
[['Dom']]
[['Dom'], ['Tom']]
[['Dom'], ['Tom'], ['Will']]
[['Dom'], ['Tom'], ['Will'], ['Ben']]
[['Dom'], ['Tom'], ['Will'], ['Ben'], ['Joe']]
But the issue arises when I add this part of the code:
dogs_list_player = []
dogs_list_computer = []
with open("dogs.txt") as f:
    for i in range(Y):
        dogs_list_player.append(f.readline().splitlines())
        print(dogs_list_player)
    for i in range(Z):
        dogs_list_computer.append(f.readline().splitlines())
        print(dogs_list_computer)

class Dog_card:
    name = ""
    friendliness = ""
    intelligence = ""
    exercise = ""
    drool = ""

    def printing_card(self):
        prnt_str = "Name:%s \nIntelligence:%s \nExercise:%s \nDrool:%s" % (self.name, self.friendliness, self.intelligence, self.exercise, self.drool)
        return prnt_str

player_card = Dog_card()
card_count = 0
player.name = dogs_list_player[0]  # I think this is where the issue is happening
The result of this code is:
IndexError: list index out of range
Any help would be appreciated.
Your Dog_card class has some errors: you forgot the "friendliness" placeholder in the format string inside the printing_card method, and you also don't have a constructor in your class.
class Dog_card:
    def __init__(self):
        self.name = ""
        self.friendliness = ""
        self.intelligence = ""
        self.exercise = ""
        self.drool = ""

    def printing_card(self):
        prnt_str = "Name:%s \nFriendliness:%s \nIntelligence:%s \nExercise:%s \nDrool:%s" % (self.name, self.friendliness, self.intelligence, self.exercise, self.drool)
        return prnt_str
This should work.
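A quick usage check of the corrected class (a minimal sketch with made-up values, not the questioner's full program):

player_card = Dog_card()
player_card.name = "Tim"
player_card.friendliness = "8"
print(player_card.printing_card())
# prints Name:Tim, Friendliness:8, and empty Intelligence/Exercise/Drool fields, one per line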
I'm not entirely sure why I'm getting a dictionary KeyError. I'm trying to build a multi-level dict with the = sign and I get a KeyError on 'metrics', but not on the first two keys.
doc['timestamp']
and
doc['instance_id']
both work fine, but when it gets to metrics it gives me a KeyError on 'metrics'. I'm not entirely sure why.
doc = {}
doc['timestamp'] = datetime.now()
#doc['instance_id'] = get_cloud_app_name()
doc['instance_id'] = "MyMac"
cpu_dict_returned = get_cpu_info()
doc['metrics']['cpu_usage']['user_cpu'] = cpu_dict_returned['user_cpu']
doc['metrics']["cpu_usage"]['system_cpu'] = cpu_dict_returned['system_cpu']
doc['metrics']["cpu_usage"]['idle_cpu'] = cpu_dict_returned['idle_cpu']
doc['metrics']["cpu_usage"]['cpu_count'] = cpu_dict_returned['cpu_count']
You must create the sub-dictionaries before using them:
doc = {}
doc['timestamp'] = datetime.now()
doc['instance_id'] = "MyMac"
cpu_dict_returned = get_cpu_info()
doc['metrics'] = {}
doc['metrics']['cpu_usage'] = {}
doc['metrics']['cpu_usage']['user_cpu'] = cpu_dict_returned['user_cpu']
doc['metrics']["cpu_usage"]['system_cpu'] = cpu_dict_returned['system_cpu']
doc['metrics']["cpu_usage"]['idle_cpu'] = cpu_dict_returned['idle_cpu']
doc['metrics']["cpu_usage"]['cpu_count'] = cpu_dict_returned['cpu_count']
You can do this more succinctly using a dictionary comprehension:
doc = {}
doc['timestamp'] = datetime.now()
doc['instance_id'] = "MyMac"
cpu_dict_returned = get_cpu_info()
doc['metrics'] = {
    'cpu_usage':
        {k: cpu_dict_returned.get(k)
         for k in ['user_cpu', 'system_cpu', 'idle_cpu', 'cpu_count']}
}
Note that the sub-dictionary cpu_usage is created first, and then the nested dictionary is inserted into it.
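Not part of either answer, but a standard-library alternative worth mentioning: collections.defaultdict can create the intermediate levels on demand, so you never hit the KeyError in the first place (get_cpu_info is assumed to exist, as in the question):

from collections import defaultdict
from datetime import datetime

doc = defaultdict(lambda: defaultdict(dict))  # missing keys become nested dicts automatically
doc['timestamp'] = datetime.now()
doc['instance_id'] = "MyMac"

cpu_dict_returned = get_cpu_info()
doc['metrics']['cpu_usage']['user_cpu'] = cpu_dict_returned['user_cpu']  # no manual doc['metrics'] = {} needed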
I'm reading a list of sentences and tagging each word with NLTK's Stanford POS tagger. I get outputs like so:
wordnet_sense = []
for o in output:
    a = st.tag(o)
    wordnet_sense.append(a)
outputs: [[(u'feel', u'VB'), (u'great', u'JJ')], [(u'good', u'JJ')]]
I want to map these words to their POS tags so that they are recognised in WordNet.
I've attempted this:
sense = []
for i in wordnet_sense:
    tmp = []
    for tok, pos in i:
        lower_pos = pos[0].lower()
        if lower_pos in ['a', 'n', 'v', 'r', 's']:
            res = wn.synsets(tok, lower_pos)
            if len(res) > 0:
                a = res[0]
        else:
            a = "[{0}, {1}]".format(tok, pos)
        tmp.append(a)
    sense.append(tmp)
print sense
outputs: [[Synset('feel.v.01'), '[great, JJ]'], ['[good, JJ]']]
So feel is recognised as a verb, but great and good are not recognised as adjectives. I've also checked whether great and good actually exist in WordNet, because I thought they weren't being mapped if they weren't there, but they are. Can anyone help?
Here's a cute function from pywsd:
from nltk.corpus import wordnet as wn

def penn2morphy(penntag, returnNone=False):
    morphy_tag = {'NN': wn.NOUN, 'JJ': wn.ADJ,
                  'VB': wn.VERB, 'RB': wn.ADV}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return None if returnNone else ''
def wordnet_pos_code(tag):
    if tag.startswith('NN'):
        return wn.NOUN
    elif tag.startswith('VB'):
        return wn.VERB
    elif tag.startswith('JJ'):
        return wn.ADJ
    elif tag.startswith('RB'):
        return wn.ADV
    else:
        return ''

print wordnet_pos_code('NN')
As well as the answer provided, I've found that this also works.
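For completeness, a minimal usage sketch (my own illustration, assuming the NLTK WordNet corpus is available): either mapping converts the Penn tag 'JJ' to wn.ADJ ('a'), which wn.synsets accepts, whereas the 'j' produced by pos[0].lower() in the question is not a valid WordNet POS, so the adjectives fell through to the fallback branch.

from nltk.corpus import wordnet as wn

# penn2morphy('JJ') returns wn.ADJ, i.e. 'a'
adj_synsets = wn.synsets('great', penn2morphy('JJ'))
print(adj_synsets[0] if adj_synsets else 'not found')  # first adjective synset of 'great'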
I would like to create a bunch of empty lists with names such as:
author1_count = []
author2_count = []
...
...
and so on...but a priori I do not know how many lists I need to generate.
Answers to questions similar to this one suggest creating a dictionary, as in (How to create multiple (but individual) empty lists in Python?), or an array of lists. However, I wish to append values to the lists as in:
def search_list(alist, aname):
    count = 0
    author_index = 0
    author_list = alist
    author_name = aname
    for author in author_list:
        if author == author_name:
            author_index = author_list.index(author) + 1
            count = 1
    return count, author_index

cehw_list = ["Ford, Eric", "Mustang, Jason", "BMW, James", "Mercedes, Megan"]
author_list = []
for author in authors:
    this_author = author.encode('ascii', 'ignore')
    author_list.append(this_author)

# Find if the author is in the author list
for cehw in cehw_list:
    if cehw == cehw_list[0]:
        count0, position0 = search_list(author_list, cehw)
        author1_count.append(count0)
    elif cehw == cehw_list[1]:
        count1, position1 = search_list(author_list, cehw)
        author2_count.append(count1)
    ...
    ...
Any idea how to create such distinct lists? Is there an elegant way to do this?
Dictionaries! You only need to be more specific when appending values, e.g.
>>> author_lists = {}
>>> for i in range(3):
...     author_lists['an' + str(i)] = []
...
>>> author_lists
{'an0': [], 'an1': [], 'an2': []}
>>> author_lists['an0'].append('foo')
>>> author_lists
{'an0': ['foo'], 'an1': [], 'an2': []}
You should still be able to use a dictionary:
data = {}
for cehw in cehw_list:
    count0, position0 = search_list(author_list, cehw)
    # or whatever property of cehw holds the unique identifier
    if cehw in data:
        data[cehw].append(count0)
    else:
        data[cehw] = [count0]
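As a follow-up sketch (my own variation, not part of the answer above), collections.defaultdict(list) removes the need for the membership check entirely:

from collections import defaultdict

data = defaultdict(list)  # a missing key starts out as an empty list
for cehw in cehw_list:
    count0, position0 = search_list(author_list, cehw)
    data[cehw].append(count0)  # no 'if cehw in data' branch needed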