I am having a problem with the naive Bayes classifier: I am trying to analyse some sentences, but I get an error in Python. This is the Trainer class:
from naiveBayesClassifier.trainedData import TrainedData

class Trainer(object):
    """docstring for Trainer"""

    def __init__(self, tokenizer):
        super(Trainer, self).__init__()
        self.tokenizer = tokenizer
        self.data = TrainedData()

    def train(self, text, className):
        """
        enhances trained data using the given text and class
        """
        self.data.increaseClass(className)

        tokens = self.tokenizer.tokenize(text)
        for token in tokens:
            token = self.tokenizer.remove_stop_words(token)
            token = self.tokenizer.remove_punctuation(token)
            self.data.increaseToken(token, className)
This is the error in the console:

tokens = self.tokenizer.tokenize(text)
AttributeError: module 'naiveBayesClassifier.tokenizer' has no attribute 'tokenize'

Does anyone know how to fix the problem? Thanks.
That's the main code:
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

postTrainer = Trainer(tokenizer)

postsSet = [
    {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
    {'text': 'Russia try to invade Ukraine', 'category': 'politics'},
    {'text': 'do not neglect exercise', 'category': 'health'},
    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'}
]

for post in postsSet:
    postTrainer.train(post['text'], post['category'])

postClassifier = Classifier(postTrainer.data, tokenizer)
classification = postClassifier.classify("Obama is")
print(classification)
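Based on the traceback, the tokenizer module itself is being passed to Trainer, so calling tokenize on it fails. A minimal, hedged sketch of one possible fix, assuming the package exposes a Tokenizer class inside naiveBayesClassifier.tokenizer (the constructor arguments shown are assumptions, not taken from the question):

from naiveBayesClassifier.tokenizer import Tokenizer  # assumed class name

# Pass a tokenizer instance to Trainer and Classifier rather than the module itself
# (the constructor parameters below are assumptions)
postTokenizer = Tokenizer(signs_to_remove=["?!#%&"])
postTrainer = Trainer(postTokenizer)
postClassifier = Classifier(postTrainer.data, postTokenizer)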
I am trying to retrieve Twitter data using Tweepy with the code below, but I'm having difficulties collecting the media_fields data. In particular, I want to get the type of each media item, but I have not managed to.
As you can see in the screenshot below, the value is copied into cells that should be empty.
[screenshot of the output][1]
import tweepy
from twitter_authentication import bearer_token
import time
import pandas as pd

client = tweepy.Client(bearer_token, wait_on_rate_limit=True)

hoax_tweets = []
for response in tweepy.Paginator(client.search_all_tweets,
                                 query='Covid hoax -is:retweet lang:en',
                                 user_fields=['username', 'public_metrics', 'description', 'location', 'verified', 'entities'],
                                 tweet_fields=['id', 'in_reply_to_user_id', 'referenced_tweets', 'context_annotations',
                                               'source', 'created_at', 'entities', 'geo', 'withheld', 'public_metrics',
                                               'text'],
                                 media_fields=['media_key', 'type', 'url', 'alt_text',
                                               'public_metrics', 'preview_image_url'],
                                 expansions=['author_id', 'in_reply_to_user_id', 'geo.place_id',
                                             'attachments.media_keys', 'referenced_tweets.id', 'referenced_tweets.id.author_id'],
                                 place_fields=['id', 'name', 'country_code', 'place_type', 'full_name', 'country',
                                               'geo', 'contained_within'],
                                 start_time='2021-01-20T00:00:00Z',
                                 end_time='2021-01-21T00:00:00Z',
                                 max_results=100):
    time.sleep(1)
    hoax_tweets.append(response)
result = []
user_dict = {}
media_dict = {}

# Loop through each response object
for response in hoax_tweets:
    # Take all of the users, and put them into a dictionary of dictionaries with the info we want to keep
    for user in response.includes['users']:
        user_dict[user.id] = {'username': user.username,
                              'followers': user.public_metrics['followers_count'],
                              'tweets': user.public_metrics['tweet_count'],
                              'description': user.description,
                              'location': user.location,
                              'verified': user.verified
                              }
    for media in response.includes['media']:
        media_dict[tweet.id] = {'media_key': media.media_key,
                                'type': media.type
                                }
    for tweet in response.data:
        # For each tweet, find the author's information
        author_info = user_dict[tweet.author_id]
        # Put all of the information we want to keep in a single dictionary for each tweet
        result.append({'author_id': tweet.author_id,
                       'username': author_info['username'],
                       'author_followers': author_info['followers'],
                       'author_tweets': author_info['tweets'],
                       'author_description': author_info['description'],
                       'author_location': author_info['location'],
                       'author_verified': author_info['verified'],
                       'tweet_id': tweet.id,
                       'text': tweet.text,
                       'created_at': tweet.created_at,
                       'retweets': tweet.public_metrics['retweet_count'],
                       'replies': tweet.public_metrics['reply_count'],
                       'likes': tweet.public_metrics['like_count'],
                       'quote_count': tweet.public_metrics['quote_count'],
                       'in_reply_to_user_id': tweet.in_reply_to_user_id,
                       'media': tweet.attachments,
                       'media_type': media,
                       'conversation': tweet.referenced_tweets
                       })

# Change this list of dictionaries into a dataframe
df = pd.DataFrame(result)
Also, when I change 'media': tweet.attachments to 'media': tweet.attachments[0] to get the 'media_key' data, I get the following error message: "TypeError: 'NoneType' object is not subscriptable".
What am I doing wrong? Any suggestions would be appreciated.
[1]: https://i.stack.imgur.com/AxCcl.png
The subscriptable error comes from the fact that tweet.attachments is None, hence the 'NoneType' part. To make it work, you can add a check for None:
'media':tweet.attachments[0] if tweet.attachments else None
I have never used the Twitter API, but one thing to check is whether tweet attachments are always present or whether they may be absent.
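If the goal is the media type rather than the raw attachments object, a hedged sketch for the tweet loop could look like the following. It assumes tweet.attachments, when present, is a dict containing a 'media_keys' list, and that media_dict is keyed by media_key (the code in the question currently keys it by tweet.id, which would need to change for this lookup to work):

media_keys = (tweet.attachments or {}).get('media_keys', [])
first_key = media_keys[0] if media_keys else None
media_type = media_dict.get(first_key, {}).get('type') if first_key else None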
I used the code below to add custom Lookups to a custom Language class:
import json
import os
from typing import Optional

from thinc.api import Model
from spacy.language import Language
from spacy.lookups import Lookups
from spacy.pipeline import Lemmatizer
# LOOKUP is assumed to be imported or defined elsewhere in the project

def create_lookups():
    lookups = Lookups()
    lookups.add_table("lemma_lookup", LOOKUP)
    lookups.add_table("lemma_rules", json_to_dict('lemma_rules.json'))
    lookups.add_table("lemma_index", json_to_dict('lemma_index.json'))
    lookups.add_table("lemma_exc", json_to_dict('lemma_exc.json'))
    return lookups

def json_to_dict(filename):
    location = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))
    with open(os.path.join(location, filename)) as f_in:
        return json.load(f_in)
# CustomLanguage is the custom Language subclass defined elsewhere in the project
@CustomLanguage.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "lookup", "overwrite": False},
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
):
    lemmatizer = Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
    lemmatizer.lookups = create_lookups()
    return lemmatizer
But when I instantiate the CustomLanguage there is no lookup table in nlp.vocab.lookups. What is the problem and how can I solve it?
The lemmatizer lookups are no longer in the vocab. They're stored in the lemmatizer component under nlp.get_pipe("lemmatizer").lookups instead.
If your lemmatizer factory creates the lemmatizer like this, anyone loading the model will need to have these JSON files available, or the model won't load. (The lookup tables are saved in the model, but your make_lemmatizer factory just hasn't been written with this in mind.)
Instead, create a custom lemmatizer class that loads these tables in its initialize method. Your code to add the lemmatizer and load its tables once would then look like this:
nlp = spacy.blank("lg")
nlp.add_pipe("lemmatizer").initialize()
nlp.to_disk("/path/to/model")
Once you've run initialize() once for the lemmatizer, the tables are saved with the model directory and you don't need to run it again when you reload the model.
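For example, reloading the saved pipeline later needs no further initialization (a minimal sketch using the placeholder path from above):

import spacy

nlp = spacy.load("/path/to/model")  # the saved lookup tables come back with the pipeline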
The custom lemmatizer class could look something like this; it would also allow you to pass in a Lookups object to initialize instead if you'd prefer:
from typing import Callable, Iterable, Optional

from spacy.language import Language
from spacy.lookups import Lookups
from spacy.pipeline import Lemmatizer
from spacy.training import Example

class CustomLemmatizer(Lemmatizer):
    def initialize(
        self,
        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
        *,
        nlp: Optional[Language] = None,
        lookups: Optional[Lookups] = None,
    ):
        if lookups is None:
            self.lookups = create_lookups()
        else:
            self.lookups = lookups
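A hedged usage sketch tying this back to the snippet above (the language code and path are the placeholders already used in this answer):

import spacy

# Hypothetical wiring: assumes the "lemmatizer" factory on the custom language
# is updated to return CustomLemmatizer instead of the stock Lemmatizer.
nlp = spacy.blank("lg")
lemmatizer = nlp.add_pipe("lemmatizer")
lemmatizer.initialize(lookups=create_lookups())  # or pass any Lookups object you built yourself
nlp.to_disk("/path/to/model")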
When attempting to find the entities in a long text input, Google Cloud's Natural Language API groups words together and then assigns them an incorrect entity type. Here is my program:
def entity_recognizer(nouns):
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/superaitor/Downloads/link"
    text = ""
    for words in nouns:
        text += words + " "
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(
        content=text.encode('utf-8'),
        type=enums.Document.Type.PLAIN_TEXT)
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16

    entity = client.analyze_entities(document, encoding).entities

    entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                   'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

    for entity in entity:
        #if entity_type[entity.type] is "PERSON":
        print(entity_type[entity.type])
        print(entity.name)
Here nouns is a list of words. I then turn that into a string (I've tried multiple ways of doing so, and all give the same result), yet the program spits out output like:
PERSON
liberty secularism etching domain professor lecturer tutor royalty
government adviser commissioner
OTHER
business view society economy
OTHER
business
OTHER
verge industrialization market system custom shift rationality
OTHER
family kingdom life drunkenness college student appearance income family
brink poverty life writer variety attitude capitalism age process
production factory system
Any input on how to fix this?
To analyze entities in a text you can use a sample from the documentation which looks something like this:
import argparse
import sys

from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
import six


def entities_text(text):
    """Detects entities in the text."""
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(
        content=text,
        type=enums.Document.Type.PLAIN_TEXT)

    # Detects entities in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities

    # entity types from enums.Entity.Type
    entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                   'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

    for entity in entities:
        print('=' * 20)
        print(u'{:<16}: {}'.format('name', entity.name))
        print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
        print(u'{:<16}: {}'.format('metadata', entity.metadata))
        print(u'{:<16}: {}'.format('salience', entity.salience))
        print(u'{:<16}: {}'.format('wikipedia_url',
                                   entity.metadata.get('wikipedia_url', '-')))


entities_text("Donald Trump is president of United States of America")
The output of this sample is:
====================
name : Donald Trump
type : PERSON
metadata : <google.protobuf.pyext._message.ScalarMapContainer object at 0x7fd9d0125170>
salience : 0.9564903974533081
wikipedia_url : https://en.wikipedia.org/wiki/Donald_Trump
====================
name : United States of America
type : LOCATION
metadata : <google.protobuf.pyext._message.ScalarMapContainer object at 0x7fd9d01252b0>
salience : 0.04350961744785309
wikipedia_url : https://en.wikipedia.org/wiki/United_States
As you can see in this example, Entity Analysis inspects the given text for known entities (proper nouns such as public figures, landmarks, etc.). It's not going to provide an entity for every word in the text.
Instead of classifying according to entities, I would use Google's default categories directly, changing
entity = client.analyze_entities(document, encoding).entities
to
categories = client.classify_text(document).categories
and updating the code accordingly. I wrote the following sample code based on this tutorial, further developed on GitHub.
def run_quickstart():
    # [START language_quickstart]
    # Imports the Google Cloud client library
    # [START migration_import]
    from google.cloud import language
    from google.cloud.language import enums
    from google.cloud.language import types
    # [END migration_import]

    # Instantiates a client
    # [START migration_client]
    client = language.LanguageServiceClient()
    # [END migration_client]

    # The text to analyze
    text = u'For its part, India has said it will raise taxes on 29 products imported from the US - including some agricultural goods, steel and iron products - in retaliation for the wide-ranging US tariffs.'
    document = types.Document(
        content=text,
        type=enums.Document.Type.PLAIN_TEXT)

    # Detects the sentiment of the text
    sentiment = client.analyze_sentiment(document=document).document_sentiment

    # Classify content categories
    categories = client.classify_text(document).categories

    # User category feedback
    for category in categories:
        print(u'=' * 20)
        print(u'{:<16}: {}'.format('name', category.name))
        print(u'{:<16}: {}'.format('confidence', category.confidence))

    # User sentiment feedback
    print('Text: {}'.format(text))
    print('Sentiment: {}, {}'.format(sentiment.score, sentiment.magnitude))
    # [END language_quickstart]


if __name__ == '__main__':
    run_quickstart()
Does this solution work for you? If not, why not?
I am using Stanford CoreNLP with Python. I have taken the code from here.
Following is the code :
from stanfordcorenlp import StanfordCoreNLP
from collections import defaultdict
import logging
import json


class StanfordNLP:
    def __init__(self, host='http://localhost', port=9000):
        self.nlp = StanfordCoreNLP(host, port=port,
                                   timeout=30000, quiet=True, logging_level=logging.DEBUG)
        self.props = {
            'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation,sentiment',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

    def word_tokenize(self, sentence):
        return self.nlp.word_tokenize(sentence)

    def pos(self, sentence):
        return self.nlp.pos_tag(sentence)

    def ner(self, sentence):
        return self.nlp.ner(sentence)

    def parse(self, sentence):
        return self.nlp.parse(sentence)

    def dependency_parse(self, sentence):
        return self.nlp.dependency_parse(sentence)

    def annotate(self, sentence):
        return json.loads(self.nlp.annotate(sentence, properties=self.props))

    @staticmethod
    def tokens_to_dict(_tokens):
        tokens = defaultdict(dict)
        for token in _tokens:
            tokens[int(token['index'])] = {
                'word': token['word'],
                'lemma': token['lemma'],
                'pos': token['pos'],
                'ner': token['ner']
            }
        return tokens


if __name__ == '__main__':
    sNLP = StanfordNLP()
    text = r'China on Wednesday issued a $50-billion list of U.S. goods including soybeans and small aircraft for possible tariff hikes in an escalating technology dispute with Washington that companies worry could set back the global economic recovery.The country\'s tax agency gave no date for the 25 percent increase...'
    ANNOTATE = sNLP.annotate(text)
    POS = sNLP.pos(text)
    TOKENS = sNLP.word_tokenize(text)
    NER = sNLP.ner(text)
    PARSE = sNLP.parse(text)
    DEP_PARSE = sNLP.dependency_parse(text)
I am only interested in entity recognition, which is saved in the variable NER. Printing NER gives one output locally, while running the same text on the Stanford website gives a different NER output.
There are two problems with my Python code:
1. '$' and '50-billion' should be combined and named as a single entity. Similarly, I want '25' and 'percent' as a single entity, as shown in the online Stanford output.
2. In my output, 'Washington' is shown as State and 'China' is shown as Country. I want both of them to be shown as 'Loc', as in the Stanford website output. The possible solution to this problem lies in the documentation, but I don't know which model I am using or how to change it.
Here is a way you can solve this:

1. Make sure to download Stanford CoreNLP 3.9.1 and the necessary models jars.

2. Set up the server properties in a file named "ner-server.properties":

   annotators = tokenize,ssplit,pos,lemma,ner
   ner.applyFineGrained = false

3. Start the server with this command:

   java -Xmx12g edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000 -serverProperties ner-server.properties

4. Make sure you've installed this Python package: https://github.com/stanfordnlp/python-stanford-corenlp

5. Run this Python code:
import corenlp

client = corenlp.CoreNLPClient(start_server=False, annotators=["tokenize", "ssplit", "pos", "lemma", "ner"])

sample_text = "Joe Smith was born in Hawaii."

ann = client.annotate(sample_text)

for mention in ann.sentence[0].mentions:
    print([x.word for x in ann.sentence[0].token[mention.tokenStartInSentenceInclusive:mention.tokenEndInSentenceExclusive]])
Here are all the fields available in the EntityMention for each entity:
sentenceIndex: 0
tokenStartInSentenceInclusive: 5
tokenEndInSentenceExclusive: 7
ner: "MONEY"
normalizedNER: "$5.0E10"
entityType: "MONEY"
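For instance, a hedged extension of the loop above that also prints each mention's type, using the ner and normalizedNER fields from the list just shown:

for mention in ann.sentence[0].mentions:
    words = [x.word for x in ann.sentence[0].token[
        mention.tokenStartInSentenceInclusive:mention.tokenEndInSentenceExclusive]]
    print(words, mention.ner, mention.normalizedNER)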
Let's say I have the following class:
import webbrowser

class Management(object):
    def add_accounts(self, data):
        operations = []
        for account_name in data:
            operations.append(
                {'operator': 'ADD',
                 'operand': {'name': account_name}
                 })
        self.added = operations

manager = Management()
manager.add_accounts(['name 1', 'name 2'])
What I want to do is add this function:
def source():
    url = r'http://www.stackoverflow.com/some-help-doc'
    webbrowser.open(url, new=1)
to the add_accounts method so I can type the following:
manager.add_accounts.source()
and have it open my default browser to the help article online.
I've been searching for how to add a method to an already existing method in a class. Is there a name for what I'm trying to do?
As @BrenBam pointed out in the comments, methods do have attributes, and those attributes can be anything, including functions. However, this makes for some weird, un-Pythonic code. If you want this method to show some kind of documentation (as your example suggests), it would be better to just put that information in a docstring. The docstring is where everyone expects such info to be.
This seems like the best option for now:
import webbrowser

class Management(object):
    def __init__(self):
        Management.add_accounts.source = lambda url='https://github.com/': webbrowser.open(url, new=1)

    def add_accounts(self, data):
        operations = []
        for account_name in data:
            operations.append(
                {'operator': 'ADD',
                 'operand': {'name': account_name}
                 })
        self.added = operations

manager = Management()
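With that in place, usage might look like this (the URL is just the placeholder set in __init__ above):

manager.add_accounts(['name 1', 'name 2'])
manager.add_accounts.source()  # attribute lookup on the bound method falls through to the underlying function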