Use a trained sentiment model on a dataframe - python

I have a dataframe:
0 i only need uxy to hit 20 eod to make up for a...
1 oh this isn’t good
2 lads why is my account covered in more red ink...
3 i'm tempted to drop my last 800 into some stup...
4 the sell offs will continue until moral improves.
I want to apply NLP to each comment to identify whether it is positive or negative.
Here is what I have:
import pandas as pd
import numpy as np
from nltk.corpus import movie_reviews
from random import shuffle
from nltk import FreqDist
from nltk.corpus import stopwords
import string
from nltk import NaiveBayesClassifier
from nltk import classify
from nltk.tokenize import word_tokenize
df = pd.read_csv("/home/yan/PycharmProjects/pythonProject/comments_binary.csv")
pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    pos_reviews.append(words)

neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    neg_reviews.append(words)

stopwords_english = stopwords.words('english')

def bag_of_words(words):
    words_clean = []
    for word in words:
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_clean.append(word)
    words_dictionary = dict([word, True] for word in words_clean)
    return words_dictionary

# positive reviews feature set
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_words(words), 'pos'))

# negative reviews feature set
neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_words(words), 'neg'))

shuffle(pos_reviews_set)
shuffle(neg_reviews_set)

test_set = pos_reviews_set[:200] + neg_reviews_set[:200]
train_set = pos_reviews_set[200:] + neg_reviews_set[200:]

classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, test_set)

custom_review = "I am pretty sure that TSLA will hit 500 today after open"
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_words(custom_review_tokens)
print(classifier.classify(custom_review_set))  # Output: pos
I am confused about how to apply the classifier to each row of text and create a separate column with pos or neg labels describing each comment.
I tried to create a function
def my_classification(x):
    return classifier.classify(x)

df["new_column"] = df["text"].apply(my_classification)
But it says AttributeError: 'str' object has no attribute 'copy'
I would highly appreciate your help
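A minimal sketch of one way to make the apply work, assuming the comment column is really named "text": classifier.classify expects the same bag-of-words feature dictionary that was used for training, not a raw string, so each row has to be tokenized and converted first.

def my_classification(text):
    # build the same feature-dict format the classifier was trained on
    features = bag_of_words(word_tokenize(text))
    return classifier.classify(features)

# "text" is assumed here; adjust to the real column name in comments_binary.csv
df["sentiment"] = df["text"].apply(my_classification)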

Related

Assign Topic from NNMF Topic Modelling

I have a list of text comments that are fed into a non-negative matrix factorization topic modelling program.
import pandas as pd
import numpy as np
# load the data
import csv
with open('C:\\...\\comments.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader)  # skip header
    df = [tuple(row) for row in reader]
# set the number of topics
total_topics = 3
# process the data
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords
data_text = pd.DataFrame(df,columns=['text'])
# remove stopwords and tokenize the text
custom_stops = ["stopword1", "stopword2", "stopword3"]
data_text['filtered_text'] = data_text['text'].apply(lambda x: remove_stopwords(x.lower()))
data_text['filtered_text'] = data_text['filtered_text'].apply(lambda x: str.split(x))
data_text['filtered_text'] = data_text['filtered_text'].apply(lambda x: [item for item in x if item.lower() not in custom_stops])
CORPUS = pd.DataFrame(data_text['filtered_text'])
# Remove empty strings
CORPUS.dropna(inplace=True)
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
# lemmatize the text
for index, entry in enumerate(CORPUS['filtered_text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e. if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for stop words and consider only alphabetic tokens
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    CORPUS.loc[index, 'text_final'] = str(Final_words)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def build_feature_matrix(documents, feature_type='frequency'):
    feature_type = feature_type.lower().strip()
    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=1, ngram_range=(1, 1))
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=1, ngram_range=(1, 1))
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1))
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")
    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix
# create a feature matrix
vectorizer, tfidf_matrix = build_feature_matrix(CORPUS['text_final'], feature_type='tfidf')
td_matrix = tfidf_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)
from sklearn.decomposition import NMF
nmf = NMF(n_components=total_topics, random_state=42, alpha=.1, l1_ratio=.5)
nmf.fit(tfidf_matrix)
def get_topics_terms_weights(weights, feature_names):
    feature_names = np.array(feature_names)
    sorted_indices = np.array([list(row[::-1])
                               for row
                               in np.argsort(np.abs(weights))])
    sorted_weights = np.array([list(wt[index])
                               for wt, index
                               in zip(weights, sorted_indices)])
    sorted_terms = np.array([list(feature_names[row])
                             for row
                             in sorted_indices])
    topics = [np.vstack((terms.T,
                         term_weights.T)).T
              for terms, term_weights
              in zip(sorted_terms, sorted_weights)]
    return topics
def print_topics_udf(topics, total_topics=1,
                     weight_threshold=0.0001,
                     display_weights=False,
                     num_terms=None):
    for index in range(total_topics):
        topic = topics[index]
        topic = [(term, float(wt))
                 for term, wt in topic]
        topic = [(word, round(wt, 2))
                 for word, wt in topic
                 if abs(wt) >= weight_threshold]
        if display_weights:
            print('Topic #' + str(index+1) + ' with weights')
            print(topic[:num_terms] if num_terms else topic)
        else:
            print('Topic #' + str(index+1) + ' without weights')
            tw = [term for term, wt in topic]
            print(tw[:num_terms] if num_terms else tw)
        print()
feature_names = vectorizer.get_feature_names()
weights = nmf.components_
topics = get_topics_terms_weights(weights, feature_names)
# print topics and weights
# print_topics_udf(topics=topics,total_topics=total_topics,num_terms=None,display_weights=False)
# print topics with weights
# print_topics_udf(topics=topics,total_topics=total_topics,num_terms=None,display_weights=True)
# display the topics
# this takes the top term from each group and assigns it as the topic theme
for index in range(0, total_topics):
    print("Topic", index+1, "=", topics[index][0][0])
The example output may be something like:
Topic 1 = problem
Topic 2 = software
Topic 3 = recommendation
How can I assign a specific comment from the file a specific topic? e.g., the comment "My computer has an issue of turning off intermittently" would be mapped to Topic 1 "problem"
The answer is to transform the document-term matrix to pull out the factorized document-topic matrix:
W = nmf.fit_transform(tfidf_matrix)
where tfidf_matrix = W x H, with W the document-topic matrix and H the topic-term matrix. Slide 25 of the following link gives a good visualization of this technique:
http://derekgreene.com/slides/topic-modelling-with-scikitlearn.pdf
Thus, the highest value in a comment's row of W indicates its assigned topic. I iterated across the rows to assign the topics via
data_text['topic'] = ""
for row in range(len(data_text['text'])):
    data_text['topic'][row] = topics[np.argmax(W[row])][0][0]
To extend the example in the question, if data_text['text'][1] is "My computer has an issue of turning off intermittently", the corresponding row W[1] might be [0.5412, 0.0201, 0.0]. Since the highest value is in the first column, this sentence is mapped to the first topic (i.e., the 'problem' topic). The text label of that topic is assigned to data_text['topic'][1] via topics[np.argmax(W[row])][0][0]
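As a side note, a minimal sketch of an equivalent vectorized assignment, assuming W and topics are built as above (it also avoids the chained indexing that pandas warns about):

# top term of each topic, in topic order
topic_labels = [topics[i][0][0] for i in range(total_topics)]
# index of the strongest topic per document row, mapped to its label
data_text['topic'] = [topic_labels[i] for i in np.argmax(W, axis=1)]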

Syntax error when lemmatizing column in pandas

I am trying to lemmatize words in a particular column ('body') using pandas.
I have tried the following code, which I found here:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = nltk.stem.WordNetLemmatizer()
wordnet_lemmatizer = WordNetLemmatizer()
df['body'] = df['body'].apply(lambda x: "".join([Word(word).lemmatize() for word in
df['body'].head()
When I attempt to run the code, I get an error message that simply says
File "<ipython-input-41-c002479904b0>", line 33
df['body'] = df['body'].apply(lambda x: "".join([Word(word).lemmatize() for word in x)
^
SyntaxError: invalid syntax
I have also tried the solution presented in this post but didn't have any luck.
UPDATE: this is the full code so far
import pandas as pd
import re
import string
df1 = pd.read_csv('RP_text_posts.csv')
df2 = pd.read_csv('RP_text_comments.csv')
# Renaming columns so the post part - currently 'selftext' matches the post variable in the comments - 'body'
df1.columns = ['author','subreddit','score','num_comments','retrieved_on','id','created_utc','body']
# Dropping columns that aren't subreddit or the post content
df1 = df1.drop(columns=['author','score','num_comments','retrieved_on','id','created_utc'])
df2 = df2.drop(labels=None, columns=['author', 'score', 'created_utc'])
# Combining data
df = pd.concat([df1, df2])
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = nltk.stem.WordNetLemmatizer()
wordnet_lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')
# Lemmatizing
df['body'] = df['body'].apply(lambda x: "".join([Word(word).lemmatize() for word in x)
df['body'].head()
It's missing the end of the lambda function:
df['words'] = df['words'].apply(lambda x: "".join([Word(word).lemmatize() for word in x]))
Update
The line should look more like the one below, but note that this lemmatizes with only one POS at a time (noun by default, or adjective, or verb, ...):
df['words'] = df['body'].apply(lambda x: " ".join([wordnet_lemmatizer.lemmatize(word) for word in word_tokenize(x)]))
print(df.head())
If you want more, you can try the following code:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = nltk.stem.WordNetLemmatizer()
wordnet_lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemmatize_sentence(sentence):
    # tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    # tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:
            # else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

# Lemmatizing
df['words'] = df['body'].apply(lambda x: lemmatize_sentence(x))
print(df.head())
df result:
body | words
0 Best scores, good cats, it rocks | Best score , good cat , it rock
1 You received best scores | You receive best score
2 Good news | Good news
3 Bad news | Bad news
4 I am loving it | I be love it
5 it rocks a lot | it rock a lot
6 it is still good to do better | it be still good to do good

Find the number of appearances of a keyword found in a list in another list

import nltk
import random
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from collections import Counter
import csv
import pandas as pd
Obtain data from CSV:
address = 'filepath'
example = pd.read_csv(address)
review_column = example.Reviews
The data obtained is then inserted into a list:
reviews = []
for w in review_column:
    reviews.append(w)
The data in the list is further cleaned by removing stopwords and punctuation:
reviews = [word for word in reviews if word not in stopwords.words('english')]
reviews = [word for word in reviews if word not in punctuation]
#reviews = [word_tokenize(i) for i in reviews]
Obtain data from CSV
address = 'filepath'
example = pd.read_csv(address)
keywords_column = example.Keywords
Insert Data into List
keywords = []
for w in keywords_column:
    keywords.append(w)
#keywords_tokenised = []
#keywords_tokenised = [word_tokenize(i) for i in keywords]
frequency_list = []
frequency = 0
for word in keywords:
    if word == reviews:
        frequency += 1
        frequency_list.append((word, frequency))
    else:
        frequency_list.append((word, frequency))
print(frequency_list)
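For reference, a minimal sketch of one way to count how often each keyword appears across the reviews, using the Counter and word_tokenize already imported above and assuming reviews holds the review strings and keywords the keyword strings. (The check word == reviews compares a single keyword against the whole list, so the frequency never increments.)

# count every token across all reviews once
token_counts = Counter(token.lower() for review in reviews for token in word_tokenize(review))

# look up each keyword's frequency in that counter
frequency_list = [(keyword, token_counts[keyword.lower()]) for keyword in keywords]
print(frequency_list)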

CSV file with label

As suggested in Python Tf idf algorithm, I use this code to get the frequency of words over a set of documents.
import pandas as pd
import csv
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import codecs
def tokenize(text):
    tokens = word_tokenize(text)
    stems = []
    for item in tokens: stems.append(PorterStemmer().stem(item))
    return stems

with codecs.open("book1.txt",'r','utf-8') as i1,\
     codecs.open("book2.txt",'r','utf-8') as i2,\
     codecs.open("book3.txt",'r','utf-8') as i3:
    # your corpus
    t1=i1.read().replace('\n',' ')
    t2=i2.read().replace('\n',' ')
    t3=i3.read().replace('\n',' ')
text = [t1,t2,t3]
# word tokenize and stem
text = [" ".join(tokenize(txt.lower())) for txt in text]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(text).todense()
# transform the matrix to a pandas df
matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names())
# sum over each document (axis=0)
top_words = matrix.sum(axis=0).sort_values(ascending=False)
top_words.to_csv('dict.csv', index=True, float_format="%f",encoding="utf-8")
With the last line, I create a CSV file that lists all the words and their frequencies. Is there a way to label them, so I can see whether a word belongs only to the third document or to all of them?
My goal is to delete from the CSV file all the words that appear only in the 3rd document (book3).
You can use the isin() method to filter the top_words of the third book out of the top_words of the entire corpus.
(For the example below I downloaded three random books from http://www.gutenberg.org/)
import codecs
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# import nltk
# nltk.download('punkt')
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
def tokenize(text):
    tokens = word_tokenize(text)
    stems = []
    for item in tokens: stems.append(PorterStemmer().stem(item))
    return stems

with codecs.open("56732-0.txt",'r','utf-8') as i1,\
     codecs.open("56734-0.txt",'r','utf-8') as i2,\
     codecs.open("56736-0.txt",'r','utf-8') as i3:
    # your corpus
    t1=i1.read().replace('\n',' ')
    t2=i2.read().replace('\n',' ')
    t3=i3.read().replace('\n',' ')
text = [t1,t2,t3]
# word tokenize and stem
text = [" ".join(tokenize(txt.lower())) for txt in text]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(text).todense()
# transform the matrix to a pandas df
matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names())
# sum over each document (axis=0)
top_words = matrix.sum(axis=0).sort_values(ascending=False)
# top_words for the 3rd book alone
text = [" ".join(tokenize(t3.lower()))]
matrix = vectorizer.fit_transform(text).todense()
matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names())
top_words3 = matrix.sum(axis=0).sort_values(ascending=False)
# Mask out words in t3
mask = ~top_words.index.isin(top_words3.index)
# Filter those words from top_words
top_words = top_words[mask]
top_words.to_csv('dict.csv', index=True, float_format="%f",encoding="utf-8")
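One caveat: the mask above drops every word that occurs in book3 at all, while the question asks to drop words that appear only in book3. A minimal sketch of that stricter filter, assuming the same variable names as above (it additionally vectorizes books 1 and 2 so that words shared with them are kept):

# top_words for books 1 and 2 together
text12 = [" ".join(tokenize(t1.lower())), " ".join(tokenize(t2.lower()))]
matrix12 = vectorizer.fit_transform(text12).todense()
matrix12 = pd.DataFrame(matrix12, columns=vectorizer.get_feature_names())
top_words12 = matrix12.sum(axis=0)

# words that occur in book3 but not in book1 or book2
only_in_book3 = top_words3.index[~top_words3.index.isin(top_words12.index)]

# drop only those exclusive words from the corpus-wide list
top_words = top_words[~top_words.index.isin(only_in_book3)]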

NLTK: Naive Bayes - where/how to add in ngrams?

I am doing a classification task on tweets (3 labels= pos, neg, neutral), for which I'm using Naive Bayes in NLTK. I'd like to add in ngrams (bigrams) as well. I have tried adding them to the code, but I don't seem to get where to fit them right in. At the moment it seems as if I'm "breaking" the code, no matter where I add in the bigrams. Could anybody please help me out, or redirect me to a tutorial?
My code for unigrams follows. If you need any information on how the datasets look, I'd be happy to provide it.
import nltk
import csv
import random
import nltk.classify.util, nltk.metrics
import codecs
import re, math, collections, itertools
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.util import ngrams
from nltk import bigrams
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
stemmer = SnowballStemmer("english", ignore_stopwords = True)
stopset = set(stopwords.words('english'))
stopset.add('username')
stopset.add('url')
stopset.add('percentage')
stopset.add('number')
stopset.add('at_user')
stopset.add('AT_USER')
stopset.add('URL')
stopset.add('percentagenumber')
inpTweets = []
##with open('sanders.csv', 'r', 'utf-8') as f: #input sanders
## reader = csv.reader(f, delimiter = ';')
## for row in reader:
## inpTweets.append((row))
reader = codecs.open('...sanders.csv', 'r', encoding='utf-8-sig') #input classified tweets
for line in reader:
    line = line.rstrip()
    row = line.split(';')
    inpTweets.append((row))
def processTweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    tweet = re.sub('#[^\s]+','AT_USER',tweet)
    tweet = re.sub('[\s]+', ' ', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    tweet = tweet.strip('\'"')
    return tweet
def replaceTwoOrMore(s):
    # look for 2 or more repetitions of a character and replace with the character itself
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
def preprocessing(doc):
    tokens = tokenizer.tokenize(doc)
    bla = []
    for x in tokens:
        if len(x) > 2:
            if x not in stopset:
                val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", x)
                if val is not None:
                    x = replaceTwoOrMore(x)
                    x = processTweet(x)
                    x = x.strip('\'"?,.')
                    x = stemmer.stem(x).lower()
                    bla.append(x)
    return bla
xyz = []
for lijn in inpTweets:
    xyz.append((preprocessing(lijn[0]), lijn[1]))
random.shuffle(xyz)
featureList = []
k = 0
while k in range (0, len(xyz)):
    featureList.extend(xyz[k][0])
    k = k + 1
fd = nltk.FreqDist(featureList)
featureList = list(fd.keys())[2000:]
def document_features(doc):
    features = {}
    document_words = set(doc)
    for word in featureList:
        features['contains(%s)' % word] = (word in document_words)
    return features
featuresets = nltk.classify.util.apply_features(document_features, xyz)
training_set, test_set = featuresets[2000:], featuresets[:2000]
classifier = nltk.NaiveBayesClassifier.train(training_set)
Your code uses the 2000 most common words as the classification features. Just select the bigrams you want to use, and convert them to features in document_features(). A feature like "contains (the dog)" will work just like "contains (dog)".
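For illustration, a minimal sketch of one way to wire bigrams into document_features(), assuming the xyz and featureList structures from the question. bigramList is a hypothetical helper built the same way as featureList, not part of the original code.

from nltk import bigrams, FreqDist

# hypothetical: collect bigrams over all preprocessed tweets and keep the most common ones
all_bigrams = [bg for tokens, label in xyz for bg in bigrams(tokens)]
bigramList = [bg for bg, _ in FreqDist(all_bigrams).most_common(2000)]

def document_features(doc):
    features = {}
    document_words = set(doc)
    document_bigrams = set(bigrams(doc))
    for word in featureList:
        features['contains(%s)' % word] = (word in document_words)
    # a bigram feature like contains(the dog) works just like contains(dog)
    for bg in bigramList:
        features['contains(%s %s)' % bg] = (bg in document_bigrams)
    return features

# then rebuild featuresets and retrain exactly as in the question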
An interesting approach is using a sequential backoff tagger, which allows you to chain taggers together: in this way you could train an n-gram tagger and a Naive Bayes and chain them together.
