I am trying to lemmatize words in a particular column ('body') using pandas.
I have tried the following code, which I found here:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = nltk.stem.WordNetLemmatizer()
wordnet_lemmatizer = WordNetLemmatizer()
df['body'] = df['body'].apply(lambda x: "".join([Word(word).lemmatize() for word in x)
df['body'].head()
When I attempt to run the code, I get an error message that simply says
File "<ipython-input-41-c002479904b0>", line 33
df['body'] = df['body'].apply(lambda x: "".join([Word(word).lemmatize() for word in x)
^
SyntaxError: invalid syntax
I have also tried the solution presented in this post but didn't have any luck.
UPDATE: this is the full code so far
import pandas as pd
import re
import string
df1 = pd.read_csv('RP_text_posts.csv')
df2 = pd.read_csv('RP_text_comments.csv')
# Renaming columns so the post part - currently 'selftext' matches the post variable in the comments - 'body'
df1.columns = ['author','subreddit','score','num_comments','retrieved_on','id','created_utc','body']
# Dropping columns that aren't subreddit or the post content
df1 = df1.drop(columns=['author','score','num_comments','retrieved_on','id','created_utc'])
df2 = df2.drop(labels=None, columns=['author', 'score', 'created_utc'])
# Combining data
df = pd.concat([df1, df2])
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = nltk.stem.WordNetLemmatizer()
wordnet_lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')
# Lemmatizing
df['body'] = df['body'].apply(lambda x: "".join([Word(word).lemmatize() for word in x)
df['body'].head()
It's missing the end of the lambda function. Note that Word comes from TextBlob (from textblob import Word), and you most likely want a space in the join:
df['words'] = df['words'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x]))
Update
The line should look more like this, but note that the WordNet lemmatizer can only lemmatize for one POS at a time (noun by default, or adjective, verb, adverb):
df['words'] = df['body'].apply(lambda x: " ".join([wordnet_lemmatizer.lemmatize(word) for word in word_tokenize(x)]))
print(df.head())
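For reference, WordNetLemmatizer.lemmatize treats every word as a noun unless you pass a pos argument, for example:
from nltk.stem.wordnet import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
print(wordnet_lemmatizer.lemmatize("loving"))           # 'loving' (noun is the default POS)
print(wordnet_lemmatizer.lemmatize("loving", pos="v"))  # 'love'   (lemmatized as a verb)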
If you want more, you can try the following code:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = nltk.stem.WordNetLemmatizer()
wordnet_lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemmatize_sentence(sentence):
    # tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    # tuples of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:
            # else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)
# Lemmatizing
df['words'] = df['body'].apply(lambda x: lemmatize_sentence(x))
print(df.head())
df result:
body | words
0 Best scores, good cats, it rocks | Best score , good cat , it rock
1 You received best scores | You receive best score
2 Good news | Good news
3 Bad news | Bad news
4 I am loving it | I be love it
5 it rocks a lot | it rock a lot
6 it is still good to do better | it be still good to do good
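Note that the tokenizer, POS tagger, lemmatizer and stopword list all rely on NLTK data packages; if they are not installed yet you will get a LookupError. A one-time download (assuming a standard NLTK installation) looks like this:
import nltk

nltk.download('punkt')                       # tokenizer models used by word_tokenize
nltk.download('averaged_perceptron_tagger')  # POS tagger used by nltk.pos_tag
nltk.download('wordnet')                     # WordNet data for the lemmatizer
nltk.download('omw-1.4')                     # extra WordNet data needed on some newer NLTK versions
nltk.download('stopwords')                   # stopword lists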
Related
The first step is tokenizing the text from the dataframe using NLTK. Then I apply spelling correction using TextBlob, converting the output from a tuple to a string. After that, I need to lemmatize/stem it (using NLTK). The problem is that my output comes back as a stripped string rather than a list of tokens, so it cannot be lemmatized/stemmed.
#create a dataframe
import pandas as pd
import nltk
df = pd.DataFrame({'text': ["spellling", "was", "working cooking listening","studying"]})
#tokenization
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
def tokenize(text):
    return [w for w in w_tokenizer.tokenize(text)]

df["text2"] = df["text"].apply(tokenize)
#spelling correction
def spell_eng(text):
    text = TextBlob(str(text)).correct()
    # convert from tuple to str
    text = functools.reduce(operator.add, (text))
    return text
df['text3'] = df['text2'].apply(spell_eng)
#lemmatization/stemming
def stem_eng(text):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(w, 'v') for w in text]
df['text4'] = df['text3'].apply(stem_eng)
Generated output:
Desired output:
text4
--------------
[spell]
[be]
[work,cook,listen]
[study]
I see where the problem is: the dataframes are storing these arrays as strings, so the lemmatization is not working. Note that this comes from the spell_eng part.
I have written a solution, which is a slight modification of your code.
import pandas as pd
import nltk
from textblob import TextBlob
import functools
import operator
df = pd.DataFrame({'text': ["spellling", "was", "working cooking listening","studying"]})
#tokenization
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
def tokenize(text):
    return [w for w in w_tokenizer.tokenize(text)]
df["text2"] = df["text"].apply(tokenize)
# spelling correction
def spell_eng(text):
    text = [TextBlob(str(w)).correct() for w in text]  # CHANGE
    # convert from tuple to str
    text = [functools.reduce(operator.add, (w)) for w in text]  # CHANGE
    return text
df['text3'] = df['text2'].apply(spell_eng)
# lemmatization/stemming
def stem_eng(text):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(w, 'v') for w in text]
df['text4'] = df['text3'].apply(stem_eng)
df['text4']
Hope these things help.
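As a side note, if the intermediate columns have already been written to a CSV and read back, the lists come back as plain strings (which matches the problem described above). A small sketch of a workaround, assuming df['text3'] was reloaded that way, is to parse the strings back into lists before stemming:
import ast

# Assumption: df['text3'] was reloaded from CSV and now holds strings such as "['spelling']".
df['text3'] = df['text3'].apply(lambda v: ast.literal_eval(v) if isinstance(v, str) else v)
df['text4'] = df['text3'].apply(stem_eng)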
Currently I get just one row back; how can I get all the words? I have a column of words, and the problem is in the stemmer: it returns only one row instead of all the words.
My purpose is to clean the data and print all words separated by commas.
Input: word1,word2,word3,word4,word5 in each row of the column df['Tag'], and the output should be one long list with all the values: word1,word2,word3,word4,word5,word6,word7,...
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
import pandas as pd
import spacy
import pytextrank
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
def Clean_stop_words(data):
    #print(stopwords.words('english'))
    stop_words=stopwords.words('english')
    new_data=""
    for word in data:
        np.char.lower(word)
        if word not in stop_words:
            new_data = data + " , " + word
    print(new_data)
    symbols = "!\"#$%&()*+-./:;<=>?#[\]^_`{|}~\n"
    for i in symbols:
        new_data = np.char.replace(new_text, i, ' ')
    #print(data)
    stemmer=PorterStemmer()
    new_data=stemmer.stem(word)
    #print(new_data)
Clean_stop_words(df["Tag"])
#print(data)
Thank you in advance
Notice -
I decided to clean the special characters with a regex; you can change that method if you wish.
Also note the pandas apply call, which runs the Clean_stop_words function on each row.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
import re
l = ["'word1,wording,w#ord,he##llo,sleeping,don't"]
df = pd.DataFrame(l, columns=['Tag'])
def Clean_stop_words(data):
    stemmer = PorterStemmer()
    stop_words = stopwords.words('english')
    new_data = ""
    data_split = data.split(',')
    for word in data_split:
        word = word.lower()                       # lowercase the word
        word = re.sub('[^A-Za-z0-9]+', '', word)  # strip special characters with a regex
        if word not in stop_words:
            word = stemmer.stem(word)             # stem the word and keep the result
            new_data = new_data + " , " + word
    return new_data
df['Tag'] = df['Tag'].apply(Clean_stop_words)
print(df['Tag'])
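If the goal is one long, comma-separated list across every row (as described in the question), a short follow-up sketch that flattens the cleaned df['Tag'] column from the code above could look like this:
# Flatten every cleaned row into a single list of words and print it comma-separated.
all_words = [w.strip() for row in df['Tag'] for w in row.split(',') if w.strip()]
print(', '.join(all_words))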
I have a dataframe
0 i only need uxy to hit 20 eod to make up for a...
1 oh this isn’t good
2 lads why is my account covered in more red ink...
3 i'm tempted to drop my last 800 into some stup...
4 the sell offs will continue until moral improves.
I want to apply NLP for each comment to identify which one is positive and which one is negative.
Here is what I have
import pandas as pd
import numpy as np
from nltk.corpus import movie_reviews
from random import shuffle
from nltk import FreqDist
from nltk.corpus import stopwords
import string
from nltk import NaiveBayesClassifier
from nltk import classify
from nltk.tokenize import word_tokenize
df = pd.read_csv("/home/yan/PycharmProjects/pythonProject/comments_binary.csv")
pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    pos_reviews.append(words)

neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    neg_reviews.append(words)
stopwords_english = stopwords.words('english')
def bag_of_words(words):
    words_clean = []
    for word in words:
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_clean.append(word)
    words_dictionary = dict([word, True] for word in words_clean)
    return words_dictionary
# positive reviews feature set
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_words(words), 'pos'))
# negative reviews feature set
neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_words(words), 'neg'))
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)
test_set = pos_reviews_set[:200] + neg_reviews_set[:200]
train_set = pos_reviews_set[200:] + neg_reviews_set[200:]
classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, test_set)
custom_review = "I am pretty sure that TSLA will hit 500 today after open"
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_words(custom_review_tokens)
print (classifier.classify(custom_review_set)) # Output: pos
I am confused about how to apply the classifier to each row of text and create a separate column holding 'pos' or 'neg' for each comment.
I tried to create a function
def my_classification(x):
    return classifier.classify(x)

df["new_column"] = df["text"].apply(my_classification)
But it says AttributeError: 'str' object has no attribute 'copy'
I would highly appreciate your help
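For what it's worth, classifier.classify expects a feature dictionary like the one bag_of_words builds for the training sets, not a raw string, which is what triggers the AttributeError. A minimal sketch of the per-row wrapper (the column name 'text' and the new column name 'sentiment' are just illustrative) might look like this:
from nltk.tokenize import word_tokenize

def my_classification(text):
    # Build the same bag-of-words feature dict used during training, then classify it.
    return classifier.classify(bag_of_words(word_tokenize(text)))

df["sentiment"] = df["text"].apply(my_classification)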
I am reading a news article and POS-tagging it with NLTK. I want to remove those lines that do not have a POS tag like CD (numbers).
import io
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
stop_words = set(stopwords.words('english'))
file1 = open("etorg.txt")
line = file1.read()
file1.close()
print(line)
words = line.split()
tokens = nltk.pos_tag(words)
How do I remove all sentences that do not contain the CD tag?
Just use [word for word in tokens if word[1] != 'CD']
EDIT: To get the sentences that have no numbers, use this code:
def has_number(sentence):
    for i in nltk.pos_tag(sentence.split()):
        if i[1] == 'CD':
            return ''
    return sentence
line = 'MNC claims 21 million sales in September. However, industry sources do not confirm this data. It is estimated that the reported sales could be in the range of fifteen to 18 million. '
''.join([has_number(x) for x in line.split('.')])
> ' However, industry sources do not confirm this data '
I am doing a classification task on tweets (3 labels= pos, neg, neutral), for which I'm using Naive Bayes in NLTK. I'd like to add in ngrams (bigrams) as well. I have tried adding them to the code, but I don't seem to get where to fit them right in. At the moment it seems as if I'm "breaking" the code, no matter where I add in the bigrams. Could anybody please help me out, or redirect me to a tutorial?
My code for unigrams follows. If you need any information on how the datasets look, I'd be happy to provide it.
import nltk
import csv
import random
import nltk.classify.util, nltk.metrics
import codecs
import re, math, collections, itertools
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.util import ngrams
from nltk import bigrams
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
stemmer = SnowballStemmer("english", ignore_stopwords = True)
stopset = set(stopwords.words('english'))
stopset.add('username')
stopset.add('url')
stopset.add('percentage')
stopset.add('number')
stopset.add('at_user')
stopset.add('AT_USER')
stopset.add('URL')
stopset.add('percentagenumber')
inpTweets = []
##with open('sanders.csv', 'r', 'utf-8') as f: #input sanders
## reader = csv.reader(f, delimiter = ';')
## for row in reader:
## inpTweets.append((row))
reader = codecs.open('...sanders.csv', 'r', encoding='utf-8-sig') #input classified tweets
for line in reader:
    line = line.rstrip()
    row = line.split(';')
    inpTweets.append((row))
def processTweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    tweet = re.sub('#[^\s]+','AT_USER',tweet)
    tweet = re.sub('[\s]+', ' ', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    tweet = tweet.strip('\'"')
    return tweet

def replaceTwoOrMore(s):
    # look for 2 or more repetitions of a character and replace with the character itself
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
def preprocessing(doc):
    tokens = tokenizer.tokenize(doc)
    bla = []
    for x in tokens:
        if len(x) > 2:
            if x not in stopset:
                val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", x)
                if val is not None:
                    x = replaceTwoOrMore(x)
                    x = processTweet(x)
                    x = x.strip('\'"?,.')
                    x = stemmer.stem(x).lower()
                    bla.append(x)
    return bla
xyz = []
for lijn in inpTweets:
    xyz.append((preprocessing(lijn[0]), lijn[1]))
random.shuffle(xyz)
featureList = []
k = 0
while k in range(0, len(xyz)):
    featureList.extend(xyz[k][0])
    k = k + 1
fd = nltk.FreqDist(featureList)
featureList = list(fd.keys())[2000:]
def document_features(doc):
    features = {}
    document_words = set(doc)
    for word in featureList:
        features['contains(%s)' % word] = (word in document_words)
    return features
featuresets = nltk.classify.util.apply_features(document_features, xyz)
training_set, test_set = featuresets[2000:], featuresets[:2000]
classifier = nltk.NaiveBayesClassifier.train(training_set)
Your code uses the 2000 most common words as the classification features. Just select the bigrams you want to use, and convert them to features in document_features(). A feature like "contains (the dog)" will work just like "contains (dog)".
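As a hedged sketch of that idea (the names all_tokens and top_bigrams are illustrative, and the bigram selection below uses NLTK's collocation finder on the flattened training tokens), document_features() could be extended like this:
from nltk.util import ngrams
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# Flatten the training tokens and keep the top-scoring bigrams (200 is an arbitrary choice).
all_tokens = [tok for doc, label in xyz for tok in doc]
finder = BigramCollocationFinder.from_words(all_tokens)
top_bigrams = finder.nbest(BigramAssocMeasures.chi_sq, 200)

def document_features(doc):
    features = {}
    document_words = set(doc)
    document_bigrams = set(ngrams(doc, 2))
    for word in featureList:
        features['contains(%s)' % word] = (word in document_words)
    for bigram in top_bigrams:
        features['contains(%s %s)' % bigram] = (bigram in document_bigrams)
    return features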
An interesting approach is using a sequential backoff tagger, which allows you to chain taggers together: in this way you could train an n-gram tagger and a Naive Bayes classifier and chain them together.
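If you want to explore the backoff idea, here is a minimal sketch using NLTK's built-in taggers, trained on the treebank sample corpus purely as an illustration (it needs nltk.download('treebank')). NLTK also ships a ClassifierBasedPOSTagger, which trains a Naive Bayes classifier by default and accepts the same backoff argument, so the n-gram plus Naive Bayes chain mentioned above can be built in the same way.
import nltk
from nltk.corpus import treebank

# Chain the taggers: bigram -> unigram -> default, each backing off to the next one.
train_sents = treebank.tagged_sents()[:3000]
default_tagger = nltk.DefaultTagger('NN')
unigram_tagger = nltk.UnigramTagger(train_sents, backoff=default_tagger)
bigram_tagger = nltk.BigramTagger(train_sents, backoff=unigram_tagger)

print(bigram_tagger.tag("this tagger backs off when a bigram is unseen".split()))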