check sentence against vocabulary - python

I want to get the 50 most common words from a corpus and then check whether these words are present in each sentence. I want to iterate through all sentences and print a feature vector (1 if the word is in the sentence and 0 if not). I wrote this code, but it only shows 0 (false). Any ideas?
import nltk
from nltk import FreqDist
from nltk.corpus import brown

news = brown.words(categories='news')
news_sents = brown.sents(categories='news')

fdist = FreqDist(w.lower() for w in news)
word_features = list(fdist.values())[:50]

num_sents = len(news_sents)
for i in range(num_sents):
    features = {}
    for word in word_features:
        features[word] = int(word in news_sents[i])
    print(features)
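The vector is all zeros because fdist.values() returns frequency counts rather than words, so word_features ends up as a list of integers that can never appear in a sentence. A minimal sketch of the intended check, using FreqDist.most_common (one possible fix; lowercasing the sentence tokens is an added assumption so they match the lowercased vocabulary):

import nltk
from nltk import FreqDist
from nltk.corpus import brown

news = brown.words(categories='news')
news_sents = brown.sents(categories='news')

fdist = FreqDist(w.lower() for w in news)
# most_common returns (word, count) pairs; keep only the words
word_features = [w for w, _ in fdist.most_common(50)]

for sent in news_sents[:5]:  # first few sentences, for illustration
    sent_words = {w.lower() for w in sent}
    features = {w: int(w in sent_words) for w in word_features}
    print(features)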

Related

TfidfModel "too many values to unpack" error

I am attempting to cluster a group of phrases using TfidfModel from the gensim python package. I am running into a problem with the input to TfidfModel, specifically ValueError: too many values to unpack (expected 2).
Here is my code:
import re
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from gensim.similarities import MatrixSimilarity
from sklearn.cluster import KMeans
def preprocess_text(text):
    # Remove punctuation and make all characters lowercase
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    # Tokenize the text
    tokens = simple_preprocess(text)
    return tokens

def cluster_phrases(phrases, num_clusters):
    # Preprocess the phrases
    preprocessed_phrases = [preprocess_text(phrase) for phrase in phrases]
    # Create a Tf-Idf model from the preprocessed phrases
    tfidf = TfidfModel(preprocessed_phrases)
    # Compute the similarity matrix between all the phrases
    similarity_matrix = MatrixSimilarity(tfidf[preprocessed_phrases])
    # Cluster the phrases using KMeans
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(similarity_matrix)
    return kmeans.labels_

phrases = ["the dog jumps high", "the dog jumps above", "The duck quacks", "A duck makes a sound", "the dog hops", "the cat is a dog", "the cat is beautiful"]
cluster_labels = cluster_phrases(phrases, num_clusters=3)
for i, label in enumerate(cluster_labels):
    print(f"Phrase '{phrases[i]}' is in Cluster {label}")
I have tried looking up what the correct input should be. Or am I supposed to be giving the model two lists?
I also tried joining like so:
# Preprocess the phrases
preprocessed_phrases2 = []
preprocessed_phrases = [preprocess_text(phrase) for phrase in phrases]
for x in preprocessed_phrases:
    x = ' '.join(x)
    print(x)
    preprocessed_phrases2.append(x)
# Create a Tf-Idf model from the preprocessed phrases
tfidf = TfidfModel(preprocessed_phrases2)
Question: How can I get TfidfModel to cluster phrases properly?
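For reference, the error arises because TfidfModel expects a bag-of-words corpus (lists of (token_id, count) pairs), not lists of raw tokens or joined strings. A rough sketch of the pipeline with a gensim Dictionary inserted in between, keeping the original KMeans-on-similarities idea (the cluster count and KMeans settings are just illustrative):

import re
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import MatrixSimilarity
from sklearn.cluster import KMeans

def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text).lower()
    return simple_preprocess(text)

def cluster_phrases(phrases, num_clusters):
    texts = [preprocess_text(phrase) for phrase in phrases]
    dictionary = Dictionary(texts)                             # map tokens to integer ids
    bow_corpus = [dictionary.doc2bow(text) for text in texts]  # bag-of-words vectors
    tfidf = TfidfModel(bow_corpus)                             # TfidfModel wants (id, count) pairs
    index = MatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))
    sim_matrix = index[tfidf[bow_corpus]]                      # (n_phrases, n_phrases) array
    kmeans = KMeans(n_clusters=num_clusters, n_init=10)
    return kmeans.fit_predict(sim_matrix)

phrases = ["the dog jumps high", "the dog jumps above", "The duck quacks",
           "A duck makes a sound", "the dog hops", "the cat is a dog", "the cat is beautiful"]
for phrase, label in zip(phrases, cluster_phrases(phrases, num_clusters=3)):
    print(f"Phrase '{phrase}' is in Cluster {label}")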

Gensim for similarities

I have a dataframe in pandas of organisation descriptions and project titles, shown below:
Columns are df['org_name'], df['org_description'], df['proj_title']. I want to add a column with the similarity score between the organisation description and project title, for each project (each row).
I'm trying to use gensim: https://radimrehurek.com/gensim/auto_examples/core/run_similarity_queries.html. However, I'm not sure how to adapt the tutorial for my use case, because in the tutorial we take a new query doc = "Human computer interaction" and then compare it against the documents in the corpus individually. I'm not sure where this choice is made (sims? vec_lsi?).
But I want the similarity score for just the two items in a given row of the dataframe df, not one of them against the whole corpus, for each row, and then append that to df as a column. How can I do this?
Here is an adaptation of the Gensim LSI tutorial, where the description represents a corpus of sentences and the title is the query made against it.
from gensim.models import LsiModel
from collections import defaultdict
from gensim import corpora
def desc_title_sim(desc, title):
    # remove common words and tokenize
    stoplist = set('for a of the and to in'.split())  # add a longer stoplist here
    sents = desc.split('.')  # crude sentence tokenizer
    texts = [
        [word for word in sent.lower().split() if word not in stoplist]
        for sent in sents
    ]
    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [
        [token for token in text if frequency[token] > 1]
        for text in texts
    ]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
    vec_bow = dictionary.doc2bow(title.lower().split())
    vec_lsi = lsi[vec_bow]  # convert the query to LSI space
    return vec_lsi
Apply the function row-wise to get similarity:
df['sim'] = df.apply(lambda row: desc_title_sim(row['org_description'], row['proj_title']), axis=1)
The newly created sim column will then hold the title's LSI-space vector for each row, with values like
[(0, 0.4618210045327158), (1, 0.07002766527900064)]
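Note that desc_title_sim as written stops at the query projection, so what comes back is the title in LSI space rather than a single similarity number. If a scalar score per row is wanted, one possible extension, reusing the tutorial's MatrixSimilarity index over the description sentences (a sketch; taking the max over sentences is an arbitrary choice of aggregation), is:

from gensim import similarities

# inside desc_title_sim, replace "return vec_lsi" with:
index = similarities.MatrixSimilarity(lsi[corpus])  # index the description's sentences in LSI space
sims = index[vec_lsi]                               # cosine similarity of the title to each sentence
return float(sims.max())                            # e.g. score against the best-matching sentence

With that change, df['sim'] holds one float per row instead of a list of (topic, weight) pairs.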

python nltk -- stemming list of sentences/phrases

I have a bunch of sentences in a list and I want to use the nltk library to stem them. I am able to stem one sentence at a time, however I am having issues stemming sentences from a list and joining them back together. Is there a step I am missing? I'm quite new to the nltk library. Thanks!
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

ps = PorterStemmer()

# Success: one sentence at a time
data = 'the gamers playing games'
words = word_tokenize(data)
for w in words:
    print(ps.stem(w))

# Fails:
data_list = ['the gamers playing games',
             'higher scores',
             'sports']
words = word_tokenize(data_list)
for w in words:
    print(ps.stem(w))
# Error: TypeError: expected string or bytes-like object

# result should be:
# ['the gamer play game',
#  'higher score',
#  'sport']
You're passing a list to word_tokenize, which you can't do; it expects a single string.
The solution is to wrap your logic in another for loop:
data_list = ['the gamers playing games', 'higher scores', 'sports']
for words in data_list:
    words = word_tokenize(words)
    for w in words:
        print(ps.stem(w))
Output:
the
gamer
play
game
higher
score
sport
To stem the sentences and recompile them back into a list data structure, I'd go for:
ps = PorterStemmer()
data_list_s = []
for words in data_list:
    words = word_tokenize(words)
    words_s = ''
    for w in words:
        w_s = ps.stem(w)
        words_s += w_s + ' '
    data_list_s.append(words_s)
This will put the stemmed results of each element from data_list into a new list called data_list_s.
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
sentence = """At eight o'clock on Thursday morning, Arthur didn't feel very good. So i take him to hospital."""
sentence = sentence.lower()
word_tokens = nltk.word_tokenize(sentence)
sent_tokens = sent_tokenize(sentence)
stemmer = PorterStemmer()
stemmed_word = []
stemmed_sent = []
for token in word_tokens:
    stemmed_word.append(stemmer.stem(token))
for sent_token in sent_tokens:
    stemmed_sent.append(stemmer.stem(sent_token))
print(stemmed_word)
print(stemmed_sent)
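If you just want the stemmed sentences back in their original list shape, the same loop can be written more compactly as a list comprehension over word_tokenize and PorterStemmer (just a sketch of the idea above):

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

ps = PorterStemmer()
data_list = ['the gamers playing games', 'higher scores', 'sports']
data_list_s = [' '.join(ps.stem(w) for w in word_tokenize(s)) for s in data_list]
print(data_list_s)  # ['the gamer play game', 'higher score', 'sport']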

How to remove stop words using nltk or python

I have a dataset from which I would like to remove stop words.
I used NLTK to get a list of stop words:
from nltk.corpus import stopwords
stopwords.words('english')
Exactly how do I compare the data to the list of stop words, and thus remove the stop words from the data?
from nltk.corpus import stopwords
# ...
filtered_words = [word for word in word_list if word not in stopwords.words('english')]
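One practical note: stopwords.words('english') returns a plain list and is re-evaluated for every word in the comprehension above, so converting it to a set once makes the membership test much cheaper. A small variation on the same idea, reusing the same word_list:

from nltk.corpus import stopwords

stop_set = set(stopwords.words('english'))
filtered_words = [word for word in word_list if word not in stop_set]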
You could also do a set diff, for example:
list(set(nltk.regexp_tokenize(sentence, pattern, gaps=True)) - set(nltk.corpus.stopwords.words('english')))
To exclude all types of stop words, including the nltk stop words, you could do something like this:
from stop_words import get_stop_words
from nltk.corpus import stopwords
stop_words = list(get_stop_words('en'))        # about 900 stopwords
nltk_words = list(stopwords.words('english'))  # about 150 stopwords
stop_words.extend(nltk_words)
output = [w for w in word_list if w not in stop_words]
I suppose you have a list of words (word_list) from which you want to remove stopwords. You could do something like this:
filtered_word_list = word_list[:]  # make a copy of the word_list
for word in word_list:  # iterate over word_list
    if word in stopwords.words('english'):
        filtered_word_list.remove(word)  # remove word from filtered_word_list if it is a stopword
There's a very simple, lightweight Python package, stop-words, just for this purpose.
First install the package using:
pip install stop-words
Then you can remove your stop words in one line using a list comprehension:
from stop_words import get_stop_words
filtered_words = [word for word in dataset if word not in get_stop_words('english')]
This package is very lightweight to download (unlike nltk), works for both Python 2 and Python 3, and it has stop words for many other languages, such as:
Arabic
Bulgarian
Catalan
Czech
Danish
Dutch
English
Finnish
French
German
Hungarian
Indonesian
Italian
Norwegian
Polish
Portuguese
Romanian
Russian
Spanish
Swedish
Turkish
Ukrainian
Here is my take on this, in case you want to immediately get the answer into a string (instead of a list of filtered words):
STOPWORDS = set(stopwords.words('english'))
text = ' '.join([word for word in text.split() if word not in STOPWORDS]) # delete stopwords from text
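For instance (assuming the nltk stopword list has already been downloaded):

from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))
text = "this is a sample sentence showing off the stop words filtration"
text = ' '.join([word for word in text.split() if word not in STOPWORDS])
print(text)  # stop words such as 'this', 'is', 'a', 'off' and 'the' are dropped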
Use the textcleaner library to remove stopwords from your data.
Follow this link: https://yugantm.github.io/textcleaner/documentation.html#remove_stpwrds
Follow these steps to do so with this library.
pip install textcleaner
After installing:
import textcleaner as tc
data = tc.document(<file_name>)
# you can also pass a list of sentences to the document class constructor
data.remove_stpwrds()  # inplace is set to False by default
Use the above code to remove the stop words.
Although the question is a bit old, here is a new library worth mentioning that can do extra tasks.
In some cases, you don't only want to remove stop words. Rather, you may want to find the stop words in the text data and store them in a list, so that you can locate the noise in the data and make it more interactive.
The library is called 'textfeatures'. You can use it as follows:
! pip install textfeatures
import textfeatures as tf
import pandas as pd
For example, suppose you have the following set of strings:
texts = [
    "blue car and blue window",
    "black crow in the window",
    "i see my reflection in the window"]
df = pd.DataFrame(texts)  # Convert to a dataframe
df.columns = ['text']     # give a name to the column
df
Now, call the stopwords() function and pass the parameters you want:
tf.stopwords(df, "text", "stopwords")  # extract stop words
df[["text", "stopwords"]].head()       # show the text and its stop words
The result is going to be:
                                text         stopwords
0           blue car and blue window             [and]
1           black crow in the window         [in, the]
2  i see my reflection in the window  [i, my, in, the]
As you can see, the last column has the stop words found in that document (record).
You can use this function; note that you need to lowercase all the words:
from nltk.corpus import stopwords

def remove_stopwords(word_list):
    processed_word_list = []
    for word in word_list:
        word = word.lower()  # in case they aren't all lowercased
        if word not in stopwords.words("english"):
            processed_word_list.append(word)
    return processed_word_list
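A quick usage example, assuming the tokens come from nltk's word_tokenize:

from nltk.tokenize import word_tokenize

tokens = word_tokenize("This is a Sample sentence")
print(remove_stopwords(tokens))  # ['sample', 'sentence']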
using filter:
from nltk.corpus import stopwords
# ...
filtered_words = list(filter(lambda word: word not in stopwords.words('english'), word_list))
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)
filtered_sentence = [w for w in word_tokens if w not in stop_words]
filtered_sentence = []
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
print(word_tokens)
print(filtered_sentence)
I will show you an example.
First, I extract the text data from the data frame (twitter_df) for further processing, as follows:
from nltk.tokenize import word_tokenize
tweetText = twitter_df['text']
Then, to tokenize, I use the following method:
from nltk.tokenize import word_tokenize
tweetText = tweetText.apply(word_tokenize)
Then, to remove stop words:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
tweetText = tweetText.apply(lambda x: [word for word in x if word not in stop_words])
tweetText.head()
I think this will help you.
In case your data is stored as a pandas DataFrame, you can use remove_stopwords from texthero, which uses the NLTK stopword list by default.
import pandas as pd
import texthero as hero
df['text_without_stopwords'] = hero.remove_stopwords(df['text'])
