ValueError: empty vocabulary - python

I'm new to Python and trying to create a text classification program as part of a school project.
Using the code below with various (unedited) libraries, including NumPy, scikit-learn and others, I keep getting the same error:
Traceback (most recent call last):
File "C:/Users/esg1/Python/Learning Python/stackabuse.com example/MediaBiasDetectionClassification.py", line 49, in <module>
X = vectorizer.fit_transform(documents).toarray()
File "C:\Users\esg1\Python\lib\site-packages\sklearn\feature_extraction\text.py", line 1010, in fit_transform
vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
File "C:\Users\esg1\Python\lib\site-packages\sklearn\feature_extraction\text.py", line 941, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only contain stop words")
ValueError: empty vocabulary; perhaps the documents only contain stop words
The code I'm working from is:
#importing libraries
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
from sklearn.ensemble import RandomForestClassifier
import pickle
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
stemmer = WordNetLemmatizer()
#importing the dataset
mediaBias_data = load_files(r"C:\Users\esg1\Desktop\Course\Year 3\Individual Project\Data Gathering")
X, y = mediaBias_data.data, mediaBias_data.target
#text preprocessing
documents = []
for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    # Converting to Lowercase
    document = document.lower()
    # Lemmatization
    document = document.split()
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    documents.append(document)
#converting text to numbers
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()
#finding Term Frequency Inverse Document Frequency (TFIDF)
#TF
#TermFrequency = (Number of Occurrences of a word)/(Total words in the document)
#IDF
#IDF(word) = Log((Total number of documents)/(Number of documents containing the word))
#TFIDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()
#training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
#training the text classification model and predicting sentiment
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
#predicting
y_pred = classifier.predict(X_test)
#evaluating the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
#saving and loading the model
#save
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier, picklefile)
#load
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)
#We loaded our trained model and stored it in the model variable.
#Let's predict the sentiment for the test set using our loaded model and see if we can get the same results.
y_pred2 = model.predict(X_test)
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))
Any advice on how to overcome the error would be appreciated!
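A note for anyone debugging the same thing: "empty vocabulary" usually means the list passed to fit_transform is empty, or everything in it was removed by the stop-word list and the min_df=5 / max_df=0.7 cut-offs. The lines below are only a rough diagnostic sketch under that assumption, reusing the documents and mediaBias_data names from the code above:
# Diagnostic sketch: check what actually reaches the vectorizer.
print(len(mediaBias_data.data))   # 0 means load_files found nothing in the category subfolders
print(len(documents))             # should match the number of loaded files
print(documents[:3])              # inspect a few cleaned documents for real words

# min_df=5 drops any term appearing in fewer than 5 documents; with a tiny corpus
# that alone can empty the vocabulary, so relaxing it is a useful test.
from sklearn.feature_extraction.text import CountVectorizer
test_vectorizer = CountVectorizer()   # no min_df/max_df/stop_words while debugging
test_features = test_vectorizer.fit_transform(documents)
print(len(test_vectorizer.vocabulary_))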

Related

Jupyter Notebook Multiple Cell Issue

I currently have this piece of code:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
####################################################################################
nltk.download('punkt')
nltk.download('stopwords')
dataset = pd.read_csv('car_reviews.csv')
ps = PorterStemmer()
####################################################################################
data = []
for i in range(dataset.shape[0]):
    text = dataset.iloc[i, 1]
    text = re.sub('[^A-Za-z]', ' ', text)
    text = text.lower()
    tokenized_text = word_tokenize(text)
    processed_text = [ps.stem(word) for word in tokenized_text if word not in set(stopwords.words('english'))]
    final_text = " ".join(processed_text)
    data.append(final_text)
####################################################################################
matrix = CountVectorizer()
X = matrix.fit_transform(data).toarray()
Y = dataset.iloc[:, 0]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
print('The number of reviews in the training set is: ' + str(len(X_train)) + '.')
print('The number of reviews in the test set is: ' + str(len(X_test)) + '.')
####################################################################################
classifier = MultinomialNB()
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
cf_matrix = confusion_matrix(Y_test, Y_pred)
classification_report = classification_report(Y_test, Y_pred)
accuracy = accuracy_score(Y_test, Y_pred)
print('Accuracy: %.2f%% ' % (accuracy * 100.0))
The #### lines indicate a new cell, so in total there are 5 cells. When I restart the notebook, everything runs fine and I get an output. However, when I just run the last cell with the multinomial Naive Bayes, I get a TypeError saying 'numpy.ndarray' object is not callable for my confusion matrix, and I have no clue why. How would I go about fixing this?
It was due to my terrible programming habit of giving variables the same names as the confusion_matrix and classification_report functions. After renaming the variables, it works fine.
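For reference, a minimal sketch of a fixed last cell, with the result variables renamed so they no longer shadow the imported sklearn functions (the new names are my own choice):
# Last cell, with results stored under names that don't shadow the functions
classifier = MultinomialNB()
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)

cf_matrix = confusion_matrix(Y_test, Y_pred)     # not: confusion_matrix = confusion_matrix(...)
report = classification_report(Y_test, Y_pred)   # not: classification_report = classification_report(...)
accuracy = accuracy_score(Y_test, Y_pred)
print(cf_matrix)
print(report)
print('Accuracy: %.2f%% ' % (accuracy * 100.0))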

NLP - Found input variables with inconsistent numbers of samples

So I'm trying to train a model to recognise greetings in a sample dataset collected from Tripadvisor, and I've been getting the following error when I try to split the data into training and test sets.
Here's the link to the dataset - https://nextit-public.s3-us-west-2.amazonaws.com/rsics.html?fbclid=IwAR0CktLQtuPBaZNk03odCKdrjN3LjYl_ouuFBbWvyj-yQ-BvzJ0v_n9w9xo
Here's my code:
import streamlit as st
import numpy as np
import pandas as pd
# NLP Pkgs
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import os
# Main Stuff
st.title("Greetings NLP - Presence")
st.subheader("Created using Streamlit - Harshil Parikh ")
# Loading the data into streamlit
#st.cache
def load_data(nrows):
    #data = pd.read_csv('/Users/harshilparikh/Desktop/INT/data/selections.csv', nrows=nrows)
    dataset = st.cache(pd.read_csv)('/Users/harshilparikh/Desktop/INT/data/selections.csv')
    return dataset
data_load_state = st.text('Loading data...')
dataset = load_data(1000)
data_load_state.text('Data loaded.')
#Displaying all data first
if st.checkbox('Show Raw data'):
    st.subheader('Raw Data')
    st.write(dataset)
# GREETING TAB
st.subheader('Greetings')
greet = st.sidebar.multiselect("Select Greeting", dataset['Greeting'].unique())
select = dataset[(dataset['Greeting'].isin(greet))]
# SEPARATING ONLY TWO COLUMNS FROM THE DATA
greet_select = select[['Greeting','Selected']]
select_check = st.checkbox("Display records with greeting")
if select_check:
    st.write(greet_select)
#Text- Preprocessing - Range from 0 to 6758 total feedback
nltk.download('stopwords')
corpus = []
for i in range(0, 6758):
    review = re.sub('[^a-zA-Z]', '', str(dataset['Selected'][i]))
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ''.join(review)
corpus.append(review)
#BAG OF WORDS
cv = CountVectorizer(max_features = 6758)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
st.write(X)
st.write(y)
st.write(cv)
#Training sets (800 values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
#X_train[0, 0:10] #First 10 rows of the first column of X_train.
# NLP - Naive Bayes algorithm
classifier = GaussianNB()
classifier.fit(X_train, y_train)
I'm trying to learn simple NLP. Any help will be appreciated.
Error I'm getting
ValueError: Found input variables with inconsistent numbers of samples: [1, 6759]
Traceback:
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/streamlit/script_runner.py", line 332, in _run_script
exec(code, module.__dict__)
File "/Users/harshilparikh/Desktop/INT/first.py", line 90, in <module>
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/model_selection/_split.py", line 2127, in train_test_split
arrays = indexable(*arrays)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py", line 292, in indexable
check_consistent_length(*result)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py", line 255, in check_consistent_length
raise ValueError("Found input variables with inconsistent numbers of"
Your error occurs when calling train_test_split: X and y need to be the same length, which is not the case here. I suspect the problem is in your for-loop. Instead of adding every review to your corpus, you only add the last one after leaving the loop. Try this instead:
for i in range(0, 6758):
    review = re.sub('[^a-zA-Z]', '', str(dataset['Selected'][i]))
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ''.join(review)
    corpus.append(review)

How to Make Predictions of Data Using Sklearn's RandomForestClassifier

I followed the tutorial at https://stackabuse.com/text-classification-with-python-and-scikit-learn/ and successfully built and saved the model using my own data, but I don't know how to test a new document against it. I have a bunch of documents as plain strings, like string = "Whatever and more of whatever", and I need to know what code to run to push these documents through my model. My code is the same as the tutorial's, apart from the files I load. To solve my problem I tried classifier.predict(string), which gave me the error ValueError: could not convert string to float. Any help would be appreciated.
import re
import nltk
from sklearn.datasets import load_files
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
stemmer = WordNetLemmatizer()
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
doctor_data = load_files(r"pathtodata")
X, y = doctor_data.data, doctor_data.target
documents = []
for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    # Converting to Lowercase
    document = document.lower()
    # Lemmatization
    document = document.split()
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    documents.append(document)
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier, picklefile)
Update:
I tried to convert my document to the proper format using this code
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(MYDOC).toarray()
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()
pred = model.predict(X)
print(pred)
And this is the error I got
ValueError: Number of features of the model must match the input. Model n_features is 897 and input n_features is 149
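That mismatch (897 vs 149 features) is the usual sign of fitting a second, new vectorizer on the new documents instead of reusing the one fitted on the training corpus. Below is only a hedged sketch of the alternative, assuming the vectorizer and tfidfconverter objects fitted during training are still in scope (or were pickled alongside the classifier), and that MYDOC is a list of preprocessed strings:
# Reuse the already-fitted objects; transform() maps new text onto the original
# 897-term training vocabulary instead of learning a new, smaller one.
X_new = vectorizer.transform(MYDOC).toarray()
X_new = tfidfconverter.transform(X_new).toarray()
pred = model.predict(X_new)
print(pred)
Note that pickling only the classifier is not enough for this to work in a fresh session; the fitted CountVectorizer and TfidfTransformer (or one sklearn Pipeline wrapping all three steps) would need to be saved as well.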

ValueError: too many values to unpack (NLTK classifier)

I'm doing classification analysis using NLTK's Naive Bayes classifier. I load a tsv file containing records and labels, but the file doesn't get trained because of an error. Here's my Python code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv('tweets.txt', delimiter ='\t', quoting = 3)
dataset.isnull().any()
dataset = dataset.fillna(method='ffill')
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0,16004):
    tweet = re.sub('[^a-zA-Z]', ' ', dataset['tweet'][i])
    tweet = tweet.lower()
    tweet = tweet.split()
    ps = PorterStemmer()
    tweet = [ps.stem(word) for word in tweet if not word in set(stopwords.words('english'))]
    tweet = ' '.join(tweet)
    corpus.append(tweet)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 10000)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
train_set, test_set = X_train[500:], y_train[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
The error is:
File "C:\Users\HSR\Anaconda2\lib\site-packages\nltk\classify\naivebayes.py", line 194, in train
for featureset, label in labeled_featuresets:
ValueError: too many values to unpack
NLTK's NaiveBayesClassifier doesn't work like scikit-learn estimators. It requires X and y together in a single structure, which is then passed to train().
But in your code you are only supplying X_train, so it tries to unpack y from that, hence the error.
The NaiveBayesClassifier requires the input to be a list of tuples, where the list holds the training samples and each tuple contains a feature dictionary and the label. Something like:
X = [({feature1:'val11', feature2:'val12' .... }, class1),
({feature1:'val21', feature2:'val22' .... }, class2),
...
... ]
You need to change your input to this format.
feature_names = cv.get_feature_names()
train_set = []
for i, single_sample in enumerate(X):
    single_feature_dict = {}
    for j, single_feature in enumerate(single_sample):
        single_feature_dict[feature_names[j]] = single_feature
    train_set.append((single_feature_dict, y[i]))
Note: The above for loop can be shortened by using dict comprehension but I'm not that fluent there.
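For completeness, a minimal sketch of the dict-comprehension version the note refers to, using the same variable names as above:
# Same conversion, with the inner loop replaced by a dict comprehension
feature_names = cv.get_feature_names()
train_set = [
    ({feature_names[j]: value for j, value in enumerate(single_sample)}, y[i])
    for i, single_sample in enumerate(X)
]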
Then you can do this:
nltk.NaiveBayesClassifier.train(train_set)

How can I use word2vec to train a classifier?

This code generates word2vec embeddings and uses them to train a naive Bayes classifier.
I am able to generate the word2vec model and use its similarity functions successfully. As a next step I want to use the word2vec output to train the naive Bayes classifier. Currently the code gives an error when I try to split the data into test and training sets. How do I convert the word2vec model into an array so that it can be used as training data?
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import gensim
# Importing the dataset
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    # for word2vec we want an array of vectors
    corpus.append(review)
#print(corpus)
X = gensim.models.Word2Vec(corpus, min_count=1,size=1000)
#print (X.most_similar("love"))
#embedding_matrix = np.zeros(len(X.wv.vocab), dtype='float32')
#for i in range(len(X.wv.vocab)):
# embedding_vector = X.wv[X.wv.index2word[i]]
# if embedding_vector is not None:
# embedding_matrix[i] = embedding_vector
# Creating the Bag of Words model
#from sklearn.feature_extraction.text import CountVectorizer
#cv = CountVectorizer(max_features = 1500)
#X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
It gives an error on these lines:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
TypeError: Expected sequence or array-like, got <class 'gensim.models.word2vec.Word2Vec'>
Word2Vec provides word embeddings only. If you want to characterize documents by embeddings, you'll need to perform an averaging/summing/max operation over the embeddings of all words in each document, so that you end up with one D-dimensional vector per document that can be used for classification. See here and there for further information on this.
Otherwise, you can use a Doc2Vec model to directly produce document embeddings; gensim provides a very good implementation of it as well.
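As a rough sketch of that Doc2Vec route (parameter values are arbitrary, and it assumes a recent gensim plus the corpus list of token lists built in the question):
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# One TaggedDocument per tokenised review so Doc2Vec learns a vector per document
tagged = [TaggedDocument(words=tokens, tags=[i]) for i, tokens in enumerate(corpus)]
d2v = Doc2Vec(tagged, vector_size=100, min_count=1, epochs=20)

# Fixed-length document vectors usable as X for train_test_split / GaussianNB
X = np.array([d2v.infer_vector(tokens) for tokens in corpus])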
You have vectors for each word; now there are two approaches to move forward. One is to simply take the average of all the word vectors in a sentence to get the sentence vector, another is to weight them with tf-idf.
I implemented the average approach in one of my ongoing projects and I am sharing the GitHub link; go to the heading "text vectorization(word2vec)" and you will find the code there.
https://github.com/abhibhargav29/SentimentAnalysis/blob/master/SentimentAnalysis.ipynb. I would, however, suggest reading the data cleaning part before it as well, to understand it better.
One important piece of advice: do not split the data into train, cv and test sets after vectorization; do it before vectorization, or you will overfit the model.
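A minimal sketch of the averaging approach described above, assuming the trained Word2Vec model X and the corpus list of token lists from the question (the model is rebound to w2v here so the name X stays free for the feature matrix):
import numpy as np

w2v = X  # the trained gensim Word2Vec model from the question
dim = w2v.vector_size

def document_vector(tokens):
    # Average the vectors of the tokens the model knows about; zeros if none are known
    vecs = [w2v.wv[t] for t in tokens if t in w2v.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

X_features = np.array([document_vector(tokens) for tokens in corpus])
# X_features has shape (n_documents, dim) and can go into train_test_split / GaussianNB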
