How to Make Predictions of Data Using Sklearn's RandomForestClassifier - python

I followed this website here https://stackabuse.com/text-classification-with-python-and-scikit-learn/ and have successfully completed the model and saved it using my own data, however I don't know how to test a new document on the model. I have a bunch of documents in a string format like so: string = "Whatever and more of whatever" and I just need to know what code I need to run to test these documents through my model. My code is the exact same as the website with the only difference being the files I have loaded and to solve my problem I tried to use classifier.predict(string) and it gave me the error ValueError: could not convert string to float. Any help would be appreciated.
# Train a RandomForest text classifier on a folder of documents, report its
# scores on a held-out 20% split, and pickle the fitted classifier.
import pickle
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.datasets import load_files
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

nltk.download('wordnet')
nltk.download('stopwords')
stemmer = WordNetLemmatizer()

doctor_data = load_files(r"pathtodata")
X, y = doctor_data.data, doctor_data.target

# Clean every raw document into a lower-cased, lemmatized, space-separated string.
documents = []
for raw_document in X:
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(raw_document))
    # Remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove single characters from the start. The anchor must be a bare ^;
    # an escaped \^ only matches a literal caret, which \W has already removed.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document)
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    # Converting to Lowercase
    document = document.lower()
    # Lemmatization
    document = ' '.join(stemmer.lemmatize(word) for word in document.split())
    documents.append(document)

# Bag of words followed by TF-IDF weighting; both objects are fitted on `documents`.
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

# NOTE(review): only the classifier is pickled here; the fitted `vectorizer`
# and `tfidfconverter` are not saved by this script.
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier, picklefile)
Update:
I tried to convert my document to the proper format using this code
# Attempt to vectorize a new document (MYDOC) and classify it with `model`.
# NOTE(review): a brand-new CountVectorizer and TfidfTransformer are created and
# fitted on MYDOC here, so the feature columns come from MYDOC's own vocabulary
# rather than from the objects fitted during training. That is consistent with
# the feature-count mismatch reported below.
# NOTE(review): `MYDOC` and `model` are defined outside this snippet -- presumably
# the new text and the unpickled classifier; confirm.
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(MYDOC).toarray()
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()
pred = model.predict(X)
print(pred)
And this is the error I got
ValueError: Number of features of the model must match the input. Model n_features is 897 and input n_features is 149

Related

Machine learning Spam Classification

# Spam classifier: clean the e-mail text, build a bag-of-words matrix and fit a
# multinomial Naive Bayes model on an 80/20 train/test split.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
dataset = pd.read_csv(r'emails.csv')
dataset.columns #Index(['text', 'spam'], dtype='object')
dataset.shape #(5728, 2)
#Checking for duplicates and removing them
dataset.drop_duplicates(inplace = True)
dataset.shape #(5695, 2)
#Checking for any null entries in the dataset
print (pd.DataFrame(dataset.isnull().sum()))
'''
text 0
spam 0
'''
#Using Natural Language Processing to cleaning the text to make one corpus
# Cleaning the texts
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
#Every mail starts with 'Subject :' will remove this from each text
# NOTE(review): text[1:] drops only the first character of each mail, not the
# whole 'Subject :' prefix the comment above mentions -- confirm the intent.
dataset['text']=dataset['text'].map(lambda text: text[1:])
# Replace runs of non-alphanumerics with a space, lower-case, and split into tokens.
dataset['text'] = dataset['text'].map(lambda text:re.sub('[^a-zA-Z0-9]+', ' ',text)).apply(lambda x: (x.lower()).split())
ps = PorterStemmer()
# Build the stop-word set once; the original rebuilt it for every single token.
stop_words = set(stopwords.words('english'))
# Drop stop words, stem what is left, and join each mail back into one string.
corpus = dataset['text'].apply(lambda tokens: ' '.join(ps.stem(word) for word in tokens if word not in stop_words))
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus.values).toarray()
y = dataset.iloc[:, 1].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
# Fitting Naive Bayes classifier to the Training set
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
classifier.fit(X_train , y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
So far I have only practised on numeric data; I switched to text just today. I have this model ready with 0.98 accuracy, but when I try to predict on a new text input I get errors.
# Attempt to classify a new message.
subject = "hello this is a test"
# NOTE(review): the raw string is handed straight to predict(); it has not been
# transformed by the fitted `cv` that produced the matrix the classifier was
# trained on.
classifier.predict([[subject]])
the error which I got is
FutureWarning: Beginning in version 0.22, arrays of bytes/strings will be converted to decimal numbers if dtype='numeric'. It is recommended that you convert the array to a float dtype before using it in scikit-learn, for example by using your_array = your_array.astype(np.float64).
return f(**kwargs)
ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 29223 is different from 1)
Any suggestions or possible solutions which I can try?
I also tried converting the sentence
# Second attempt: vectorize the sentence before predicting.
from sklearn.feature_extraction.text import TfidfTransformer
# NOTE(review): both objects below are freshly constructed and transform() is
# called on them without any prior fit.
vectorizer = CountVectorizer()
tfidfconverter = TfidfTransformer()
text = "Hello world!"
text = vectorizer.transform([text]).toarray()
text = tfidfconverter.transform(text).toarray()
# Take the single predicted label out of the returned array.
label = classifier.predict(text)[0]
but got a NotFittedError: Vocabulary not fitted or provided.

Jupyter Notebook Multiple Cell Issue

I currently have this piece of code:
# Notebook script (the rows of # separate cells): load car reviews, clean and
# stem them, build a bag-of-words matrix, and fit/evaluate a multinomial
# Naive Bayes classifier.
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
####################################################################################
nltk.download('punkt')
nltk.download('stopwords')
dataset = pd.read_csv('car_reviews.csv')
ps = PorterStemmer()
####################################################################################
# Build the stop-word set once; the original rebuilt it for every token.
stop_words = set(stopwords.words('english'))
data = []
# The review text is read from the second column of the CSV.
for text in dataset.iloc[:, 1]:
    text = re.sub('[^A-Za-z]', ' ', text)
    text = text.lower()
    tokenized_text = word_tokenize(text)
    processed_text = [ps.stem(word) for word in tokenized_text if word not in stop_words]
    final_text = " ".join(processed_text)
    data.append(final_text)
####################################################################################
matrix = CountVectorizer()
X = matrix.fit_transform(data).toarray()
# The labels are read from the first column of the CSV.
Y = dataset.iloc[:, 0]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
print('The number of reviews in the training set is: ' + str(len(X_train)) + '.')
print('The number of reviews in the test set is: ' + str(len(X_test)) + '.')
####################################################################################
classifier = MultinomialNB()
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
cf_matrix = confusion_matrix(Y_test, Y_pred)
# NOTE(review): this rebinds the imported classification_report function to its
# own return value, so running this cell a second time calls that result
# instead of the function.
classification_report = classification_report(Y_test, Y_pred)
accuracy = accuracy_score(Y_test, Y_pred)
print('Accuracy: %.2f%% ' % (accuracy * 100.0))
The # indicate a new cell. So in total we have 5 cells. When I restart the notebook everything runs fine and I get an output. However, when I just run the last cell of the multinomial Naive Bayes, I get a numpy.ndarray error saying object is not callable for my confusion matrix and I have no clue why. How would I go about fixing this?
It was due to my terrible programming habits and renaming the variables for the confusion_matrix and classification_report the same thing. After changing the variable names, it works fine.

NLP - Found input variables with inconsistent numbers samples

So I'm trying to train a model to read the greetings from the sample dataset collected from Tripadvisor and I've been getting the following error when I am trying to train the model sets.
Here's the link to the dataset - https://nextit-public.s3-us-west-2.amazonaws.com/rsics.html?fbclid=IwAR0CktLQtuPBaZNk03odCKdrjN3LjYl_ouuFBbWvyj-yQ-BvzJ0v_n9w9xo
Here's my code;
# Streamlit app: load the selections CSV, show it, let the user filter by
# greeting, then build a bag-of-words matrix and fit a Gaussian Naive Bayes model.
import streamlit as st
import numpy as np
import pandas as pd
# NLP Pkgs
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import os
# Main Stuff
st.title("Greetings NLP - Presence")
st.subheader("Created using Streamlit - Harshil Parikh ")
# Loading the data into streamlit
#st.cache
def load_data(nrows):
    """Read selections.csv through st.cache and return the result.

    `nrows` is accepted but not used: the row-limited read is commented out.
    """
    #data = pd.read_csv('/Users/harshilparikh/Desktop/INT/data/selections.csv', nrows=nrows)
    dataset = st.cache(pd.read_csv)('/Users/harshilparikh/Desktop/INT/data/selections.csv')
    return dataset
data_load_state = st.text('Loading data...')
dataset = load_data(1000)
data_load_state.text('Data loaded.')
#Displaying all data first
if st.checkbox('Show Raw data'):
    st.subheader('Raw Data')
    st.write(dataset)
# GREETING TAB
st.subheader('Greetings')
greet = st.sidebar.multiselect("Select Greeting", dataset['Greeting'].unique())
select = dataset[(dataset['Greeting'].isin(greet))]
# SEPARATING ONLY TWO COLUMNS FROM THE DATA
greet_select = select[['Greeting','Selected']]
select_check= st.checkbox("Display records with greeting")
if select_check:
    st.write(greet_select)
#Text- Preprocessing - Range from 0 to 6758 total feedback
nltk.download('stopwords')
corpus = []
for i in range(0, 6758):
    review = re.sub('[^a-zA-Z]', '', str(dataset['Selected'][i]))
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ''.join(review)
# NOTE(review): this append runs once, after the loop has finished, so corpus
# ends up holding only the last review. That matches the single sample
# reported in the error below.
corpus.append(review)
#BAG OF WORDS
cv = CountVectorizer(max_features = 6758)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
st.write(X)
st.write(y)
st.write(cv)
#Training sets (800 values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
#X_train[0, 0:10] #First 10 rows of the first column of X_train.
# NLP - Naive Bayes algorithm
classifier = GaussianNB()
classifier.fit(X_train, y_train)
I'm trying to learn simple NLP. Any help will be appreciated.
Error I'm getting
ValueError: Found input variables with inconsistent numbers of samples: [1, 6759]
Traceback:
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/streamlit/script_runner.py", line 332, in _run_script
exec(code, module.__dict__)
File "/Users/harshilparikh/Desktop/INT/first.py", line 90, in <module>
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/model_selection/_split.py", line 2127, in train_test_split
arrays = indexable(*arrays)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py", line 292, in indexable
check_consistent_length(*result)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py", line 255, in check_consistent_length
raise ValueError("Found input variables with inconsistent numbers of"
Your error occurs when calling the function train_test_split: X and y need to be of the same length, which is not the case. I suspect that the problem arises in your for-loop. Instead of adding every review to your corpus, you only add the last one, after the for-loop has finished. Try this instead:
# Clean every row of the 'Selected' column and add one entry per row to corpus,
# so that corpus (and therefore X) has as many samples as y.
# The stemmer and stop-word set do not change between rows: build them once.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
# Iterate over the whole column rather than a hard-coded range(0, 6758): the
# traceback reports 6759 labels, so a fixed 6758 would still leave X one short.
for text in dataset['Selected']:
    # Replace non-letters with a space (not ''); otherwise all the words of a
    # review are glued into one token before split() runs.
    review = re.sub('[^a-zA-Z]', ' ', str(text))
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in stop_words]
    # The append must be inside the loop: one corpus entry per review.
    corpus.append(' '.join(review))

ValueError: empty vocabulary

I'm new to Python and trying to create a text classification program as part of a piece of work for school.
Using the following code and various (unedited) libraries including NumPy, scikit-learn and others, I keep getting the same error:
Traceback (most recent call last):
File "C:/Users/esg1/Python/Learning Python/stackabuse.com example/MediaBiasDetectionClassification.py", line 49, in <module>
X = vectorizer.fit_transform(documents).toarray()
File "C:\Users\esg1\Python\lib\site-packages\sklearn\feature_extraction\text.py", line 1010, in fit_transform
vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
File "C:\Users\esg1\Python\lib\site-packages\sklearn\feature_extraction\text.py", line 941, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only contain stop words")
ValueError: empty vocabulary; perhaps the documents only contain stop words
The code I'm working from is:
# Train, evaluate, pickle and reload a RandomForest text classifier on a folder
# of documents.
#importing libraries
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
import pickle
from nltk.corpus import stopwords
# The loop below calls stemmer.lemmatize(); the lemmatizer was never imported or
# created in the original snippet. It needs the NLTK 'wordnet' corpus installed.
from nltk.stem.wordnet import WordNetLemmatizer
stemmer = WordNetLemmatizer()
#importing the dataset
mediaBias_data = load_files(r"C:\Users\esg1\Desktop\Course\Year 3\Individual Project\Data Gathering")
X, y = mediaBias_data.data, mediaBias_data.target
#text preprocessing
documents = []
for raw_document in X:
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(raw_document))
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove single characters from the start. The anchor must be a bare ^;
    # an escaped \^ only matches a literal caret, which \W has already removed.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document)
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    # Converting to Lowercase
    document = document.lower()
    # Lemmatization
    document = ' '.join(stemmer.lemmatize(word) for word in document.split())
    documents.append(document)
#converting text to numbers
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): the traceback shows the error raised while the vocabulary is
# being counted, i.e. no tokens were found at all. If load_files returned no
# documents, `documents` is an empty list at this point -- confirm the data
# path and that it contains one sub-folder per category.
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()
#finding Term Frequency Inverse Document Frequency (TFIDF)
#TF
#TermFrequency = (Number of Occurrences of a word)/(Total words in the document)
#IDF
#IDF(word) = Log((Total number of documents)/(Number of documents containing the word))
#TFIDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()
#training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
#training test classification model and predicting sentiment
# RandomForestClassifier was used without being imported in the original snippet.
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
#predicting
y_pred = classifier.predict(X_test)
#evaluating the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
#saving and loading the model
#save
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier, picklefile)
#load
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)
#We loaded our trained model and stored it in the model variable.
#Let's predict the sentiment for the test set using our loaded model and see if we can get the same results.
# This assignment was swallowed into the preceding comment in the original, so
# y_pred2 was undefined when the prints below ran.
y_pred2 = model.predict(X_test)
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))
Any advice on how to overcome the error would be appreciated!

ValueError: too many values to unpack (NLTK classifier)

I'm doing classification analysis using NLTK's Naive Bayes classifier. I insert a tsv file containing records and labels.
But the file doesn't get trained due to an error. Here's my python code
# Clean a TSV of tweets, build a bag-of-words matrix, split it, and train
# NLTK's Naive Bayes classifier.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv('tweets.txt', delimiter ='\t', quoting = 3)
dataset.isnull().any()
# Forward-fill missing values; fillna(method='ffill') is deprecated in recent pandas.
dataset = dataset.ffill()
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
# The stemmer and stop-word set do not change between tweets: build them once.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
# Iterate over the whole column instead of a hard-coded row count, so corpus
# always has one entry per row of the dataset.
for text in dataset['tweet']:
    tweet = re.sub('[^a-zA-Z]', ' ', text)
    tweet = tweet.lower().split()
    tweet = [ps.stem(word) for word in tweet if word not in stop_words]
    corpus.append(' '.join(tweet))
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 10000)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
# sklearn.cross_validation was removed from scikit-learn; train_test_split
# lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20,
                                                    random_state = 0)
# NOTE(review): train_set holds only feature rows (X_train[500:]) and test_set
# only labels (y_train[:500]). train() iterates over (featureset, label) pairs,
# which is where the "too many values to unpack" error below is raised.
train_set, test_set = X_train[500:], y_train[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
The error is:
File "C:\Users\HSR\Anaconda2\lib\site-packages\nltk\classify\naivebayes.py", line 194, in train
for featureset, label in labeled_featuresets:
ValueError: too many values to unpack
NLTKClassifier doesn't work like scikit estimators. It requires the X and y both in a single array which is then passed to train().
But in your code, you are only supplying it the X_train and it tries to unpack y from that and hence the error.
The NaiveBayesClassifier requires the input to be a list of tuples where list denotes the training samples and the tuple has the feature dictionary and label inside. Something like:
X = [({feature1:'val11', feature2:'val12' .... }, class1),
({feature1:'val21', feature2:'val22' .... }, class2),
...
... ]
You need to change your input to this format.
# Convert the count matrix X and the labels y into the list of
# (feature_dict, label) tuples that nltk.NaiveBayesClassifier.train() expects.
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever this installation provides.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
train_set = []
for i, single_sample in enumerate(X):
    # Map each vocabulary term to its count in this sample.
    single_feature_dict = {}
    for j, single_feature in enumerate(single_sample):
        single_feature_dict[feature_names[j]]=single_feature
    # One (features, label) tuple per sample, added after its dict is complete.
    train_set.append((single_feature_dict, y[i]))
Note: The above for loop can be shortened by using dict comprehension but I'm not that fluent there.
Then you can do this:
# Train on the list of (feature_dict, label) tuples built above.
nltk.NaiveBayesClassifier.train(train_set)

Categories

Resources