I'm doing classification analysis using NLTK's Naive Bayes classifier. I load a TSV file containing records and labels, but training fails with an error. Here's my Python code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('tweets.txt', delimiter='\t', quoting=3)
dataset.isnull().any()
dataset = dataset.fillna(method='ffill')

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(0, 16004):
    tweet = re.sub('[^a-zA-Z]', ' ', dataset['tweet'][i])
    tweet = tweet.lower()
    tweet = tweet.split()
    ps = PorterStemmer()
    tweet = [ps.stem(word) for word in tweet
             if word not in set(stopwords.words('english'))]
    tweet = ' '.join(tweet)
    corpus.append(tweet)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=10000)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=0)

train_set, test_set = X_train[500:], y_train[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
The error is:
File "C:\Users\HSR\Anaconda2\lib\site-packages\nltk\classify\naivebayes.py", line 194, in train
for featureset, label in labeled_featuresets:
ValueError: too many values to unpack
NLTK's NaiveBayesClassifier doesn't work like scikit-learn estimators: it expects X and y combined in a single structure that is passed to train().
In your code you supply only X_train, so it tries to unpack the label from each row, hence the error.
NaiveBayesClassifier requires a list of tuples, where the list holds the training samples and each tuple contains a feature dictionary and its label. Something like:
X = [({feature1: 'val11', feature2: 'val12', ...}, class1),
     ({feature1: 'val21', feature2: 'val22', ...}, class2),
     ...]
You need to change your input to this format.
feature_names = cv.get_feature_names()
train_set = []
for i, single_sample in enumerate(X):
    single_feature_dict = {}
    for j, single_feature in enumerate(single_sample):
        single_feature_dict[feature_names[j]] = single_feature
    train_set.append((single_feature_dict, y[i]))
Note: the above for-loop can be shortened using a dict comprehension, but I'm not that fluent with those.
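For reference, a dict-comprehension version of the same conversion could look like this (a sketch, assuming the same cv, X and y as above; note that it builds the same large, dense structure):

feature_names = cv.get_feature_names()

# Each row of X becomes a {feature_name: count} dict, paired with its label.
train_set = [
    ({feature_names[j]: value for j, value in enumerate(row)}, label)
    for row, label in zip(X, y)
]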
Then you can do this:
nltk.NaiveBayesClassifier.train(train_set)
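If the held-out rows are converted the same way into a test_set list (a hypothetical name here, built exactly like train_set above), the classifier can then be evaluated with NLTK's own helpers:

classifier = nltk.NaiveBayesClassifier.train(train_set)

# Accuracy over another list of (feature_dict, label) tuples.
print(nltk.classify.accuracy(classifier, test_set))

# Show the features NLTK found most informative.
classifier.show_most_informative_features(10)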
Related
I am working on a LogisticRegression text classifier. The classifier's job is to label data as spam or ham.
Initially I have one feature (just the text), but then I add three more:
The length of the document (number of characters)
The number of digits in the document
The number of non-word characters in the document
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
import re
from varname import nameof
##-----------------------------------------------------------------------------
#
def add_feature(X, feature_to_add):
    X_modified = hstack([X, csr_matrix(feature_to_add).T], 'csr')
    return X_modified
##-----------------------------------------------------------------------------
#
def feature_extractor(series_data):
    series_doc_len = []
    series_digits = []
    series_non_alphas = []
    entry = 0
    for (idx, text) in enumerate(series_data):
        text_length = len(text)
        text_digits = sum(c.isdigit() for c in text)
        text_non_alphas = re.findall(r'\W+', text)
        text_non_alphas_count = len(text_non_alphas)
        series_doc_len.append(text_length)
        series_digits.append(text_digits)
        series_non_alphas.append(text_non_alphas_count)
    series_doc_len_series = pd.Series(series_doc_len)
    series_digits_series = pd.Series(series_digits)
    series_non_alphas_series = pd.Series(series_non_alphas)
    series_doc_len_renamed = series_doc_len_series.rename('length_of_doc')
    series_digits_renamed = series_digits_series.rename('digit_count')
    series_non_alphas_renamed = series_non_alphas_series.rename('non_word_char_count')
    return (series_doc_len_renamed, series_digits_renamed, series_non_alphas_renamed)
##-----------------------------------------------------------------------------
#
def load_csv_data(file_name):
    spam_data_df = pd.read_csv(file_name)
    spam_data_df['target'] = np.where(spam_data_df['target'] == 'spam', 1, 0)
    X_train, X_test, y_train, y_test = train_test_split(spam_data_df['text'],
                                                        spam_data_df['target'],
                                                        test_size=0.3,
                                                        random_state=0)
    return (X_train, X_test, y_train, y_test)
##-----------------------------------------------------------------------------
file_name = "../data/spam-dummy.csv"
X_train, X_test, y_train, y_test = load_csv_data(file_name)

vectorizer = CountVectorizer(min_df=5, ngram_range=(2, 5), analyzer='char_wb')
X_train_vectorized = vectorizer.fit_transform(X_train)

(X_train_doclen, X_train_numdigits, X_train_nonalpha) = feature_extractor(X_train)
for feature in (X_train_doclen, X_train_numdigits, X_train_nonalpha):
    X_train_vectorized = add_feature(X_train_vectorized, feature)

X_test_vectorized = vectorizer.transform(X_test)
(X_test_doclen, X_test_numdigits, X_test_nonalpha) = feature_extractor(X_test)
for feature in (X_test_doclen, X_test_numdigits, X_test_nonalpha):
    X_test_vectorized = add_feature(X_test_vectorized, feature)

classifier = LogisticRegression(C=100, solver='liblinear')
classifier.fit(X_train_vectorized, y_train)
y_predicted = classifier.predict(X_test_vectorized)

feature_names = np.array(vectorizer.get_feature_names_out() + ['length_of_doc', 'digit_count', 'non_word_char_count'])
sorted_coef_index = classifier.coef_[0].argsort()
smallest = feature_names[sorted_coef_index[:10]]
largest = feature_names[sorted_coef_index[:-11:-1]]
After running the prediction, I am trying to pull the smallest/largest coefficients from the model, including the three additional features along with their names, but I get this error:
File "/Users/ukhan/Development/github/education.git/coursera/applied_text_mining_in_python/labs/lab-3/supplimental/code/tfidf-kavitha.py", line 92, in <module>
feature_names = np.array(vectorizer.get_feature_names_out() + ['length_of_doc', 'digit_count', 'non_word_char_count'])
ValueError: operands could not be broadcast together with shapes (15569,) (3,)
What is the correct way to approach this?
I then added the following code to check whether the feature names I added are actually there, but I don't see them:
feature_names = np.array(vectorizer.get_feature_names_out())
for feature_name in feature_names:
    print(f" Inspecting feature: {feature_name}")
    if(feature_name == 'length_of_doc'):
        print(f' Feature name: {feature_name} has been found')
    elif(feature_name == 'digit_count'):
        print(f' Feature name: {feature_name} has been found')
    elif(feature_name == 'non_word_char_count'):
        print(f' Feature name: {feature_name} has been found')
I am trying to classify the emotion of tweets with a dataset of 4401 tweets. When I use a smaller sample (around 15 tweets) everything works fine, but when I use the full dataset it raises the error
Found input variables with inconsistent numbers of samples: [7, 3520]
The error happens when I try to oversample the data using SMOTE after transforming it with CountVectorizer.
This is the code where the error is raised:
# N-gram Feature and Term Frequency
vectorizer = CountVectorizer(ngram_range=(1,3))
x_train_tf = vectorizer.fit_transform(str(x_train).split('\n')).toarray()
x_test_tf = vectorizer.transform(str(x_test).split('\n')).toarray()
df_output = pd.DataFrame(data =x_train_tf, columns = vectorizer.get_feature_names_out())
display(df_output)
# the print shape is (7 rows × 250 columns)
smote = SMOTE(random_state=42, k_neighbors=5)
x_smote, y_smote = smote.fit_resample(x_train_tf, y_train)
print("Total Train Data SMOTE : ",x_smote.shape), print("Total Train Label SMOTE : ",y_smote)
I don't understand why this is happening, so an explanation would really help.
I have already tried to solve it using answers from other similar questions, but nothing has worked.
This is the full code:
import nltk
import re
#nltk.download()
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk import everygrams
from collections import Counter
from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
dataset = pd.read_csv("G:/TA/Program/dataset/Twitter_Emotion_Dataset.csv", encoding='latin-1')
# Preprocessing
dataset['case_folding_tweet'] = dataset['tweet'].str.casefold()
dataset['only_alphabet_tweet'] = [re.sub('[^a-zA-Z]+\s*', ' ', s) for s in dataset['case_folding_tweet']]
dataset['data_cleaning_tweet'] = dataset['only_alphabet_tweet'].str.replace(r'\b\w{1}\b','').str.replace(r'\s+', ' ')
slangword_dictionary = ("G:/TA/Program/dataset/kamus_singkatan.csv")
deslang = {}
list_slangword = open(slangword_dictionary).readlines()
for line in list_slangword:
    slang, unslang = line.strip().split(';')
    deslang[slang] = unslang
deslang[slang] = {r"\b{}\b".format(k): v for k, v in deslang.items()}
dataset['data_cleaning_tweet'] = dataset['data_cleaning_tweet'].replace(deslang[slang], regex=True)
dataset['convert_slang_tweet'] = dataset['data_cleaning_tweet']
replace_dictionary = {'tidak ': 'tidak', 'bukan ': 'bukan', 'jangan ': 'jangan', 'belum ': 'belum'}
dataset['convert_negation_tweet'] = dataset['convert_slang_tweet'].replace(replace_dictionary, regex=True)
dataset['tokenization_tweet'] = dataset['convert_negation_tweet'].apply(word_tokenize)
list_stopwords = set(stopwords.words("indonesian"))
list_stopwords.add('username')
list_stopwords.add('url')
dataset['stopword_removal_tweet'] = dataset['tokenization_tweet'].apply(lambda x: [item for item in x if item not in list_stopwords])
factory = StemmerFactory()
stemmer = factory.create_stemmer()
dataset['stemmed_tweet'] = dataset['stopword_removal_tweet'].apply(lambda x: [stemmer.stem(y) for y in x])
# Split data
x = dataset["stemmed_tweet"].values
y = dataset["label"].values
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state= 42)
# Get N-gram and TF
vectorizer = CountVectorizer(ngram_range=(1,3))
x_train_tf = vectorizer.fit_transform(str(x_train).split('\n')).toarray()
x_test_tf = vectorizer.transform(str(x_test).split('\n')).toarray()
# Oversampling
smote = SMOTE(random_state=42, k_neighbors=5)
x_smote, y_smote = smote.fit_resample(x_train_tf, y_train)
print("Total Train Data SMOTE : ",x_smote.shape), print("Total Train Label SMOTE : ",y_smote)
gnb_classifier = GaussianNB()
gnb_classifier.fit(x_smote, y_smote)
print(gnb_classifier)
y_pred = gnb_classifier.predict(x_test_tf)
print("Emotion Predicted :", y_pred)
Link to the dataset
I cannot solve it precisely because I don't have your data, but here are a few observations that should help:
Apparently x_train_tf has only 7 rows? That is not enough to train a model, and it is not 80% of 4401, which is what train_test_split is supposed to give you.
Note that y_train has 3520 rows = 4401 * 80%, the correct number of rows.
I suspect that the line x_train_tf = vectorizer.fit_transform(str(x_train).split('\n')).toarray() is not doing what you think it does. Try to decompose the str(x_train).split('\n') part.
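To illustrate that suspicion (a sketch, not tested against the actual dataset, assuming each entry of x_train is the list of stemmed tokens built above): str(x_train) produces the printed representation of the whole array, which NumPy truncates with an ellipsis, so splitting it on newlines yields only a handful of pseudo-documents instead of 3520. Passing one string per tweet avoids this:

# str(x_train) is the truncated *printed* form of the array,
# so splitting it on '\n' gives only a few lines (7 here), not 3520 tweets.
print(len(str(x_train).split('\n')))

# Instead, give CountVectorizer one string per tweet by joining the stemmed tokens.
x_train_texts = [' '.join(tokens) for tokens in x_train]
x_test_texts = [' '.join(tokens) for tokens in x_test]

x_train_tf = vectorizer.fit_transform(x_train_texts).toarray()
x_test_tf = vectorizer.transform(x_test_texts).toarray()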
I fixed the problem using the answer from this post, by joining the tokens in the train data column into strings before vectorizing:
df_train = pd.DataFrame(data=x_train)
df_test = pd.DataFrame(data=x_test)
series = pd.Series(df_train['stemmed_tweet'])
corpus = series.apply(lambda series: ' '.join(series))
vectorizer = CountVectorizer(ngram_range=(1,3), lowercase=False)
x_train_tf = vectorizer.fit_transform(corpus).toarray()
x_test_tf = vectorizer.transform(str(df_test.values).split("\n")).toarray()
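For consistency, the test split would normally be converted the same way, since str(df_test.values).split("\n") suffers from the same truncation as before. A sketch, mirroring the df_train handling above and assuming df_test exposes the same stemmed_tweet column:

# Build the test corpus the same way as the training corpus:
# one space-joined string per stemmed tweet.
series_test = pd.Series(df_test['stemmed_tweet'])
corpus_test = series_test.apply(lambda tokens: ' '.join(tokens))
x_test_tf = vectorizer.transform(corpus_test).toarray()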
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
dataset = pd.read_csv(r'emails.csv')
dataset.columns #Index(['text', 'spam'], dtype='object')
dataset.shape #(5728, 2)
#Checking for duplicates and removing them
dataset.drop_duplicates(inplace = True)
dataset.shape #(5695, 2)
#Checking for any null entries in the dataset
print (pd.DataFrame(dataset.isnull().sum()))
'''
text 0
spam 0
'''
#Using Natural Language Processing to clean the text and make one corpus
# Cleaning the texts
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
#Every mail starts with 'Subject :'; remove this from each text
dataset['text']=dataset['text'].map(lambda text: text[1:])
dataset['text'] = dataset['text'].map(lambda text:re.sub('[^a-zA-Z0-9]+', ' ',text)).apply(lambda x: (x.lower()).split())
ps = PorterStemmer()
corpus=dataset['text'].apply(lambda text_list:' '.join(list(map(lambda word:ps.stem(word),(list(filter(lambda text:text not in set(stopwords.words('english')),text_list)))))))
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus.values).toarray()
y = dataset.iloc[:, 1].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
# Fitting Naive Bayes classifier to the Training set
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
classifier.fit(X_train , y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
I practiced on numeric data before; just today I shifted to text, so I have this model ready with 0.98 accuracy, but when I try to predict on a new text input I get errors.
subject = "hello this is a test"
classifier.predict([[subject]])
The error I got is:
FutureWarning: Beginning in version 0.22, arrays of bytes/strings will be converted to decimal numbers if dtype='numeric'. It is recommended that you convert the array to a float dtype before using it in scikit-learn, for example by using your_array = your_array.astype(np.float64).
return f(**kwargs)
ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 29223 is different from 1)
Any suggestions or possible solutions I can try?
I also tried converting the sentence:
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer = CountVectorizer()
tfidfconverter = TfidfTransformer()
text = "Hello world!"
text = vectorizer.transform([text]).toarray()
text = tfidfconverter.transform(text).toarray()
label = classifier.predict(text)[0]
but got a NotFittedError: Vocabulary not fitted or provided.
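One likely way out (a sketch, assuming the cv CountVectorizer fitted on the training corpus is still available): a new document has to be transformed with that same fitted vectorizer, not a freshly constructed one, so that it lands in the same 29223-column feature space the classifier was trained on.

# Reuse the CountVectorizer fitted on the training corpus (called `cv` above),
# so the new text is mapped onto the same columns the classifier saw during fit.
subject = "hello this is a test"
subject_vector = cv.transform([subject]).toarray()
print(classifier.predict(subject_vector))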
So I'm trying to train a model to read the greetings from the sample dataset collected from Tripadvisor, and I've been getting the following error when I try to train the model.
Here's the link to the dataset - https://nextit-public.s3-us-west-2.amazonaws.com/rsics.html?fbclid=IwAR0CktLQtuPBaZNk03odCKdrjN3LjYl_ouuFBbWvyj-yQ-BvzJ0v_n9w9xo
Here's my code:
import streamlit as st
import numpy as np
import pandas as pd
# NLP Pkgs
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import os

# Main Stuff
st.title("Greetings NLP - Presence")
st.subheader("Created using Streamlit - Harshil Parikh ")

# Loading the data into streamlit
#st.cache
def load_data(nrows):
    #data = pd.read_csv('/Users/harshilparikh/Desktop/INT/data/selections.csv', nrows=nrows)
    dataset = st.cache(pd.read_csv)('/Users/harshilparikh/Desktop/INT/data/selections.csv')
    return dataset

data_load_state = st.text('Loading data...')
dataset = load_data(1000)
data_load_state.text('Data loaded.')

#Displaying all data first
if st.checkbox('Show Raw data'):
    st.subheader('Raw Data')
    st.write(dataset)

# GREETING TAB
st.subheader('Greetings')
greet = st.sidebar.multiselect("Select Greeting", dataset['Greeting'].unique())
select = dataset[(dataset['Greeting'].isin(greet))]

# SEPARATING ONLY TWO COLUMNS FROM THE DATA
greet_select = select[['Greeting', 'Selected']]
select_check = st.checkbox("Display records with greeting")
if select_check:
    st.write(greet_select)

#Text- Preprocessing - Range from 0 to 6758 total feedback
nltk.download('stopwords')
corpus = []
for i in range(0, 6758):
    review = re.sub('[^a-zA-Z]', '', str(dataset['Selected'][i]))
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ''.join(review)
corpus.append(review)

#BAG OF WORDS
cv = CountVectorizer(max_features = 6758)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
st.write(X)
st.write(y)
st.write(cv)

#Training sets (800 values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
#X_train[0, 0:10] #First 10 rows of the first column of X_train.

# NLP - Naive Bayes algorithm
classifier = GaussianNB()
classifier.fit(X_train, y_train)
I'm trying to learn simple NLP. Any help will be appreciated.
The error I'm getting:
ValueError: Found input variables with inconsistent numbers of samples: [1, 6759]
Traceback:
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/streamlit/script_runner.py", line 332, in _run_script
exec(code, module.__dict__)
File "/Users/harshilparikh/Desktop/INT/first.py", line 90, in
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/model_selection/_split.py", line 2127, in train_test_split
arrays = indexable(*arrays)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py", line 292, in indexable
check_consistent_length(*result)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py", line 255, in check_consistent_length
raise ValueError("Found input variables with inconsistent numbers of"
Your error occurs when calling train_test_split: X and y need to be of the same length, which is not the case here. I suspect the problem arises in your for-loop: instead of adding all the reviews to your corpus, you only add the last one, after leaving the loop. Try this instead:
for i in range(0, 6758):
    review = re.sub('[^a-zA-Z]', '', str(dataset['Selected'][i]))
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ''.join(review)
    corpus.append(review)
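One further detail that may be worth checking (an assumption based on the 6759 reported in the traceback, since the raw data isn't shown here): y is built from every row of the dataset, while the loop is hard-coded to 6758 iterations, so the two can still end up one row apart. Deriving the bound from the data keeps them aligned, for example:

corpus = []
for i in range(len(dataset)):   # len(dataset) instead of a hard-coded 6758
    review = re.sub('[^a-zA-Z]', '', str(dataset['Selected'][i]))
    # ... same lower-casing, stemming and stop-word removal as above ...
    corpus.append(review)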
I followed this website https://stackabuse.com/text-classification-with-python-and-scikit-learn/ and successfully completed and saved the model using my own data, but I don't know how to test a new document on it. I have a bunch of documents as plain strings, like string = "Whatever and more of whatever", and I just need to know what code to run to push these documents through my model. My code is exactly the same as the website's, the only difference being the files I load. To solve my problem I tried classifier.predict(string), which gave me the error ValueError: could not convert string to float. Any help would be appreciated.
import re
import nltk
from sklearn.datasets import load_files
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
stemmer = WordNetLemmatizer()
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords

doctor_data = load_files(r"pathtodata")
X, y = doctor_data.data, doctor_data.target

documents = []
for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    # Remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)
    # Substitute multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # Remove the prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    # Convert to lowercase
    document = document.lower()
    # Lemmatization
    document = document.split()
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    documents.append(document)

vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()

tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier, picklefile)
Update:
I tried to convert my document to the proper format using this code:
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(MYDOC).toarray()
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()
pred = model.predict(X)
print(pred)
And this is the error I got
ValueError: Number of features of the model must match the input. Model n_features is 897 and input n_features is 149
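The mismatch comes from fitting a brand-new CountVectorizer/TfidfTransformer on the new documents, which builds a different (smaller) vocabulary than the one the model was trained on. One way to handle this (a sketch, assuming the fitted objects from training are kept or pickled alongside the classifier; the file names here are hypothetical): reuse the fitted vectorizer and tfidfconverter and call only transform, never fit_transform, on new text.

import pickle

# Persist the fitted preprocessing objects together with the classifier
# (hypothetical file names).
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
with open('tfidfconverter.pkl', 'wb') as f:
    pickle.dump(tfidfconverter, f)

# Later, at prediction time, reuse them with transform() only,
# so the new document is mapped into the same 897 training features.
new_docs = ["Whatever and more of whatever"]
X_new = vectorizer.transform(new_docs).toarray()
X_new = tfidfconverter.transform(X_new).toarray()
print(classifier.predict(X_new))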