SHAP KernelExplainer error on textual data using pipeline - python

I was looking through the SHAP package for Python and found no examples using KernelExplainer to explain predictions on textual data, so I decided to test it out using the dataset I found on https://www.superdatascience.com/machine-learning/.
I ran into a problem in the KernelExplainer part at the very end; I believe the issue is the way I pass the data and model into the explainer.
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
Can anyone advise me on what I should revise so as to make the explainer work? I spent hours on this last bit but to no avail. Any help or advice is greatly appreciated. With much thanks!
Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import nltk
#Load the data
os.chdir('C:\\Users\\Win\\Desktop\\MyLearning\\Explainability\\SHAP')
review = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')
#Clean the data
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
def clean_text(df_text_column, data):
    corpus = []
    for i in range(0, len(data)):
        text = re.sub('[^a-zA-Z]', ' ', df_text_column[i])
        text = text.lower()
        text = text.split()
        ps = PorterStemmer()
        text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
        text = ' '.join(text)
        corpus.append(text)
    return corpus
X = pd.DataFrame({'Review':clean_text(review['Review'],review)})['Review']
y = review['Liked']
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Creating the pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
from sklearn.pipeline import make_pipeline
np.random.seed(0)
rf_pipe = make_pipeline(vect, rf)
rf_pipe.steps
rf_pipe.fit(X_train, y_train)
y_pred = rf_pipe.predict(X_test)
y_prob = rf_pipe.predict_proba(X_test)
#Performance Metrics
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred) #Accuracy
metrics.roc_auc_score(y_test, y_prob[:, 1]) #ROC-AUC score
# use Kernel SHAP to explain test set predictions
import shap
explainer = shap.KernelExplainer(rf_pipe.predict_proba, X_train, link="logit")
shap_values = explainer.shap_values(X_test, nsamples=100)
# plot the SHAP values
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:], link="logit")
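One note on the failing call: KernelExplainer converts the supplied data to a plain numpy array before feeding it to the prediction function, so the pipeline's TfidfVectorizer ends up receiving ndarrays instead of strings, which is exactly what produces the 'lower' AttributeError. A minimal hedged sketch of one workaround (not the original code), assuming the fitted vect and rf inside the pipeline are reused so the explainer works on the vectorized feature space:
X_train_vec = vect.transform(X_train).toarray()
X_test_vec = vect.transform(X_test).toarray()
background = shap.sample(X_train_vec, 50)  # summarise the background set to keep Kernel SHAP tractable
explainer = shap.KernelExplainer(rf.predict_proba, background, link="logit")
shap_values = explainer.shap_values(X_test_vec[0:1, :], nsamples=100)
shap.force_plot(explainer.expected_value[0], shap_values[0][0, :],
                features=X_test_vec[0, :],
                feature_names=vect.get_feature_names_out(),  # get_feature_names() on older scikit-learn
                link="logit")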

Related

Why does SMOTE raise "Found input variables with inconsistent numbers of samples"?

I'm trying to classify emotion from tweets with a dataset of 4,401 tweets. When I use a smaller sample of the data (around 15 tweets) everything works fine, but when I use the full dataset it raises the error
Found input variables with inconsistent numbers of samples: [7, 3520]
The error happens when I try to oversample the data using SMOTE after transforming it with CountVectorizer.
This is the code where the error is raised:
# N-gram Feature and Term Frequency
vectorizer = CountVectorizer(ngram_range=(1,3))
x_train_tf = vectorizer.fit_transform(str(x_train).split('\n')).toarray()
x_test_tf = vectorizer.transform(str(x_test).split('\n')).toarray()
df_output = pd.DataFrame(data =x_train_tf, columns = vectorizer.get_feature_names_out())
display(df_output)
# the print shape is (7 rows × 250 columns)
smote = SMOTE(random_state=42, k_neighbors=5)
x_smote, y_smote = smote.fit_resample(x_train_tf, y_train)
print("Total Train Data SMOTE : ",x_smote.shape), print("Total Train Label SMOTE : ",y_smote)
I don't understand why this is happening, so some explanation would really help.
I already tried to solve it using answers from other similar questions, but nothing has worked.
This is the full code:
import nltk
import re
#nltk.download()
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk import everygrams
from collections import Counter
from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
dataset = pd.read_csv("G:/TA/Program/dataset/Twitter_Emotion_Dataset.csv", encoding='latin-1')
# Preprocessing
dataset['case_folding_tweet'] = dataset['tweet'].str.casefold()
dataset['only_alphabet_tweet'] = [re.sub('[^a-zA-Z]+\s*', ' ', s) for s in dataset['case_folding_tweet']]
dataset['data_cleaning_tweet'] = dataset['only_alphabet_tweet'].str.replace(r'\b\w{1}\b','').str.replace(r'\s+', ' ')
slangword_dictionary = ("G:/TA/Program/dataset/kamus_singkatan.csv")
deslang = {}
list_slangword = open(slangword_dictionary).readlines()
for line in list_slangword:
    slang, unslang = line.strip().split(';')
    deslang[slang] = unslang
deslang[slang] = {r"\b{}\b".format(k): v for k, v in deslang.items()}
dataset['data_cleaning_tweet'] = dataset['data_cleaning_tweet'].replace(deslang[slang], regex=True)
dataset['convert_slang_tweet'] = dataset['data_cleaning_tweet']
replace_dictionary = {'tidak ': 'tidak', 'bukan ': 'bukan', 'jangan ': 'jangan', 'belum ': 'belum'}
dataset['convert_negation_tweet'] = dataset['convert_slang_tweet'].replace(replace_dictionary, regex=True)
dataset['tokenization_tweet'] = dataset['convert_negation_tweet'].apply(word_tokenize)
list_stopwords = set(stopwords.words("indonesian"))
list_stopwords.add('username')
list_stopwords.add('url')
dataset['stopword_removal_tweet'] = dataset['tokenization_tweet'].apply(lambda x: [item for item in x if item not in list_stopwords])
factory = StemmerFactory()
stemmer = factory.create_stemmer()
dataset['stemmed_tweet'] = dataset['stopword_removal_tweet'].apply(lambda x: [stemmer.stem(y) for y in x])
# Split data
x = dataset["stemmed_tweet"].values
y = dataset["label"].values
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state= 42)
# Get N-gram and TF
vectorizer = CountVectorizer(ngram_range=(1,3))
x_train_tf = vectorizer.fit_transform(str(x_train).split('\n')).toarray()
x_test_tf = vectorizer.transform(str(x_test).split('\n')).toarray()
# Oversampling
smote = SMOTE(random_state=42, k_neighbors=5)
x_smote, y_smote = smote.fit_resample(x_train_tf, y_train)
print("Total Train Data SMOTE : ",x_smote.shape), print("Total Train Label SMOTE : ",y_smote)
gnb_classifier = GaussianNB()
gnb_classifier.fit(x_smote, y_smote)
print(gnb_classifier)
y_pred = gnb_classifier.predict(x_test_tf)
print("Emotion Predicted :", y_pred)
Link to the dataset
I cannot solve it precisely because I don't have your data, but here are a few observations which should help:
Apparently x_train_tf has only 7 rows? That's not enough for training a model, and it's not 80% of 4401, which is what you're supposed to obtain from train_test_split.
Note that y_train has 3520 rows = 4401 * 80%, the correct number of rows.
I suspect that the line x_train_tf = vectorizer.fit_transform(str(x_train).split('\n')).toarray() is not doing what you think it does. Try to decompose the str(x_train).split('\n') part.
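To illustrate that point, a hedged sketch (assuming x_train is the array of token lists returned by train_test_split): str() of a numpy array gives back its truncated printed representation, so splitting it on newlines produces only a handful of pseudo-documents instead of one document per row.
print(str(x_train)[:300])               # the printed repr, truncated with '...'
print(len(str(x_train).split('\n')))    # a small number (7 here), nowhere near 3520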
I fixed the problem using the answer from this post, by joining all the train data columns before vectorizing.
df_train = pd.DataFrame(data=x_train)
df_test = pd.DataFrame(data=x_test)
series = pd.Series(df_train['stemmed_tweet'])
corpus = series.apply(lambda series: ' '.join(series))
vectorizer = CountVectorizer(ngram_range=(1,3), lowercase=False)
x_train_tf = vectorizer.fit_transform(corpus).toarray()
x_test_tf = vectorizer.transform(str(df_test.values).split("\n")).toarray()
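As a small follow-up sketch (assuming df_test keeps the same 'stemmed_tweet' column of token lists), the test split can be joined the same way instead of going through str(...).split('\n'), so each row becomes one document for the already-fitted vectorizer:
test_corpus = df_test['stemmed_tweet'].apply(lambda tokens: ' '.join(tokens))
x_test_tf = vectorizer.transform(test_corpus).toarray()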

How to classify new data using a pre-trained model - Python Text Classification (NLTK and Scikit)

I am very new to text classification and I am trying to classify each line of a dataset composed of Twitter comments according to some pre-defined topics.
I have used the code below in a Jupyter Notebook to build and train a model with a training dataset. I chose a supervised approach in Python with NLTK and scikit-learn, as unsupervised ones (like LDA) were not giving me good results.
I followed these steps so far:
Manually categorised the topics of a training dataset;
Applied the training dataset to the code below and trained it, resulting in an accuracy of approx. 82%.
Now, I want to use this model to automatically categorise the topics of another dataset (i.e., my test dataset). Most posts only cover the training part, so it's quite frustrating for a newcomer to understand how to get the trained model and actually use it.
Hence, the question is: with the code below, how can I now use the trained model to classify a new dataset?
I appreciate your help.
This tutorial is very good, and I used it as a reference for the code below: https://medium.com/#ishan16.d/text-classification-in-python-with-scikit-learn-and-nltk-891aa2d0ac4b
My model building and training code:
#Do library and methods import
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.tokenize import RegexpTokenizer
from nltk import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk as nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import regex as re
import requests
# Import dataset
df = pd.read_csv(r'C:\Users\user_name\Downloads\Train_data.csv', delimiter=';')
# Tokenize
def tokenize(x):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(x)
df['tokens'] = df['Tweet'].map(tokenize)
# Stem and Lemmatize
nltk.download('wordnet')
nltk.download('omw-1.4')
def stemmer(x):
    stemmer = PorterStemmer()
    return ' '.join([stemmer.stem(word) for word in x])
def lemmatize(x):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in x])
df['lemma'] = df['tokens'].map(lemmatize)
df['stems'] = df['tokens'].map(stemmer)
# set up feature matrix and target column
X = df['lemma']
y = df['Topic']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 13)
# Create our pipeline with a vectorizer and our naive Bayes classifier
pipe_mnnb = Pipeline(steps = [('tf', TfidfVectorizer()), ('mnnb', MultinomialNB())])
# Create parameter grid
pgrid_mnnb = {
    'tf__max_features' : [1000, 2000, 3000],
    'tf__stop_words' : ['english', None],
    'tf__ngram_range' : [(1,1),(1,2)],
    'tf__use_idf' : [True, False],
    'mnnb__alpha' : [0.1, 0.5, 1]
}
# Set up the grid search and fit the model
gs_mnnb = GridSearchCV(pipe_mnnb,pgrid_mnnb,cv=5,n_jobs=-1)
gs_mnnb.fit(X_train, y_train)
# Check the score
gs_mnnb.score(X_train, y_train)
gs_mnnb.score(X_test, y_test)
# Check the parameters
gs_mnnb.best_params_
# Get predictions
preds_mnnb = gs_mnnb.predict(X)
df['preds'] = preds_mnnb
# Print resulting dataset
print(df.shape)
df.head()
It seems that after training you just do the same thing as in your validation step, using the grid searcher directly; in scikit-learn the fitted grid searcher also acts as a model, refit with the best hyperparameters it found.
So take an X containing what you want to evaluate and run
preds_mnnb = gs_mnnb.predict(X)
preds_mnnb should contain what you expect
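A hedged sketch of that step, assuming the new tweets arrive in the same 'Tweet' column format and are run through the same tokenize and lemmatize helpers used for training:
new_df = pd.DataFrame({'Tweet': ["an example tweet to classify"]})   # hypothetical new data
new_df['tokens'] = new_df['Tweet'].map(tokenize)
new_df['lemma'] = new_df['tokens'].map(lemmatize)
new_df['preds'] = gs_mnnb.predict(new_df['lemma'])
print(new_df[['Tweet', 'preds']])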

Machine learning Spam Classification

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
dataset = pd.read_csv(r'emails.csv')
dataset.columns #Index(['text', 'spam'], dtype='object')
dataset.shape #(5728, 2)
#Checking for duplicates and removing them
dataset.drop_duplicates(inplace = True)
dataset.shape #(5695, 2)
#Checking for any null entries in the dataset
print (pd.DataFrame(dataset.isnull().sum()))
'''
text 0
spam 0
'''
#Using Natural Language Processing to clean the text and make one corpus
# Cleaning the texts
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
#Every mail starts with 'Subject :', so remove this from each text
dataset['text']=dataset['text'].map(lambda text: text[1:])
dataset['text'] = dataset['text'].map(lambda text:re.sub('[^a-zA-Z0-9]+', ' ',text)).apply(lambda x: (x.lower()).split())
ps = PorterStemmer()
corpus=dataset['text'].apply(lambda text_list:' '.join(list(map(lambda word:ps.stem(word),(list(filter(lambda text:text not in set(stopwords.words('english')),text_list)))))))
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus.values).toarray()
y = dataset.iloc[:, 1].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
# Fitting Naive Bayes classifier to the Training set
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
classifier.fit(X_train , y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
I had practiced on numeric data and only shifted to text today, so I have this model ready with 0.98 accuracy, but when I try to predict on a new text input I get errors.
subject = "hello this is a test"
classifier.predict([[subject]])
the error which I got is
FutureWarning: Beginning in version 0.22, arrays of bytes/strings will be converted to decimal numbers if dtype='numeric'. It is recommended that you convert the array to a float dtype before using it in scikit-learn, for example by using your_array = your_array.astype(np.float64).
return f(**kwargs)
ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 29223 is different from 1)
Any suggestions or possible solutions that I can try?
I also tried converting the sentence:
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer = CountVectorizer()
tfidfconverter = TfidfTransformer()
text = "Hello world!"
text = vectorizer.transform([text]).toarray()
text = tfidfconverter.transform(text).toarray()
label = classifier.predict(text)[0]
but got a NotFittedError: Vocabulary not fitted or provided.
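For reference, a minimal hedged sketch of the usual fix, assuming the CountVectorizer cv fitted on the training corpus is still in scope: new text has to be transformed by that same fitted vectorizer, so it lands in the 29223-column bag-of-words space the classifier was trained on, rather than by a freshly created (and therefore unfitted) one.
subject = "hello this is a test"
subject_vec = cv.transform([subject]).toarray()   # reuse the fitted vocabulary
print(classifier.predict(subject_vec))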

Jupyter Notebook Multiple Cell Issue

I currently have this piece of code:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
####################################################################################
nltk.download('punkt')
nltk.download('stopwords')
dataset = pd.read_csv('car_reviews.csv')
ps = PorterStemmer()
####################################################################################
data = []
for i in range(dataset.shape[0]):
    text = dataset.iloc[i, 1]
    text = re.sub('[^A-Za-z]', ' ', text)
    text = text.lower()
    tokenized_text = word_tokenize(text)
    processed_text = [ps.stem(word) for word in tokenized_text if word not in set(stopwords.words('english'))]
    final_text = " ".join(processed_text)
    data.append(final_text)
####################################################################################
matrix = CountVectorizer()
X = matrix.fit_transform(data).toarray()
Y = dataset.iloc[:, 0]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
print('The number of reviews in the training set is: ' + str(len(X_train)) + '.')
print('The number of reviews in the test set is: ' + str(len(X_test)) + '.')
####################################################################################
classifier = MultinomialNB()
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
cf_matrix = confusion_matrix(Y_test, Y_pred)
classification_report = classification_report(Y_test, Y_pred)
accuracy = accuracy_score(Y_test, Y_pred)
print('Accuracy: %.2f%% ' % (accuracy * 100.0))
The #### separators indicate a new cell, so in total we have 5 cells. When I restart the notebook, everything runs fine and I get an output. However, when I just run the last cell with the multinomial Naive Bayes, I get a numpy.ndarray error saying the object is not callable for my confusion matrix, and I have no clue why. How would I go about fixing this?
It was due to my terrible programming habit of giving the variables the same names as the confusion_matrix and classification_report functions. After renaming the variables, it works fine.
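A sketch of that last cell with non-shadowing names (earlier cells unchanged), so confusion_matrix and classification_report remain callable when the cell is re-run:
classifier = MultinomialNB()
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
cf_matrix = confusion_matrix(Y_test, Y_pred)
report = classification_report(Y_test, Y_pred)   # no longer shadows the imported function
accuracy = accuracy_score(Y_test, Y_pred)
print('Accuracy: %.2f%% ' % (accuracy * 100.0))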

Using Sklearn, Category Predictions not working on the test data

Dataset: I created a very simple dataset with "Supplier" and "Item Description" columns. It has a list of item descriptions and the preferred supplier for each item.
Requirement: I would like to write a program that takes an "Item Description" and predicts the "Supplier". To keep it very simple, there are only 5 unique supplier-item description combinations among the 950 rows in the .txt file.
Issue: The accuracy shows up as 1.0 and the confusion matrix shows no false positives. But when I give it new data, the prediction is wrong.
Steps Done
Read .txt for "Supplier" and "Item Description"
Label Encoder applied on the "Item Description"
train test and split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)
Created a Pipeline for applying the TfidfVectorizer and MultinomialNB
pipeline = Pipeline([('vect', vectorizer),
                     ('clf', MultinomialNB())
                     ])
model = pipeline.fit(X_train, y_train)
Fit the model and predict:
y_pred=model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc= accuracy_score(y_test,y_pred)
# acc is 1.0 and the cm shows no false positives/negatives
So far, things look ok
dumped the pickle
pickle.dump(model, open(r'supplier_predictions.pkl','wb'))
Tried prediction on an Item Description = 'Lego, Barbie and other Toy Items'; I was expecting "Toys R Us".
The prediction was wrong, it came up as "Office Depot".
loadedModel = pickle.load(open("supplier_predictions.pkl","rb"))
new_items = {'ITEM_DESCRIPTION': ['Lego, Barbie and other Toy Items']}
new_X = pd.DataFrame(new_items, columns = ['ITEM_DESCRIPTION'])
new_y_pred=loadedModel.predict(new_X)
Can you please let me know what I am doing wrong here that produces the wrong prediction (new_y_pred) for the test item description passed in (new_X)?
This is my first ML code. I have tried debugging it by looking at various articles, but with no luck.
Thanks
== Complete Code, if it is helpful ==
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import re # library for cleaning data
import nltk # library for NLP
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import pickle
df=pd.read_csv('git_suppliers.txt', sep='\t')
# Prep the data - Item Description
from sklearn.feature_extraction.text import TfidfVectorizer
stemmer = PorterStemmer()
words = stopwords.words("english")
df['ITEM_DESCRIPTION'] = df['ITEM_DESCRIPTION'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z0-9]", " ", x).split() if i not in words]).lower())
# Feature Generation using the TF-IDF
vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))
final_features = vectorizer.fit_transform(df['ITEM_DESCRIPTION']).toarray()
final_features.shape
# final_features shows only 43 features - not going to use SelectKBest for such a small feature count
#
# Split into training and test data
#
X = df['ITEM_DESCRIPTION']
y = df['SUPPLIER']
from sklearn.preprocessing import LabelEncoder
labelObj = LabelEncoder()
y=labelObj.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)
y_test_decoded=labelObj.inverse_transform(y_test)
#
# Create a pipeline, fit the model, predict for test data and save in pickle
#
pipeline = Pipeline([('vect', vectorizer),
                     ('clf', MultinomialNB())
                     ])
model = pipeline.fit(X_train, y_train)
# Predict for test data
y_pred=model.predict(X_test)
# Accuracy shows up as 1.0 and the confusion matrix shows no false positives/negatives
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
acc= accuracy_score(y_test,y_pred)
print(acc)
# Dump the model and let's predict for one item description,
# for which I expect Toys R Us as the supplier/seller
pickle.dump(model, open(r'supplier_predictions.pkl','wb'))
loadedModel = pickle.load(open("supplier_predictions.pkl","rb"))
new_items = {'ITEM_DESCRIPTION': ['Lego, Barbie and other Toy Items']}
new_X = pd.DataFrame(new_items, columns = ['ITEM_DESCRIPTION'])
new_y_pred=loadedModel.predict(new_X)
labelObj.inverse_transform(new_y_pred)
### Shows Office Depot
My bad - the input to predict was the wrong type. Passing in a Series instead made it work fine.
new_items = pd.Series(new_items)
new_y_pred=loadedModel.predict(new_items)
labelObj.inverse_transform(new_y_pred)
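A hedged alternative sketch: the pipeline's TfidfVectorizer expects an iterable of raw strings, so a plain list works as well, ideally preprocessed (lowercased and stemmed) the same way as the training descriptions:
new_desc = ['lego barbie and other toy items']   # hypothetical input, already lowercased like the training text
new_pred = loadedModel.predict(new_desc)
print(labelObj.inverse_transform(new_pred))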
