Error with model_evaluation_utils confusion matrix - python

I am working through this tutorial: https://medium.com/#sarahcy/read-this-how-winners-create-life-changing-habits-that-actually-work-atomic-habits-by-james-ac7a3c6df911. I am currently trying to run the model evaluation part:
class_labels = list(set(labels))
meu.display_model_performance_metrics(true_labels=y_test, predicted_labels=predictions, classes=class_labels)
I get this error
Prediction Confusion Matrix:
------------------------------
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/blabla/XAI/model_evaluation_utils.py", line 87, in display_model_performance_metrics
classes=classes)
File "/blabla/XAI/model_evaluation_utils.py", line 62, in display_confusion_matrix
labels=level_labels),
TypeError: __new__() got an unexpected keyword argument 'labels'
How can I solve this issue?
BR
Edit:
Here is the code up to the problematic line:
# part1
import pandas as pd
import numpy as np
import model_evaluation_utils as meu
import matplotlib.pyplot as plt
from collections import Counter
import shap
import eli5
import warnings
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
shap.initjs()
#part 2
data, labels = shap.datasets.adult(display=True)
labels = np.array([int(label) for label in labels])
print(data.shape , labels.shape)
data.head()
#part 3
Counter(labels)
#part 4
cat_cols = data.select_dtypes(['category']).columns
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)
data.head()
#part 5
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape)
X_train.head(3)
data_disp, labels_disp = shap.datasets.adult(display=True)
X_train_disp, X_test_disp, y_train_disp, y_test_disp = train_test_split(data_disp, labels_disp, test_size=0.3, random_state=42)
print(X_train_disp.shape, X_test_disp.shape)
X_train_disp.head(3)
#part 6
import xgboost as xgb
xgc = xgb.XGBClassifier(n_estimators=500, max_depth=5, base_score=0.5,
                        objective='binary:logistic', random_state=42)
xgc.fit(X_train, y_train)
#part 7
predictions = xgc.predict(X_test)
predictions[:10]
#part 8
class_labels = list(set(labels))
meu.display_model_performance_metrics(true_labels=y_test, predicted_labels=predictions, classes=class_labels)

Remove this function and its contents from model_evaluation_utils.py:
def display_confusion_matrix(true_labels, predicted_labels, classes=[1,0]):
    total_classes = len(classes)
    level_labels = [total_classes*[0], list(range(total_classes))]
    cm = metrics.confusion_matrix(y_true=true_labels,
                                  y_pred=predicted_labels, labels=classes)
    cm_frame = pd.DataFrame(data=cm,
                            columns=pd.MultiIndex(levels=[['Predicted:'], classes],
                                                  labels=level_labels),
                            index=pd.MultiIndex(levels=[['Actual:'], classes],
                                                labels=level_labels))
    print(cm_frame)
Then go back to the notebook. The last line of your code,
meu.display_model_performance_metrics(true_labels=y_test, predicted_labels=predictions, classes=class_labels)
calls display_model_performance_metrics, so go back to model_evaluation_utils.py and look at that function:
def display_model_performance_metrics(true_labels, predicted_labels, classes=[1,0]):
    print('Model Performance metrics:')
    print('-'*30)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
    print('\nModel Classification report:')
    print('-'*30)
    display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels,
                                  classes=classes)
    print('\nPrediction Confusion Matrix:')
    print('-'*30)
    display_confusion_matrix(true_labels=true_labels, predicted_labels=predicted_labels,
                             classes=classes)
Get rid of its last call,
display_confusion_matrix(true_labels=true_labels, predicted_labels=predicted_labels, classes=classes)
so it becomes a little simpler:
def display_model_performance_metrics(true_labels, predicted_labels, classes=[1,0]):
    print('Model Performance metrics:')
    print('-'*30)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
    print('\nModel Classification report:')
    print('-'*30)
    display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels,
                                  classes=classes)
    print('\nPrediction Confusion Matrix:')
    print('-'*30)
Finally, if you still want to call
display_confusion_matrix(true_labels=true_labels, predicted_labels=predicted_labels, classes=classes)
you can define and use it directly in your notebook instead of in the .py file.
That's it :)
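Side note: the TypeError itself comes from a pandas API change, where the MultiIndex constructor's labels keyword was renamed to codes (deprecated in pandas 0.24 and removed in 1.0). If you would rather keep the confusion matrix than delete the function, here is a minimal sketch of display_confusion_matrix with only that keyword swapped, assuming a recent pandas version:
import pandas as pd
from sklearn import metrics

def display_confusion_matrix(true_labels, predicted_labels, classes=[1, 0]):
    total_classes = len(classes)
    # codes: first level repeats 'Predicted:'/'Actual:', second level indexes the classes
    level_labels = [total_classes * [0], list(range(total_classes))]
    cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels,
                                  labels=classes)
    cm_frame = pd.DataFrame(data=cm,
                            columns=pd.MultiIndex(levels=[['Predicted:'], classes],
                                                  codes=level_labels),
                            index=pd.MultiIndex(levels=[['Actual:'], classes],
                                                codes=level_labels))
    print(cm_frame)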

Related

UnimplementedError: Graph execution error, in Google Colab with Keras

Hi guys, I'm trying to do some AI text classification with Keras and it is giving me this error. Probably my layers are bad or something like that, but I don't really understand the "Unimplemented" error.
This is my code:
history = model.fit(X_train, y_train,
                    epochs=100,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)
The error is:
UnimplementedError: Graph execution error:
Detected at node 'binary_crossentropy/Cast' defined at (most recent call last)
Don't know why this is happening.
Rest of the code:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import os
# print(os.listdir("../input"))
plt.style.use('ggplot')
filepath_dict = {'films': 'reviews_filmaffinity.scv'}
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_table('reviews_filmaffinity.csv', sep='\|\|', header=0, engine='python')
    df['source'] = source
    df_list.append(df)
df = pd.concat(df_list)
df_films = df[df['source'] == 'films']
df_films['texto'] = df_films['review_title'] + ' ' + df_films['review_text']
sentences = df_films['texto'].values
df_films['polaridad'] = df['review_rate'].apply(lambda x: 'positivo' if x > 6
                                                else ('negativo' if x < 4
                                                      else 'neutro'))
y = df_films['polaridad'].values
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.2, random_state=0)
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)
X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)
X_train
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)
input_dim = X_train.shape[1] # Number of features
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
I searched online but I couldn't figure out how to fix it... it's driving me crazy.
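A common cause of a failure at the 'binary_crossentropy/Cast' node is passing string labels straight to model.fit: here y contains 'positivo'/'negativo'/'neutro', which Keras cannot cast to floats, and a single sigmoid unit also does not match three classes. A minimal sketch, assuming that is what happens here, is to integer-encode the labels and use a 3-unit softmax output (this builds on input_dim, X_train, X_test, y_train, y_test from the code above):
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

# Encode the three string classes as integers 0/1/2
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))  # three classes, not one sigmoid unit
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train_enc,
                    epochs=100,
                    verbose=True,
                    validation_data=(X_test, y_test_enc),
                    batch_size=10)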

Unable to get feature names from sklearn model, since the inputs are numpy arrays. How can I structure my code so that I can pull feature names?

I am working on a Random Forest classification model using stratified k-fold cross validation. I want to plot the feature importance of each fold. My input data is in the form of numpy arrays; however, I am unable to put the feature names into my code below. How can I structure this code so that I can pull the feature names and plot the built-in feature importance?
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score, mean_squared_error
import matplotlib.pyplot as plt
y_downsample = downsampled[['dependent_variable']].values
X_downsample = downsampled[['Feature1'
,'Feature2'
,'Feature3'
,'Feature4'
,'Feature5'
,'Feature6'
,'Feature7'
,'Feature8'
,'Feature9'
,'Feature10']].values
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
f1_results = []
accuracy_results = []
precision_results = []
recall_results = []
feature_imp = []
for train_index, test_index in skf.split(X_downsample, y_downsample):
    X_train, X_test = X_downsample[train_index], X_downsample[test_index]
    y_train, y_test = y_downsample[train_index], y_downsample[test_index]
    model = RandomForestClassifier(n_estimators = 100, random_state = 24)
    model.fit(X_train, y_train.ravel())
    y_pred = model.predict(X_test)
    f1_results.append(metrics.f1_score(y_test, y_pred))
    accuracy_results.append(metrics.accuracy_score(y_test, y_pred))
    precision_results.append(metrics.precision_score(y_test, y_pred))
    recall_results.append(metrics.recall_score(y_test, y_pred))
    # plot
    importances = pd.DataFrame({'FEATURE':pd.DataFrame(X_downsample.columns),'IMPORTANCE':np.round(model.feature_importances_,3)})
    importances = importances.sort_values('IMPORTANCE',ascending=False).set_index('FEATURE')
    importances.plot.bar()
    plt.show()
print("Accuracy: ", np.mean(accuracy_results))
print("Precision: ", np.mean(precision_results))
print("Recall: ", np.mean(recall_results))
print("F1-score: ", np.mean(f1_results))
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
     21
     22 # plot
---> 23 importances = pd.DataFrame({'FEATURE':pd.DataFrame(X_downsample.columns),'IMPORTANCE':np.round(model.feature_importances_,3)})
     24 importances = importances.sort_values('IMPORTANCE',ascending=False).set_index('FEATURE')
     25

AttributeError: 'numpy.ndarray' object has no attribute 'columns'
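One possible way to structure this, assuming the ten columns above are the full feature set: keep the names in a plain Python list before calling .values, and reuse that list when building the importance DataFrame (a numpy array has no .columns attribute, which is exactly what the traceback complains about). A minimal sketch:
# Keep the column names in a list so they survive the .values conversion
feature_names = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5',
                 'Feature6', 'Feature7', 'Feature8', 'Feature9', 'Feature10']
X_downsample = downsampled[feature_names].values
y_downsample = downsampled[['dependent_variable']].values

# ...inside the cross-validation loop, after model.fit(...):
importances = pd.DataFrame({'FEATURE': feature_names,
                            'IMPORTANCE': np.round(model.feature_importances_, 3)})
importances = importances.sort_values('IMPORTANCE', ascending=False).set_index('FEATURE')
importances.plot.bar()
plt.show()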

Predicting with a trained model

I used logistic regression to create a model and later saved the model using joblib. Then I tried loading that model and predicting the labels in my test.csv. Whenever I try this, I get an error saying "X has 1433445 features per sample; expecting 3797015".
This is my initial code:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
#reading data
train=pd.read_csv('train_yesindia.csv')
test=pd.read_csv('test_yesindia.csv')
train=train.iloc[:,1:]
test=test.iloc[:,1:]
test.info()
train.info()
test['label']='t'
test=test.fillna(' ')
train=train.fillna(' ')
test['total']=test['title']+' '+test['author']+test['text']
train['total']=train['title']+' '+train['author']+train['text']
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(train['total'].values)
tfidf = transformer.fit_transform(counts)
targets = train['label'].values
test_counts = count_vectorizer.transform(test['total'].values)
test_tfidf = transformer.fit_transform(test_counts)
#split in samples
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf, targets, random_state=0)
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
print('Accuracy of Lasso classifier on training set: {:.2f}'
.format(logreg.score(X_train, y_train)))
print('Accuracy of Lasso classifier on test set: {:.2f}'
.format(logreg.score(X_test, y_test)))
targets = train['label'].values
logreg = LogisticRegression()
logreg.fit(counts, targets)
example_counts = count_vectorizer.transform(test['total'].values)
predictions = logreg.predict(example_counts)
pred=pd.DataFrame(predictions,columns=['label'])
pred['id']=test['id']
pred.groupby('label').count()
#dumping models
from joblib import dump, load
dump(logreg,'mypredmodel1.joblib')
Later I loaded the model in a different script:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from joblib import dump, load
test=pd.read_csv('test_yesindia.csv')
test=test.iloc[:,1:]
test['label']='t'
test=test.fillna(' ')
test['total']=test['title']+' '+test['author']+test['text']
#check
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
test_counts = count_vectorizer.fit_transform(test['total'].values)
test_tfidf = transformer.fit_transform(test_counts)
#check
#load_model
logreg = load('mypredmodel1.joblib')
example_counts = count_vectorizer.fit_transform(test['total'].values)
predictions = logreg.predict(example_counts)
When I run it, I get this error:
predictions = logreg.predict(example_counts)
Traceback (most recent call last):
File "<ipython-input-58-f28afd294d38>", line 1, in <module>
predictions = logreg.predict(example_counts)
File "C:\Users\adars\Anaconda3\lib\site-packages\sklearn\linear_model\base.py", line 289, in predict
scores = self.decision_function(X)
File "C:\Users\adars\Anaconda3\lib\site-packages\sklearn\linear_model\base.py", line 270, in decision_function
% (X.shape[1], n_features))
ValueError: X has 1433445 features per sample; expecting 3797015
Most probably, this is because you are re-fitting your transformers on the test set. This must not be done - you should save them fitted on your training set, and use the test (or any other future) set only for transforming data.
This is easier done with pipelines.
So, remove the following code from your first block:
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(train['total'].values)
tfidf = transformer.fit_transform(counts)
targets = train['label'].values
test_counts = count_vectorizer.transform(test['total'].values)
test_tfidf = transformer.fit_transform(test_counts)
and replace it with:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1, 2))),
    ('tf-idf', TfidfTransformer(smooth_idf=False))
])

pipeline.fit(train['total'].values)
tfidf = pipeline.transform(train['total'].values)
targets = train['label'].values
test_tfidf = pipeline.transform(test['total'].values)
dump(pipeline, 'transform_predict.joblib')
Now, in your second code block, remove this part:
#check
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
test_counts = count_vectorizer.fit_transform(test['total'].values)
test_tfidf = transformer.fit_transform(test_counts)
#check
and replace it with:
pipeline = load('transform_predict.joblib')
test_tfidf = pipeline.transform(test['total'].values)
And you should be fine, provided that you predict on the test_tfidf variable, and not on example_counts, which is not transformed by TF-IDF:
predictions = logreg.predict(test_tfidf)
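Going one step further, the classifier itself can also live inside the pipeline, so the second script never has to rebuild any transformer and a single dump/load covers both transformation and prediction. A minimal sketch of that variant (the 'full_model.joblib' filename is just illustrative):
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from joblib import dump, load

# Vectorizer, tf-idf and classifier fitted and persisted as one object
full_pipeline = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1, 2))),
    ('tf-idf', TfidfTransformer(smooth_idf=False)),
    ('clf', LogisticRegression(C=1e5)),
])
full_pipeline.fit(train['total'].values, train['label'].values)
dump(full_pipeline, 'full_model.joblib')

# In the second script, raw text goes straight in:
full_pipeline = load('full_model.joblib')
predictions = full_pipeline.predict(test['total'].values)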

python, sklearn: 'dict' object is not callable using GridSearchCV and SVC

I'm trying to use GridSearchCV to optimize the parameters for the classifier svm.SVC (both from sklearn).
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import numpy as np
X_train = np.array([[1,2],[3,4],[5,6],[2,3],[9,4],[4,5],[2,7],[1,0],[4,7],[2,9]])
Y_train = np.array([0,1,0,1,0,0,1,1,0,1])
X_test = np.array([[2,4],[5,3],[7,1],[2,4],[6,4],[2,7],[9,2],[7,5],[1,6],[0,3]])
Y_test = np.array([1,0,0,0,1,0,1,1,0,0])
parameters = {'kernel':['rbf'],'C':np.linspace(10,100,10)}
clf1 = GridSearchCV(SVC(), parameters, verbose = 10)
clf1.fit(X_train, Y_train)
cm = confusion_matrix(Y_test, clf1.predict(X_test))
bp = clf1.best_params_
The output shows it completing GridSearchCV, but then it throws the error:
Traceback (most recent call last):
File "<ipython console>", line 1, in <module>
File "C:\Python27\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 479, in runfile
execfile(filename, namespace)
File "I:\setup\Desktop\Stats\FinalProject.py", line 112, in <module>
clf1 = GridSearchCV(SVC(), parameters, verbose = 10)
TypeError: 'dict' object is not callable
When I am running the code you posted:
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import numpy as np
X_train = np.array([[1,2],[3,4],[5,6]])
Y_train = np.array([0,1,0])
X_test = np.array([[2,4],[5,3],[7,1]])
Y_test = np.array([1,0,0])
parameters = {'kernel':['rbf'],'C':np.linspace(10,100,10)}
clf1 = GridSearchCV(SVC(), parameters, verbose = 10)
clf1.fit(X_train, Y_train)
cm = confusion_matrix(Y_test, clf1.predict(X_test))
bp = clf1.best_params_
I'm getting this error:
File "C:\Anaconda\lib\site-packages\sklearn\svm\base.py", line 447, in _validate_targets
% len(cls))
ValueError: The number of classes has to be greater than one; got 1
Since the train data consists of only 3 samples, GridSearchCV breaks the data into 3 folds (BTW, you can control this with the cv parameter), e.g.:
fold1 = [1,2] , label1 = 0
fold2 = [3,4] , label2 = 1
fold3 = [5,6] , label3 = 0
Now, in some iteration, it takes the first and the third folds to train on, and the second fold is used for validation.
Please note that these training folds contain only one type of label (the label 0), hence the error it prints.
If I create the data in this manner:
X, Y = datasets.make_classification(n_samples=1000, n_features=4,
                                    n_informative=2, n_redundant=2, n_classes=2)
X_train, X_test, Y_train, Y_test = sklearn.cross_validation.train_test_split(X, Y,
                                                                             test_size=0.2)
It runs just fine.
I guess you have some other problem, but as far as the code you posted goes, this is the error it produces.

scikit-learn ValueError: dimension mismatch

This is my first time posting here. For the past couple of days I have been trying to teach myself scikit-learn. But recently I have encountered an error that has been nagging me for quite some time.
My goal is simply to train an NB classifier clf so that I can feed it an arbitrary list of strings called new_doc and it will predict which class each string is likely to belong to.
This is what my program looks like:
#Importing stuff
import numpy as np
import pylab
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn import metrics
#Opening the csv file
df = pd.read_csv('data.csv', sep=',')
#Randomising the rows in the file
df = df.reindex(np.random.permutation(df.index))
#Extracting features from text, define target y and data X
vect = CountVectorizer()
X = vect.fit_transform(df['Features'])
y = df['Target']
#Partitioning the data into test and training set
SPLIT_PERC = 0.75
split_size = int(len(y)*SPLIT_PERC)
X_train = X[:split_size]
X_test = X[split_size:]
y_train = y[:split_size]
y_test = y[split_size:]
#Training the model
clf = MultinomialNB()
clf.fit(X_train, y_train)
#Evaluating the results
print "Accuracy on training set:"
print clf.score(X_train, y_train)
print "Accuracy on testing set:"
print clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
print "Classification Report:"
print metrics.classification_report(y_test, y_pred)
#Predicting new data
new_doc = ["MacDonalds", "Walmart", "Target", "Starbucks"]
trans_doc = vect.transform(new_doc) #extracting features
y_pred = clf.predict(trans_doc) #predicting
But when I run the program I get the following error on the last row:
y_pred = clf.predict(trans_doc)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Python/2.7/site-packages/sklearn/naive_bayes.py", line 62, in predict
jll = self._joint_log_likelihood(X)
File "/Library/Python/2.7/site-packages/sklearn/naive_bayes.py", line 441, in _joint_log_likelihood
return (safe_sparse_dot(X, self.feature_log_prob_.T)
File "/Library/Python/2.7/site-packages/sklearn/utils/extmath.py", line 175, in safe_sparse_dot
ret = a * b
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/sparse/base.py", line 334, in __mul__
raise ValueError('dimension mismatch')
ValueError: dimension mismatch
So apparently it has something to do with the dimensions of the term-document matrices.
When I check the dimensions of trans_doc, X_train and X_test i get:
>>> trans_doc.shape
(4, 4)
>>> X_train.shape
(145314, 28750)
>>> X_test.shape
(48439, 28750)
In order for y_pred = clf.predict(trans_doc) to work, I need (from what I understand) to transform new_doc into a term-document matrix with the dimensions (4, 28750). But I don't know of any method within CountVectorizer that lets me do this.
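For reference, transform on the CountVectorizer that was fitted on df['Features'] is exactly the method that does this: it maps new documents onto the already-learned vocabulary, so the column count matches X_train. A (4, 4) result usually means the vectorizer was (re)fitted on new_doc itself at some point (for example via fit_transform). A minimal sketch, assuming vect is still the instance fitted on the training data:
# transform (never fit_transform) reuses the vocabulary learned from df['Features'],
# so the resulting matrix has the same number of columns as X_train.
trans_doc = vect.transform(new_doc)
print trans_doc.shape        # expected: (4, 28750)
y_pred = clf.predict(trans_doc)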
