I'm trying to list all the wrong predictions in a test set, but I'm quite unsure how to do it. I tried Stack Overflow, but I might have searched for the wrong "problem". I have text files from a folder, containing emails. The problem is that my predictions aren't doing too well, and I want to inspect the emails that are predicted wrong. Currently a snippet of my code looks something like this:
import os

no_head_train_path_0 = 'folder_name'
no_head_train_path_1 = 'folder_name'

def get_data(path):
    text_list = list()
    files = os.listdir(path)
    for text_file in files:
        file_path = os.path.join(path, text_file)
        read_file = open(file_path, 'r+')
        read_text = read_file.read()
        read_file.close()
        cleaned_text = clean_text(read_text)
        text_list.append(cleaned_text)
    return text_list, files

no_head_train_0, temp = get_data(no_head_train_path_0)
no_head_train_1, temp1 = get_data(no_head_train_path_1)

no_head_train = no_head_train_0 + no_head_train_1
no_head_labels_train = ([0] * len(no_head_train_0)) + ([1] * len(no_head_train_1))
def vocabularymat(TEXTFILES, VOC, PLAY, METHOD):
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    if (METHOD == "TDM"):
        voc = CountVectorizer()
        voc.fit(VOC)

        if (PLAY == "TRAIN"):
            TrainMat = voc.transform(TEXTFILES)
            return TrainMat

        if (PLAY == "TEST"):
            TestMat = voc.transform(TEXTFILES)
            return TestMat

TrainMat = vocabularymat(no_head_train, no_head_train, PLAY="TRAIN", METHOD="TDM")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

X_train = Featurelearning(Traindata, Method="NMF")
y_train = datalabel

X_train, X_test, y_train, y_test = train_test_split(data, datalabel, test_size=0.33,
                                                    random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)
proba = model.predict_proba(X_test)

accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="binary")
precision = precision_score(expected, predicted, average="binary")
f1 = f1_score(expected, predicted, average="binary")
Is it possible to find the emails/filenames that are predicted wrong, so I can inspect them manually? (Sorry for the long code.)
You can use NumPy to create a Boolean vector indicating which predictions are wrong, and then use that vector to index your array of file names. For example:
import numpy as np
# mock data
files = np.array(['mail1.txt', 'mail2.txt', 'mail3.txt', 'mail4.txt'])
y_test = np.array([0, 0, 1, 1])
predicted = np.array([0, 1, 0, 1])
# create a Boolean index for the wrong classifications
classification_is_wrong = y_test != predicted
# print the file names of the wrongly classified mails
print(files[classification_is_wrong])
Output:
['mail2.txt' 'mail3.txt']
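If you want to map this back to the pipeline in the question, one option is to pass the file names through the same train_test_split call, so they are shuffled and split exactly like the features and labels. A minimal sketch, assuming data and datalabel are the arrays from the question and that temp + temp1 holds the file names in the same row order:

import numpy as np
from sklearn.model_selection import train_test_split

# file names, in the same order as the rows of the feature matrix
files = np.array(temp + temp1)

# split the file names together with the features and labels
X_train, X_test, y_train, y_test, files_train, files_test = train_test_split(
    data, datalabel, files, test_size=0.33, random_state=42)

# ... fit the model and predict as in the question ...
wrong_files = files_test[np.asarray(y_test) != predicted]
print(wrong_files)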
Alternatively, you can collect the indices of the wrongly predicted samples with a plain loop:
# find the wrong predictions
prediction = model.predict(X_test)

# save the indices of the wrongly predicted values
wrong_predict = []
for order, value in enumerate(y_test):
    if value != prediction[order]:
        wrong_predict.append(order)

print(wrong_predict)
def prepare_data(batch_size):
    (X_train, y_train) = load_data(TRAIN_DIR)
    (X_test, y_test) = load_data(TEST_DIR)
    X_all = np.concatenate([X_train, X_test])
    y_all = np.concatenate([y_train, y_test])
    X_all = X_all.astype(np.float32) / 255
    X_all = X_all.reshape(-1, 28, 28, 1) * 2. - 1.
    y_all = keras.utils.to_categorical(y_all, 10)
    dataset = tf.data.Dataset.from_tensor_slices((X_all, y_all))
    dataset = dataset.shuffle(1024)
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True).prefetch(1)
    return dataset
This is the script to load the directory files using the TRAIN_DIR variable,
but when I call the function dataset = prepare_data(BATCH_SIZE) it says "too many values to unpack (expected 2)".
Can you share your experience?
Based on the comments, you have a function load_data like this:
def load_data(dir_path, img_size=(100,100)):
    """ Load resized images as np.arrays to workspace """
    X = []
    y = []
    i = 0
    label = dict()
    X = np.array(X)
    y = np.array(y)
    print(f'{len(X)} images loaded from {dir_path} directory.')
    return X, y, label
which returns two NumPy arrays and one dictionary.
So I would change the beginning of the function prepare_data like so:
def prepare_data(batch_size):
    X_train, y_train, label_train = load_data(TRAIN_DIR)
    X_test, y_test, label_test = load_data(TEST_DIR)
to match the load_data signature.
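If the label dictionaries are not needed inside prepare_data, another option is simply to discard the extra return value with an underscore:

X_train, y_train, _ = load_data(TRAIN_DIR)
X_test, y_test, _ = load_data(TEST_DIR)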
I am working on a LogisticRegression text classifier. The classifier's job is to label data as spam or ham.
Initially I have one feature (just the text), but later I am adding 3 more features:
The length of the document (number of characters)
The number of digits in the document
The number of non-word characters in the document
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
import re
from varname import nameof
##-----------------------------------------------------------------------------
#
def add_feature(X, feature_to_add):
    X_modified = hstack([X, csr_matrix(feature_to_add).T], 'csr')
    return(X_modified)
##-----------------------------------------------------------------------------
#
def feature_extractor(series_data):
    series_doc_len = []
    series_digits = []
    series_non_alphas = []
    entry = 0
    for (idx, text) in enumerate(series_data):
        text_length = (len(text))
        text_digits = sum(c.isdigit() for c in text)
        text_non_alphas = re.findall(r'\W+', text)
        text_non_alphas_count = len(text_non_alphas)
        series_doc_len.append(text_length)
        series_digits.append(text_digits)
        series_non_alphas.append(text_non_alphas_count)
    series_doc_len_series = pd.Series(series_doc_len)
    series_digits_series = pd.Series(series_digits)
    series_non_alphas_series = pd.Series(series_non_alphas)
    series_doc_len_renamed = series_doc_len_series.rename('length_of_doc')
    series_digits_renamed = series_digits_series.rename('digit_count')
    series_non_alphas_renamed = series_non_alphas_series.rename('non_word_char_count')
    return(series_doc_len_renamed, series_digits_renamed, series_non_alphas_renamed)
##-----------------------------------------------------------------------------
#
def load_csv_data(file_name):
    spam_data_df = pd.read_csv(file_name)
    spam_data_df['target'] = np.where(spam_data_df['target']=='spam', 1, 0)
    X_train, X_test, y_train, y_test = train_test_split(spam_data_df['text'],
                                                        spam_data_df['target'],
                                                        test_size=0.3,
                                                        random_state=0)
    return(X_train, X_test, y_train, y_test)
##-----------------------------------------------------------------------------
file_name = "../data/spam-dummy.csv"
X_train, X_test, y_train, y_test = load_csv_data(file_name)
vectorizer = CountVectorizer(min_df=5, ngram_range=(2,5), analyzer='char_wb')
X_train_vectorized = vectorizer.fit_transform(X_train)
(X_train_doclen, X_train_numdigits, X_train_nonalpha) = feature_extractor(X_train)
for feature in (X_train_doclen, X_train_numdigits, X_train_nonalpha):
    X_train_vectorized = add_feature(X_train_vectorized, feature)
X_test_vectorized = vectorizer.transform(X_test)
(X_test_doclen, X_test_numdigits, X_test_nonalpha) = feature_extractor(X_test)
for feature in (X_test_doclen, X_test_numdigits, X_test_nonalpha):
    X_test_vectorized = add_feature(X_test_vectorized, feature)
classifier = LogisticRegression(C=100, solver='liblinear')
classifier.fit(X_train_vectorized, y_train)
y_predicted = classifier.predict(X_test_vectorized)
feature_names = np.array(vectorizer.get_feature_names_out() + ['length_of_doc', 'digit_count', 'non_word_char_count'])
sorted_coef_index = classifier.coef_[0].argsort()
smallest = feature_names[sorted_coef_index[:10]]
largest = feature_names[sorted_coef_index[:-11:-1]]
After running the prediction, I am trying to pull smallest/largest coefficients from the model, including the additional three features along with their names.
File "/Users/ukhan/Development/github/education.git/coursera/applied_text_mining_in_python/labs/lab-3/supplimental/code/tfidf-kavitha.py", line 92, in <module>
feature_names = np.array(vectorizer.get_feature_names_out() + ['length_of_doc', 'digit_count', 'non_word_char_count'])
ValueError: operands could not be broadcast together with shapes (15569,) (3,)
What is the correct way to approach this?
I then added the following code to see if the feature names I added were actually there, but I don't see them:
feature_names = np.array(vectorizer.get_feature_names_out())
for feature_name in feature_names:
    print(f" Inspecting feature: {feature_name}")
    if feature_name == 'length_of_doc':
        print(f' Feature name: {feature_name} has been found')
    elif feature_name == 'digit_count':
        print(f' Feature name: {feature_name} has been found')
    elif feature_name == 'non_word_char_count':
        print(f' Feature name: {feature_name} has been found')
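For context on the ValueError: get_feature_names_out() returns a NumPy array, so + with a plain Python list attempts element-wise (broadcast) addition instead of concatenation, and the columns stacked on by add_feature are never registered with the vectorizer itself, which is why the three extra names never show up in get_feature_names_out(). A minimal sketch of building the combined name array with np.concatenate, reusing the same three names from the question:

import numpy as np

extra_features = ['length_of_doc', 'digit_count', 'non_word_char_count']
feature_names = np.concatenate([vectorizer.get_feature_names_out(), extra_features])

sorted_coef_index = classifier.coef_[0].argsort()
smallest = feature_names[sorted_coef_index[:10]]
largest = feature_names[sorted_coef_index[:-11:-1]]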
I wrote the following code to loop the same function over different dataframes (named "Drought", "Flashflood", etc.). I was happy to see that it worked, but I'm trying to figure out how to append the name of each dataframe alongside its train and test scores. Can someone please guide me on what I'm missing here? The way I currently have it, all of the names appear in every row, but I only want the corresponding one. Similarly, the output I get appends each new array together, but my understanding was that append would just add a new item to a list?
For example I'm getting this as a result:
[(0.11995478823013683, -0.07264567664161303), (0.11998113643282327, -0.034458152253100005)]
But I would expect this:
[("Drought",0.11995478823013683, -0.07264567664161303)]
[("Flashflood",0.11998113643282327, -0.034458152253100005)]
Here's the code:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
df_list = [Drought, Flashflood, Flood, Gale]
names = ['Drought','Flashflood','Flood','Gale']
knn_r_acc = []
rmse_val = [] #to store rmse values for different dataframes
for df in df_list:
    X = df[['Year.Month','IDH.M_2000','Population','IDH.M_2010']]
    y = df['Deceased'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scaling
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_train_scaled = scaler.fit_transform(X_train)
    x_train = pd.DataFrame(x_train_scaled)
    x_test_scaled = scaler.fit_transform(X_test)
    x_test = pd.DataFrame(x_test_scaled)

    model = neighbors.KNeighborsRegressor(n_neighbors=3, weights='uniform')
    model.fit(x_train, y_train)  # fit the model
    pred = model.predict(x_test)  # make prediction on test set
    error = sqrt(mean_squared_error(y_test, pred))  # calculate rmse
    rmse_val.append(error)  # store rmse values
    #print('Model= ', df, 'is:', error)

    knn.fit(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    train_score = knn.score(X_train, y_train)
    #print(test_score)
    #print(train_score)
    knn_r_acc.append((names, train_score, test_score))

print(knn_r_acc)
In your case, names is actually a whole list / array.
You can implement it using an index variable. So before the loop starts, add:
name_index = 0
and inside the loop, append like so:
knn_r_acc.append((names[name_index], train_score, test_score))
name_index += 1
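An equivalent sketch that avoids the manual counter is to iterate over the names and dataframes together with zip:

for name, df in zip(names, df_list):
    # ... same loop body as in the question ...
    knn_r_acc.append((name, train_score, test_score))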
I wrote a basic piece of code for my binary classification problem.
I have trouble understanding how LIME works.
The full version uses one-hot encoders and a scaler in a pipeline, but I tried to simplify the code because I couldn't make progress. I don't understand what the problem is.
import sklearn.preprocessing
import xgboost as xgb
from sklearn.model_selection import train_test_split

categorical_features = []
counter = 0
feature_names = []
for i in df.columns:
    if i not in target_columns:
        feature_names.append(i)
        counter = counter + 1
        if df[i].dtype == 'O':
            categorical_features.append(counter)

df = df.to_numpy()

categorical_names = {}
for feature in categorical_features:
    le = sklearn.preprocessing.LabelEncoder()
    le.fit(df[:, feature])
    df[:, feature] = le.transform(df[:, feature])
    categorical_names[feature] = le.classes_

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, shuffle=True, random_state=42)

model = xgb.XGBClassifier(objective="binary:logistic",
                          colsample_bytree=0.5)

# Define a pipeline
model.fit(X_train, y_train)
This is the basic version of my code.
And then I want to see the explanations:
import lime
import lime.lime_tabular

class_names = ['0', '1']
explainer = lime.lime_tabular.LimeTabularExplainer(X_train, feature_names=feature_names, class_names=class_names,
                                                   categorical_features=categorical_features,
                                                   categorical_names=categorical_names, kernel_width=3)

from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=class_names)
But I got the following error on the explainer = lime.lime_tabular.LimeTabularExplainer(...) line:
list index out of range
Please ask if I'm missing information and I'll complete it.
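One detail worth double-checking (an observation about the snippet, not a confirmed diagnosis): counter is incremented before appending to categorical_features, so the recorded positions are 1-based, while LimeTabularExplainer expects 0-based column indices; an index equal to the number of feature columns would then run past the end of the feature list. A sketch of collecting 0-based indices instead:

categorical_features = []
feature_names = []
for idx, col in enumerate(c for c in df.columns if c not in target_columns):
    feature_names.append(col)
    if df[col].dtype == 'O':
        categorical_features.append(idx)  # 0-based position among the feature columns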
I have written the following code to import data vectors from a file and test the performance of an SVM classifier (using sklearn and Python).
However, the classifier's performance is lower than that of any other classifier (an NNet, for example, gives 98% accuracy on the test data, but this gives 92% at best). In my experience SVM should produce better results for this kind of data.
Am I possibly doing something wrong?
import numpy as np
def buildData(featureCols, testRatio):
    f = open("car-eval-data-1.csv")
    data = np.loadtxt(fname = f, delimiter = ',')

    X = data[:, :featureCols]  # select columns 0:featureCols-1
    y = data[:, featureCols]   # select column featureCols
    n_points = y.size
    print "Imported " + str(n_points) + " lines."

    ### split into train/test sets
    split = int((1-testRatio) * n_points)
    X_train = X[0:split,:]
    X_test = X[split:,:]
    y_train = y[0:split]
    y_test = y[split:]

    return X_train, y_train, X_test, y_test
def buildClassifier(features_train, labels_train):
    from sklearn import svm
    #clf = svm.SVC(kernel='linear', C=1.0, gamma=0.1)
    #clf = svm.SVC(kernel='poly', degree=3, C=1.0, gamma=0.1)
    clf = svm.SVC(kernel='rbf', C=1.0, gamma=0.1)
    clf.fit(features_train, labels_train)
    return clf
def checkAccuracy(clf, features, labels):
    from sklearn.metrics import accuracy_score
    pred = clf.predict(features)
    accuracy = accuracy_score(pred, labels)
    return accuracy
features_train, labels_train, features_test, labels_test = buildData(6, 0.3)
clf = buildClassifier(features_train, labels_train)
trainAccuracy = checkAccuracy(clf, features_train, labels_train)
testAccuracy = checkAccuracy(clf, features_test, labels_test)
print "Training Items: " + str(labels_train.size) + ", Test Items: " + str(labels_test.size)
print "Training Accuracy: " + str(trainAccuracy)
print "Test Accuracy: " + str(testAccuracy)
i = 0
while i < labels_test.size:
    pred = clf.predict(features_test[i])
    print "F(" + str(i) + ") : " + str(features_test[i]) + " label= " + str(labels_test[i]) + " pred= " + str(pred)
    i = i + 1
How is it possible to do multi-class classification if it does not do it by default?
p.s. my data is of the following format (last column is the class):
2,2,2,2,2,1,0
2,2,2,2,1,2,0
0,2,2,5,2,2,3
2,2,2,4,2,2,1
2,2,2,4,2,0,0
2,2,2,4,2,1,1
2,2,2,4,1,2,1
0,2,2,5,2,2,3
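Regarding the multi-class part of the question: sklearn's SVC supports more than two classes out of the box (it trains one-vs-one classifiers internally), so the same fit/predict calls work unchanged. A minimal sketch:

from sklearn import svm

clf = svm.SVC(kernel='rbf', C=1.0, gamma=0.1)
clf.fit(features_train, labels_train)   # labels_train may contain more than two classes
print(clf.classes_)                     # every class label seen during fit
pred = clf.predict(features_test)       # predictions can be any of those classes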
I found the problem after a long time and I am posting it, in case someone needs it.
The problem was that the data import function wouldn't shuffle the data. If the data is somehow sorted, then there is the risk that you train the classifier with some data and test it with totally different data. In the NNet case, MATLAB was used, which automatically shuffles the input data.
def buildData(filename, featureCols, testRatio):
    f = open(filename)
    data = np.loadtxt(fname = f, delimiter = ',')
    np.random.shuffle(data)  # randomize the order

    X = data[:, :featureCols]  # select columns 0:featureCols-1
    y = data[:, featureCols]   # select column featureCols
    n_points = y.size
    print "Imported " + str(n_points) + " lines."

    ### split into train/test sets
    split = int((1-testRatio) * n_points)
    X_train = X[0:split,:]
    X_test = X[split:,:]
    y_train = y[0:split]
    y_test = y[split:]

    return X_train, y_train, X_test, y_test
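As a side note, scikit-learn's train_test_split shuffles by default, so the manual slicing could also be replaced with something like the following (a sketch reusing X, y, and testRatio from the function above):

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testRatio,
                                                    shuffle=True, random_state=42)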