KNN distance and class vote - python

Can you please tell me how to properly calculate the distance between every point in my testData?
For now I am getting only a single value, whereas I should get the distance from each point in the data set and be able to assign it a class. I have to use numpy for this.
========================================================================
Now the problem is that I am getting this error and don't know how to fix it:
KeyError: 0
I am trying to obtain the accuracy of the classified labels.
Any ideas, please?
import matplotlib.pyplot as plt
import random
import numpy as np
import operator
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer versions

# In[1]
def readFile():
    f = open('iris.data', 'r')
    d = np.dtype([('features', np.float, (4,)), ('class', np.str_, 20)])
    data = np.genfromtxt(f, dtype=d, delimiter=",")
    dataPoints = data['features']
    labels = data['class']
    return dataPoints, labels

# In[2]
def normalizeData(dataPoints):
    # normalize the data so the values will be between 0 and 1
    dataPointsNorm = (dataPoints - dataPoints.min()) / (dataPoints.max() - dataPoints.min())
    return dataPointsNorm

def crossVal(dataPointsNorm):
    # splitting into train and test sets for cross-validation
    trainData, testData = train_test_split(dataPointsNorm, test_size=0.20, random_state=25)
    return trainData, testData

def calculateDistance(trainData, testData):
    # Euclidean distance calculation on numpy arrays
    distance = np.sqrt(np.sum((trainData - testData)**2, axis=-1))
    # argsort returns indices from closest to furthest neighbour, in ascending order
    sortDistance = distance.argsort()
    return distance, sortDistance

# In[4]
def classifyKnn(testData, trainData, labels, k):
    # Calculating nearest neighbours and assigning the class by majority vote
    classCount = {}
    for i in range(k):
        distance, sortedDistIndices = calculateDistance(trainData, testData[i])
        voteLabel = labels[sortedDistIndices][i]
        #print voteLabel
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    print 'Class Count: ', classCount
    # Sorting dictionary to return the voted class
    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0], classCount

def testAccuracy(testData, classCount):
    correct = 0
    for x in range(len(testData)):
        print 'HERE !!!!!!!!!!!!!!'
        if testData[x][-1] is classCount[x]:
            correct += 1
    return (correct / float(len(testData))) * 100.0

def main():
    dataPoints, labels = readFile()
    dataPointsNorm = normalizeData(dataPoints)
    trainData, testData = crossVal(dataPointsNorm)
    result, classCount = classifyKnn(testData, trainData, labels, 5)
    print result
    accuracy = testAccuracy(testData, classCount)
    print accuracy

main()
I have the data normalized and split into train and test sets; the distance calculation is what is wrong.
Thanks for any tips.
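A minimal sketch of how a fully vectorised version might look (assuming the labels are split alongside the features, e.g. via train_test_split(dataPointsNorm, labels, ...); the names trainLabelsSplit and testLabelsSplit below are hypothetical and not in the original code). It also sidesteps the KeyError: 0, which comes from testAccuracy looking up classCount — a dict keyed by class labels — with integer row indices:

import numpy as np
from collections import Counter

def classifyAll(trainData, trainLabelsSplit, testData, k=5):
    # Broadcasting gives an (nTest, nTrain) matrix of Euclidean distances
    dists = np.sqrt(((testData[:, None, :] - trainData[None, :, :]) ** 2).sum(axis=-1))
    # For every test row, the indices of its k closest training rows
    nearest = np.argsort(dists, axis=1)[:, :k]
    # Majority vote over the k neighbours of each test point
    return np.array([Counter(trainLabelsSplit[row]).most_common(1)[0][0] for row in nearest])

predicted = classifyAll(trainData, trainLabelsSplit, testData, k=5)
accuracy = (predicted == testLabelsSplit).mean() * 100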

Related

Accuracy of KNN algorithm very low

I'm following sentdex's YouTube ML tutorial.
As I was coding along on how to build your own KNN algorithm, I noticed that my accuracy was very low, in the 60s almost every time. I had made a few changes, but then I used his code line by line, with the same dataset, yet somehow he gets accuracies in the range of 95-98% while mine is 60-70%. I'm really not able to figure out the reason behind such a huge difference.
I also have a second question, about the confidence of the predictions. The confidence value is supposed to be within 0-1, right? But for me they're all identical, and in the 70s (screenshot omitted).
My code:
# Importing libraries
import numpy as np
import pandas as pd
from collections import Counter
import warnings
import random

# Algorithm
def k_nearest(data, predict, k=5):
    if len(data) >= k:
        warnings.warn("stupid, your data has more dimensions than prescribed")
    distances = []
    for group in data:  # The groups of 2s and 4s
        for features in data[group]:  # values in 2 and 4 respectively
            #euclidean_distance = np.sqrt(np.sum((np.array(features) - np.sum(predict)) **2))
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])
    votes = [i[1] for i in sorted(distances)]  # adding the sorted (ascending) group names
    votes_result = Counter(votes).most_common(1)[0][0]  # the most common element
    confidence = float((Counter(votes).most_common(1)[0][1])) / float(k)  # occurrences of the most common element
    return votes_result, confidence

# reading the data
df = pd.read_csv("breast_cancer.txt")
df.replace("?", -99999, inplace=True)
#df.replace("?", np.nan, inplace=True)
#df.dropna(inplace=True)
df.drop("id", axis=1, inplace=True)
full_data = df.astype(float).values.tolist()  # Converting to a list because our function is written like that
random.shuffle(full_data)
#print(full_data[:10])

test_size = 0.2
train_set = {2: [], 4: []}
test_set = {2: [], 4: []}
train_data = full_data[:-int(test_size*len(full_data))]  # Up to the last 20% of the original dataset
test_data = full_data[-int(test_size*len(full_data)):]  # The last 20% of the dataset

# Populating the dictionaries
for i in train_data:
    train_set[i[-1]].append(i[:-1])  # appending the features and leaving out the label
for i in test_data:
    test_set[i[-1]].append(i[:-1])  # appending the features and leaving out the label

# Testing
correct, total = 0, 0
for group in test_set:
    for data in test_set[group]:
        vote, confidence = k_nearest(train_set, data, k=5)
        if vote == group:
            correct += 1
        else:
            print(confidence)
        total += 1
print("Accuracy is", correct/total)
Link to the dataset breast-cancer-wisconsin.data
There's a mistake in your k_nearest function: you need to keep only the top k distances, not the whole list. So it should be:
votes = [i[1] for i in sorted(distances)[:k]]
instead of what's in your code:
votes = [i[1] for i in sorted(distances)]
This also explains your confidence values: because votes contained a vote from every training point, the count of the most common class divided by k can be far greater than 1, which is why you saw identical values in the 70s.
We can rewrite your function:
def k_nearest(data, predict, k=5):
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])
    votes = [i[1] for i in sorted(distances)[:k]]
    votes_result = Counter(votes).most_common(1)[0][0]
    confidence = float(Counter(votes).most_common(1)[0][1]) / float(k)
    return votes_result, confidence
Then run your code. I am not so sure about replacing "?" with -99999, so I read those values in as NaN and dropped them:
import pandas as pd
from collections import Counter
import random
import numpy as np

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
df = pd.read_csv(url, header=None, na_values="?")
df = df.dropna()
full_data = df.iloc[:, 1:].astype(float).values.tolist()

random.seed(999)
random.shuffle(full_data)
test_size = 0.2
train_set = {2: [], 4: []}
test_set = {2: [], 4: []}
train_data = full_data[:-int(test_size*len(full_data))]
test_data = full_data[-int(test_size*len(full_data)):]
for i in train_data:
    train_set[i[-1]].append(i[:-1])
for i in test_data:
    test_set[i[-1]].append(i[:-1])

correct, total = 0, 0
for group in test_set:
    for data in test_set[group]:
        vote, confidence = k_nearest(train_set, data, k=5)
        if vote == group:
            correct += 1
        else:
            print(confidence)
        total += 1
print("Accuracy is", correct/total)
Gives:
1.0
0.8
1.0
0.6
0.6
0.6
0.6
Accuracy is 0.9485294117647058

Expand Confusion Matrix Insert Information

In each cell I would like to have the actual number of predictions. I don't really care if it's percentages or raw counts. I would also like to label the cells with True Positive, False Negative, and so on.
The Code:
sns.heatmap(pd.crosstab(ytest,classifier.predict(xtest)),cmap='Spectral')
plt.xlabel('predicted')
plt.ylabel('actual')
plt.show()
I use the code below to do what you want, though a Google search will also give you the answer:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def find_best_threshold(threshold, fpr, tpr):
    t = threshold[np.argmax(tpr * (1-fpr))]
    ### TPR * TNR ---> We are trying to maximize TNR and TPR
    print("the maximum value of tpr*(1-fpr)", max(tpr*(1-fpr)), "for threshold", np.round(t, 3))
    return t

def predict_with_best_thresh(prob, t):
    pred = [1 if i >= t else 0 for i in prob]
    return pred

### https://medium.com/@dtuk81/confusion-matrix-visualization-fc31e3f30fea
def conf_matrix_plot(cf_matrix, title):
    group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
    labels = np.asarray(labels).reshape(2, 2)
    #sns.set(font_scale=1.5)
    sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='coolwarm').set_title(title + ' Confusion Matrix for TFIDF')
    # sklearn's confusion_matrix puts the true classes on the rows, so the y axis is the actual label
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
cf_matrix_train = confusion_matrix(y_train, predict_with_best_thresh(y_train_pred[:, 1], best_t))
cf_matrix_test = confusion_matrix(y_test, predict_with_best_thresh(y_test_pred[:, 1], best_t))
conf_matrix_plot(cf_matrix_train, 'Train')
Result: an annotated confusion-matrix heatmap (image not shown).
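The snippet above leaves tr_thresholds, train_fpr, train_tpr and the predicted probabilities undefined; a plausible sketch of where they might come from (the classifier clf and the x_train/x_test matrices are illustrative assumptions, not from the original answer):

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve

clf = LogisticRegression().fit(x_train, y_train)  # any probabilistic classifier works
y_train_pred = clf.predict_proba(x_train)         # column 1 holds P(class == 1)
y_test_pred = clf.predict_proba(x_test)
# roc_curve returns the FPR/TPR trade-off and the thresholds that produce it
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred[:, 1])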

sample X examples from each class label

I have a dataset (a numpy array) with 50 classes and 9000 training examples.
x_train=(9000,2048)
y_train=(9000,) # Classes are strings
classes=list(set(y_train))
I would like to build a sub-dataset such that each class has 5 examples,
which means I get 5*50 = 250 training examples. Hence my sub-dataset will take this form:
sub_train_data=(250,2048)
sub_train_labels=(250,)
Remark: we take 5 examples from each class at random (total number of classes = 50).
Thank you
Here is a solution to that problem:
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

def balanced_sample_maker(X, y, sample_size, random_seed=42):
    uniq_levels = np.unique(y)
    uniq_counts = {level: sum(y == level) for level in uniq_levels}
    if random_seed is not None:
        np.random.seed(random_seed)
    # find the observation indices of each class level
    groupby_levels = {}
    for ii, level in enumerate(uniq_levels):
        obs_idx = [idx for idx, val in enumerate(y) if val == level]
        groupby_levels[level] = obs_idx
    # oversampling on observations of each label
    balanced_copy_idx = []
    for gb_level, gb_idx in groupby_levels.items():
        over_sample_idx = np.random.choice(gb_idx, size=sample_size, replace=True).tolist()
        balanced_copy_idx += over_sample_idx
    np.random.shuffle(balanced_copy_idx)
    data_train = X[balanced_copy_idx]
    labels_train = y[balanced_copy_idx]
    if len(data_train) == sample_size * len(uniq_levels):
        print('number of sampled examples', sample_size * len(uniq_levels), 'number of samples per class', sample_size, '#classes:', len(list(set(uniq_levels))))
    else:
        print('number of samples is wrong')
    labels, values = zip(*Counter(labels_train).items())
    print('number of classes', len(list(set(labels_train))))
    check = all(x == values[0] for x in values)
    print(check)
    if check:
        print('Good, all classes have the same number of examples')
    else:
        print('Repeat your sampling, your classes are not balanced')
    indexes = np.arange(len(labels))
    width = 0.5
    plt.bar(indexes, values, width)
    plt.xticks(indexes + width * 0.5, labels)
    plt.show()
    return data_train, labels_train

X_train, y_train = balanced_sample_maker(X, y, 10)
inspired by Scikit-learn balanced subsampling
Pure numpy solution:
def sample(X, y, samples):
    unique_ys = np.unique(y, axis=0)
    result = []
    for unique_y in unique_ys:
        val_indices = np.argwhere(y == unique_y).flatten()
        random_samples = np.random.choice(val_indices, samples, replace=False)
        result.append(X[random_samples])
    return np.concatenate(result)
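Note that this returns only the sampled features. Because the rows come out grouped by class in np.unique order, the matching labels can be rebuilt; a small usage sketch (the variable names follow the question and are otherwise assumptions):

sub_train_data = sample(x_train, y_train, 5)         # shape (250, 2048)
sub_train_labels = np.repeat(np.unique(y_train), 5)  # 5 consecutive labels per class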
I usually use a trick from scikit-learn for this: the StratifiedShuffleSplit function. If I have to select a 1/n fraction of my train set, I set the proportion of test data (test_size) to 1 - 1/n, so the train split that remains is the 1/n I want. Here is an example where I use only 1/10 of my data:
from sklearn.model_selection import StratifiedShuffleSplit

sp = StratifiedShuffleSplit(n_splits=1, test_size=0.9, random_state=seed)
for train_index, _ in sp.split(x_train, y_train):
    x_train, y_train = x_train[train_index], y_train[train_index]
You can use a dataframe as input (as in my case) and the simple code below:
col = target  # the name of the column holding the class labels
nsamples = min(t4m[col].value_counts().values)
res = pd.DataFrame()
for val in t4m[col].unique():
    t = t4m.loc[t4m[col] == val].sample(nsamples)
    res = pd.concat([res, t], ignore_index=True).sample(frac=1)
col is the name of your column with classes. The code finds the minority class, takes a sample of that size from each class, and shuffles the dataframe.
Then you can convert the result back to an np.array.
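For example (assuming the feature columns are everything except the class column):

sub_train_data = res.drop(columns=[col]).to_numpy()
sub_train_labels = res[col].to_numpy()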

Precision, recall, F1 score equal with sklearn

I'm trying to compare different distance calculation methods and different voting systems in the k-nearest neighbours algorithm. Currently my problem is that, no matter what I do, the precision_recall_fscore_support method from scikit-learn yields exactly the same results for precision, recall and fscore. Why is that? I've tried it on different datasets (iris, glass and wine). What am I doing wrong? The code so far:
#!/usr/bin/env python3
from collections import Counter
from data_loader import DataLoader
from sklearn.metrics import precision_recall_fscore_support as pr
import random
import math
import ipdb

def euclidean_distance(x, y):
    return math.sqrt(sum([math.pow((a - b), 2) for a, b in zip(x, y)]))

def manhattan_distance(x, y):
    return sum([abs(a - b) for a, b in zip(x, y)])

def get_neighbours(training_set, test_instance, k):
    names = [instance[4] for instance in training_set]
    training_set = [instance[0:4] for instance in training_set]
    distances = [euclidean_distance(test_instance, training_set_instance) for training_set_instance in training_set]
    distances = list(zip(distances, names))
    print(list(filter(lambda x: x[0] == 0.0, distances)))
    distances = sorted(distances, key=lambda x: x[0])
    return distances[:k]

def plurality_voting(nearest_neighbours):
    classes = [nearest_neighbour[1] for nearest_neighbour in nearest_neighbours]
    count = Counter(classes)
    return count.most_common()[0][0]

def weighted_distance_voting(nearest_neighbours):
    distances = [(1/nearest_neighbour[0], nearest_neighbour[1]) for nearest_neighbour in nearest_neighbours]
    index = distances.index(min(distances))
    return nearest_neighbours[index][1]

def weighted_distance_squared_voting(nearest_neighbours):
    distances = list(map(lambda x: 1 / (x[0] * x[0]), nearest_neighbours))
    index = distances.index(min(distances))
    return nearest_neighbours[index][1]

def main():
    data = DataLoader.load_arff("datasets/iris.arff")
    dataset = data["data"]
    # random.seed(42)
    random.shuffle(dataset)
    train = dataset[:100]
    test = dataset[100:150]
    classes = [instance[4] for instance in test]
    predictions = []
    for test_instance in test:
        prediction = weighted_distance_voting(get_neighbours(train, test_instance[0:4], 15))
        predictions.append(prediction)
    print(pr(classes, predictions, average="micro"))

if __name__ == "__main__":
    main()
The problem is that you're using the 'micro' average.
As stated here:
As is written in the documentation: "Note that “micro”-averaging in a multiclass setting will produce equal precision, recall and F, while “weighted” averaging may produce an F-score that is not between precision and recall."
http://scikit-learn.org/stable/modules/model_evaluation.html
But if you drop a majority label, using the labels parameter, then micro-averaging differs from accuracy, and precision differs from recall.
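A quick way to see this (a small illustrative sketch, not from the original answer): with micro averaging all three numbers collapse to accuracy, while macro averaging keeps them distinct.

from sklearn.metrics import precision_recall_fscore_support

y_true = ['a', 'a', 'b', 'b', 'c', 'c']
y_pred = ['a', 'b', 'b', 'b', 'c', 'a']
# micro: precision == recall == F1 == accuracy (4/6 here)
print(precision_recall_fscore_support(y_true, y_pred, average='micro'))
# macro: per-class scores averaged, generally all different
print(precision_recall_fscore_support(y_true, y_pred, average='macro'))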

Not able to set threshold in facerec framework (face recognition)

I am new to face recognition. I am trying to do face recognition with the help of the bytefish facerec framework. It works fine, but the results are not very accurate, so I want to apply a threshold. As suggested on his page (https://github.com/bytefish/facerec) I should be able to do this, but the explanation on the page isn't very clear. So here's what I am doing.
My classifier
def predict(self, q):
    distances = []
    for xi in self.X:
        xi = xi.reshape(-1, 1)
        d = self.dist_metric(xi, q)
        distances.append(d)
    if len(distances) > len(self.y):
        raise Exception("More distances than classes. Is your distance metric correct?")
    distances = np.asarray(distances)
    # Get the indices in an ascending sort order:
    idx = np.argsort(distances)
    # Sort the labels and distances accordingly:
    sorted_y = self.y[idx]
    sorted_distances = distances[idx]
    # Take only the k first items:
    sorted_y = sorted_y[0:self.k]
    sorted_distances = sorted_distances[0:self.k]
    # Make a histogram of them:
    hist = dict((key, val) for key, val in enumerate(np.bincount(sorted_y)) if val)
    # And get the bin with the maximum frequency:
    predicted_label = max(hist.iteritems(), key=op.itemgetter(1))[0]
    # A classifier should output a list with the label as the first item and
    # generic data behind it. The k-nearest neighbour classifier outputs the
    # distances of the k first items. So if you have a 1-NN and you
    # want to apply a threshold to it, you should take the first item:
    return [predicted_label, {'labels': sorted_y, 'distances': sorted_distances}]
My model
def predict(self, X):
    q = self.feature.extract(X)
    return self.classifier.predict(q)
My server.py, which generates the output:
def get_prediction(image_data):
    image = preprocess_image(image_data)
    prediction = model.predict(image)
    predicted_label = prediction[0]
    classifier_output = prediction[1]
    distance = classifier_output['distances'][0]
    #distance = classifier.predict(self, q)
    #distance = 11
    if distance > 10.0:
        return "nonsense"
    else:
        print prediction
So the problem is that I am not able to get the distance here. Please help.
After a while I was able to solve the problem: the thresholding should be done in the classifier file, not in server.py.
Solution
distances = []
for xi in self.X:
    xi = xi.reshape(-1, 1)
    d = self.dist_metric(xi, q)
    distances.append(d)
if len(distances) > len(self.y):
    raise Exception("More distances than classes. Is your distance metric correct?")
distances = np.asarray(distances)
# Get the indices in an ascending sort order:
idx = np.argsort(distances)
# Sort the labels and distances accordingly:
sorted_y = self.y[idx]
sorted_distances = distances[idx]
# Take only the k first items:
sorted_y = sorted_y[0:self.k]
sorted_distances = sorted_distances[0:self.k]
#sorted_distances = 1134.04873217
# Make a histogram of them:
hist = dict((key, val) for key, val in enumerate(np.bincount(sorted_y)) if val)
# And get the bin with the maximum frequency:
predicted_label = max(hist.iteritems(), key=op.itemgetter(1))[0]
# A classifier should output a list with the label as the first item and
# generic data behind it.
#global unknown
if sorted_distances > 1800:
    return [predicted_label]
else:
    return [predicted_label]
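As posted, both branches of that final if return the same thing; presumably the intent is to reject a match whose nearest distance exceeds the threshold. A minimal sketch of that idea (the "unknown" label and comparing only the single closest distance are assumptions, not from the original post):

# Reject the match when even the closest neighbour is too far away
if sorted_distances[0] > 1800:
    return ["unknown", {'labels': sorted_y, 'distances': sorted_distances}]
return [predicted_label, {'labels': sorted_y, 'distances': sorted_distances}]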
