sample X examples from each class label

sample X examples from each class label - python

l have a dataset (numpy vector) with 50 classes and 9000 training examples.
x_train=(9000,2048)
y_train=(9000,) # Classes are strings
classes=list(set(y_train))
l would like to build a sub-dataset such that each class will have 5 examples
which means l get 5*50=250 training examples. Hence my sub-dataset will take this form :
sub_train_data=(250,2048)
sub_train_labels=(250,)
Remark : we take randomly 5 examples from each class (total number of classes = 50)
Thank you

Here is a solution for that problem :
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
def balanced_sample_maker(X, y, sample_size, random_seed=42):
uniq_levels = np.unique(y)
uniq_counts = {level: sum(y == level) for level in uniq_levels}
if not random_seed is None:
np.random.seed(random_seed)
# find observation index of each class levels
groupby_levels = {}
for ii, level in enumerate(uniq_levels):
obs_idx = [idx for idx, val in enumerate(y) if val == level]
groupby_levels[level] = obs_idx
# oversampling on observations of each label
balanced_copy_idx = []
for gb_level, gb_idx in groupby_levels.items():
over_sample_idx = np.random.choice(gb_idx, size=sample_size, replace=True).tolist()
balanced_copy_idx+=over_sample_idx
np.random.shuffle(balanced_copy_idx)
data_train=X[balanced_copy_idx]
labels_train=y[balanced_copy_idx]
if ((len(data_train)) == (sample_size*len(uniq_levels))):
print('number of sampled example ', sample_size*len(uniq_levels), 'number of sample per class ', sample_size, ' #classes: ', len(list(set(uniq_levels))))
else:
print('number of samples is wrong ')
labels, values = zip(*Counter(labels_train).items())
print('number of classes ', len(list(set(labels_train))))
check = all(x == values[0] for x in values)
print(check)
if check == True:
print('Good all classes have the same number of examples')
else:
print('Repeat again your sampling your classes are not balanced')
indexes = np.arange(len(labels))
width = 0.5
plt.bar(indexes, values, width)
plt.xticks(indexes + width * 0.5, labels)
plt.show()
return data_train,labels_train
X_train,y_train=balanced_sample_maker(X,y,10)
inspired by Scikit-learn balanced subsampling

Pure numpy solution:
def sample(X, y, samples):
unique_ys = np.unique(y, axis=0)
result = []
for unique_y in unique_ys:
val_indices = np.argwhere(y==unique_y).flatten()
random_samples = np.random.choice(val_indices, samples, replace=False)
ret.append(X[random_samples])
return np.concatenate(result)

I usually use a trick from scikit-learn for this. I use the StratifiedShuffleSplit function. So if I have to select 1/n fraction of my train set, I divide the data into n folds and set the proportion of test data (test_size) as 1-1/n. Here is an example where I use only 1/10 of my data.
sp = StratifiedShuffleSplit(n_splits=1, test_size=0.9, random_state=seed)
for train_index, _ in sp.split(x_train, y_train):
x_train, y_train = x_train[train_index], y_train[train_index]

You can use dataframe as input (as in my case), and use simple code below:
col = target
nsamples = min(t4m[col].value_counts().values)
res = pd.DataFrame()
for val in t4m[col].unique():
t = t4m.loc[t4m[col] == val].sample(nsamples)
res = pd.concat([res, t], ignore_index=True).sample(frac=1)
col is the name of your column with classes. Code finds minority class, shuffles dataframe, then takes sample of size of minority class from each class.
Then you can convert result back to np.array

Related

Accuracy of KNN algorithm very low

I'm following sentdex's youtube channel ML tutorial.
So as I was coding along on how to build your own KNN algorithm, I noticed that my accuracy was very low, in the 60s almost every time. I had made a few changes, but then I used his code line by line, and the same dataset, yet somehow he gets accuracies in the range of 95-98%, while mine is 60-70%. I'm really not able to figure out the reason behind such a huge difference.
I also have a second question which has to do with the confidence of the predictions. The value of the confidence is supposed to be within 0-1 right? But for me, they're all identical, and in the 70s. Let me explain with a screenshot
My code:
# Importing libraries
import numpy as np
import pandas as pd
from collections import Counter
import warnings
import random
# Algorithm
def k_nearest(data,predict,k=5):
if len(data)>=k:
warnings.warn("stupid, your data has more dimensions than prescribed")
distances = []
for group in data: # The groups of 2s and 4s
for features in data[group]: # values in 2 and 4 respectively
#euclidean_distance = np.sqrt(np.sum((np.array(features) - np.sum(predict)) **2))
euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
distances.append([euclidean_distance,group])
votes = [i[1] for i in sorted(distances)] # adding the sorted(ascending) group names
votes_result = Counter(votes).most_common(1)[0][0] # the most common element
confidence = float((Counter(votes).most_common(1)[0][1]))/float(k)#ocuurences of the most common element
return votes_result,confidence
#reading the data
df = pd.read_csv("breast_cancer.txt")
df.replace("?",-99999,inplace=True)
#df.replace("?", np.nan,inplace=True)
#df.dropna(inplace=True)
df.drop("id",axis = 1,inplace=True)
full_data = df.astype(float).values.tolist() # Converting to list because our function is written like that
random.shuffle(full_data)
#print(full_data[:10])
test_size = 0.2
train_set = {2:[],4:[]}
test_set = {2:[],4:[]}
train_data = full_data[:-int(test_size*len(full_data))] # Upto the last 20% of the og dateset
test_data = full_data[-int(test_size*len(full_data)):] # The last 20% of the dataset
# Populating the dictionary
for i in train_data:
train_set[i[-1]].append(i[:-1]) # appending with features and leaving out the label
for i in test_data:
test_set[i[-1]].append(i[:-1]) # appending with features and leaving out the label
# Testing
correct,total = 0,0
for group in test_set:
for data in test_set[group]:
vote,confidence = k_nearest(train_set, data,k=5)
if vote == group:
correct +=1
else:
print(confidence)
total += 1
print("Accuracy is",correct/total)
Link to the dataset breast-cancer-wisconsin.data

There's a mistake in your k_nearest function, you need to return only the top k of distances, not the whole list. So it should be:
votes = [i[1] for i in sorted(distances)[:k]]
Instead of in your code:
votes = [i[1] for i in sorted(distances)]
We can rewrite your function:
def k_nearest(data,predict,k=5):
distances = []
for group in data:
for features in data[group]:
euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
distances.append([euclidean_distance,group])
votes = [i[1] for i in sorted(distances)[:k]]
votes_result = Counter(votes).most_common(1)[0][0]
confidence = float((Counter(votes).most_common(1)[0][1]))/float(k)
return votes_result,confidence
And run your code, I am not so sure about replacing "?" with -999 so I read it in as na :
import pandas as pd
from collections import Counter
import random
import numpy as np
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
df = pd.read_csv(url,header=None,na_values="?")
df = df.dropna()
full_data = df.iloc[:,1:].astype(float).values.tolist()
random.seed(999)
random.shuffle(full_data)
test_size = 0.2
train_set = {2:[],4:[]}
test_set = {2:[],4:[]}
train_data = full_data[:-int(test_size*len(full_data))]
test_data = full_data[-int(test_size*len(full_data)):]
for i in train_data:
train_set[i[-1]].append(i[:-1])
for i in test_data:
test_set[i[-1]].append(i[:-1])
correct,total = 0,0
for group in test_set:
for data in test_set[group]:
vote,confidence = k_nearest(train_set, data,k=5)
if vote == group:
correct +=1
else:
print(confidence)
total += 1
print("Accuracy is",correct/total)
Gives:
1.0
0.8
1.0
0.6
0.6
0.6
0.6
Accuracy is 0.9485294117647058

How to do Constrained Linear Regression - scikit learn?

I am trying to carry out linear regression subject using some constraints to get a certain prediction.
I want to make the model predicting half of the linear prediction, and the last half linear prediction near the last value in the first half using a very narrow range (using constraints) similar to a green line in figure.
The full code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None # default='warn'
data = [5.269, 5.346, 5.375, 5.482, 5.519, 5.57, 5.593999999999999, 5.627000000000001, 5.724, 5.818, 5.792999999999999, 5.817, 5.8389999999999995, 5.882000000000001, 5.92, 6.025, 6.064, 6.111000000000001, 6.1160000000000005, 6.138, 6.247000000000001, 6.279, 6.332000000000001, 6.3389999999999995, 6.3420000000000005, 6.412999999999999, 6.442, 6.519, 6.596, 6.603, 6.627999999999999, 6.76, 6.837000000000001, 6.781000000000001, 6.8260000000000005, 6.849, 6.875, 6.982, 7.018, 7.042000000000001, 7.068, 7.091, 7.204, 7.228, 7.261, 7.3420000000000005, 7.414, 7.44, 7.516, 7.542000000000001, 7.627000000000001, 7.667000000000001, 7.821000000000001, 7.792999999999999, 7.756, 7.871, 8.006, 8.078, 7.916, 7.974, 8.074, 8.119, 8.228, 7.976, 8.045, 8.312999999999999, 8.335, 8.388, 8.437999999999999, 8.456, 8.227, 8.266, 8.277999999999999, 8.289, 8.299, 8.318, 8.332, 8.34, 8.349, 8.36, 8.363999999999999, 8.368, 8.282, 8.283999999999999]
time = range(1,85,1)
x=int(0.7*len(data))
df = pd.DataFrame(list(zip(*[time, data])))
df.columns = ['time', 'data']
# print df
x=int(0.7*len(df))
train = df[:x]
valid = df[x:]
models = []
names = []
tr_x_ax = []
va_x_ax = []
pr_x_ax = []
tr_y_ax = []
va_y_ax = []
pr_y_ax = []
time_model = []
models.append(('LR', LinearRegression()))
for name, model in models:
x_train=df.iloc[:, 0][:x].values
y_train=df.iloc[:, 1][:x].values
x_valid=df.iloc[:, 0][x:].values
y_valid=df.iloc[:, 1][x:].values
model = LinearRegression()
# poly = PolynomialFeatures(5)
x_train= x_train.reshape(-1, 1)
y_train= y_train.reshape(-1, 1)
x_valid = x_valid.reshape(-1, 1)
y_valid = y_valid.reshape(-1, 1)
# model.fit(x_train,y_train)
model.fit(x_train,y_train.ravel())
# score = model.score(x_train,y_train.ravel())
# print 'score', score
preds = model.predict(x_valid)
tr_x_ax.extend(train['data'])
va_x_ax.extend(valid['data'])
pr_x_ax.extend(preds)
valid['Predictions'] = preds
valid.index = df[x:].index
train.index = df[:x].index
plt.figure(figsize=(5,5))
# plt.plot(train['data'],label='data')
# plt.plot(valid[['Close', 'Predictions']])
x = valid['data']
# print x
# plt.plot(valid['data'],label='validation')
plt.plot(valid['Predictions'],label='Predictions before',color='orange')
y =range(0,58)
y1 =range(58,84)
for index, item in enumerate(pr_x_ax):
if index >13:
pr_x_ax[index] = pr_x_ax[13]
pr_x_ax = list([float(i) for i in pr_x_ax])
va_x_ax = list([float(i) for i in va_x_ax])
tr_x_ax = list([float(i) for i in tr_x_ax])
plt.plot(y,tr_x_ax, label='train' , color='red', linewidth=2)
plt.plot(y1,va_x_ax, label='validation1' , color='blue', linewidth=2)
plt.plot(y1,pr_x_ax, label='Predictions after' , color='green', linewidth=2)
plt.xlabel("time")
plt.ylabel("data")
plt.xticks(rotation=45)
plt.legend()
plt.show()
If you see this figure:
label: Predictions before, the model predicted it without any constraints (I don't need this result).
label: Predictions after, the model predicted it within a constraint but this is after the model predicted AND the all values are equal to last value at index = 71 , item 8.56.
I used for loop for index, item in enumerate(pr_x_ax): in line:64, and the curve is line straight from time 71 to 85 sec as you see in order to show you how I need the model work.
Could I build the model give the same result instead of for loop???
Please your suggestions

I expect that in your question by drawing green line you really expect trained model to predict linear horizontal turn to the right. But current trained model draws just straight orange line.
It is true for any trained model of any algorithm and type that in order to learn some unordinary change in behavior model needs to have at least some samples of that unordinary change. Or at least some hidden meaning in observed data should point to having such unordinary change.
In other words for your model to learn that right turn on green line a model should have points with that right turn in the training data set. But you take for training data just first (leftmost) 70% of data by train = df[:int(0.7 * len(df))] and that training data has no such right turns and this training data just looks close to one straight line.
So you need to re-sample your data into training and validation in a different way - take randomly 70% of samples from whole range of X and the rest goes to validation. So that in your training data samples that do right turn also included.
Second thing is that LinearRegression model always models predictions just with one single straight line, and this line can't have right turns. In order to have right turns you need some more complex model.
One way for a model to have a right turn is to be piece-wise-linear, i.e. having several joined straight lines. I didn't find ready-made piecewise linear models inside sklearn, only using other pip models. So I decided to implement my own simple class PieceWiseLinearRegression that uses np.piecewise() and scipy.optimize.curve_fit() in order to model piecewise linear function.
Next picture shows results of applying two mentioned things above, code goes afterwards, re-sampling dataset in a different way and modeling piece-wise-linear function. Your current linear model LR still makes a prediction using just one straight blue line, while my piecewise linear PWLR2, orange line, consists of two segments and correctly predicts right turn:
To see clearly just one PWLR2 graph I did next picture too:
My class PieceWiseLinearRegression on creation of object accepts just one argument n - number of linear segments to be used for prediction. For picture above n = 2 was used.
import sys, numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
np.random.seed(0)
class PieceWiseLinearRegression:
#classmethod
def nargs_func(cls, f, n):
return eval('lambda ' + ', '.join([f'a{i}'for i in range(n)]) + ': f(' + ', '.join([f'a{i}'for i in range(n)]) + ')', locals())
#classmethod
def piecewise_linear(cls, n):
condlist = lambda xs, xa: [(lambda x: (
(xs[i] <= x if i > 0 else np.full_like(x, True, dtype = np.bool_)) &
(x < xs[i + 1] if i < n - 1 else np.full_like(x, True, dtype = np.bool_))
))(xa) for i in range(n)]
funclist = lambda xs, ys: [(lambda i: (
lambda x: (
(x - xs[i]) * (ys[i + 1] - ys[i]) / (
(xs[i + 1] - xs[i]) if abs(xs[i + 1] - xs[i]) > 10 ** -7 else 10 ** -7 * (-1, 1)[xs[i + 1] - xs[i] >= 0]
) + ys[i]
)
))(j) for j in range(n)]
def f(x, *pargs):
assert len(pargs) == (n + 1) * 2, (n, pargs)
xs, ys = pargs[0::2], pargs[1::2]
xa = x.ravel().astype(np.float64)
ya = np.piecewise(x = xa, condlist = condlist(xs, xa), funclist = funclist(xs, ys)).ravel()
#print('xs', xs, 'ys', ys, 'xa', xa, 'ya', ya)
return ya
return cls.nargs_func(f, 1 + (n + 1) * 2)
def __init__(self, n):
self.n = n
self.f = self.piecewise_linear(self.n)
def fit(self, x, y):
from scipy import optimize
self.p, self.e = optimize.curve_fit(self.f, x, y, p0 = [j for i in range(self.n + 1) for j in (np.amin(x) + i * (np.amax(x) - np.amin(x)) / self.n, 1)])
#print('p', self.p)
def predict(self, x):
return self.f(x, *self.p)
data = [5.269, 5.346, 5.375, 5.482, 5.519, 5.57, 5.593999999999999, 5.627000000000001, 5.724, 5.818, 5.792999999999999, 5.817, 5.8389999999999995, 5.882000000000001, 5.92, 6.025, 6.064, 6.111000000000001, 6.1160000000000005, 6.138, 6.247000000000001, 6.279, 6.332000000000001, 6.3389999999999995, 6.3420000000000005, 6.412999999999999, 6.442, 6.519, 6.596, 6.603, 6.627999999999999, 6.76, 6.837000000000001, 6.781000000000001, 6.8260000000000005, 6.849, 6.875, 6.982, 7.018, 7.042000000000001, 7.068, 7.091, 7.204, 7.228, 7.261, 7.3420000000000005, 7.414, 7.44, 7.516, 7.542000000000001, 7.627000000000001, 7.667000000000001, 7.821000000000001, 7.792999999999999, 7.756, 7.871, 8.006, 8.078, 7.916, 7.974, 8.074, 8.119, 8.228, 7.976, 8.045, 8.312999999999999, 8.335, 8.388, 8.437999999999999, 8.456, 8.227, 8.266, 8.277999999999999, 8.289, 8.299, 8.318, 8.332, 8.34, 8.349, 8.36, 8.363999999999999, 8.368, 8.282, 8.283999999999999]
time = list(range(1, 85))
df = pd.DataFrame(list(zip(time, data)), columns = ['time', 'data'])
choose_train = np.random.uniform(size = (len(df),)) < 0.8
choose_valid = ~choose_train
x_all = df.iloc[:, 0].values
y_all = df.iloc[:, 1].values
x_train = df.iloc[:, 0][choose_train].values
y_train = df.iloc[:, 1][choose_train].values
x_valid = df.iloc[:, 0][choose_valid].values
y_valid = df.iloc[:, 1][choose_valid].values
x_all_lin = np.linspace(np.amin(x_all), np.amax(x_all), 500)
models = []
models.append(('LR', LinearRegression()))
models.append(('PWLR2', PieceWiseLinearRegression(2)))
for imodel, (name, model) in enumerate(models):
model.fit(x_train[:, None], y_train)
x_all_lin_pred = model.predict(x_all_lin[:, None])
plt.plot(x_all_lin, x_all_lin_pred, label = f'pred {name}')
plt.plot(x_train, y_train, label='train')
plt.plot(x_valid, y_valid, label='valid')
plt.xlabel('time')
plt.ylabel('data')
plt.legend()
plt.show()

my algorithm gives bad clusters while usingTF-IDF

im getting bad clusters i would like to rewrite it in a way where i can just plug in any algorithm that i would like (e.g hierarchical, knn, k-means) etc.
#takes in our text_extracts dictionary and returns clusters in an indexed list
def run_clustering(plan):
""" Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
vectorizer = TfidfVectorizer(tokenizer=process_text,
max_df=0.5,
min_df=0.005,
ngram_range=(1,4),
lowercase=True)
#set the model with the vectorizer which will tokenize with our process_text function
extracts = {}
for page in plan.page_list:
if len(page.text_extract) > 50:
extracts[str(page.document_id) + '_' + str(page.page_number)] = page.text_extract
extract_lst = [extracts[text] for text in extracts]
tfidf_model = vectorizer.fit_transform(extract_lst)
#determine cluster number with silhouette coefficient
#start with 2 as a cluster size in case the set is very small
num_of_clusters_to_test = [2]
#going to test 25 more sizes in equal intervals based on the number of docs we are clustering
intervals_to_test = int(len(extracts) / 25)
#print(intervals_to_test)
num_of_clusters_to_test += [i for i in range(len(extracts)) if i % intervals_to_test == 0 and i != 0]
#these variables will help us determine the max silhouette
#iters_since_new_max is just being held so that if we aren't reaching optimal size for
#four iterations in a row, we dont have to keep testing huge cluster sizes
max_silhouette_coef = 0
iters_since_new_max = 0
good_size = 2
#cluster with a certain cluster size and record the silhouette coefficient
for size in num_of_clusters_to_test:
kmeans = KMeans(n_clusters=size).fit(tfidf_model)
label = kmeans.labels_
sil_coeff = silhouette_score(tfidf_model, label, metric='euclidean')
if sil_coeff > max_silhouette_coef:
max_silhouette_coef = sil_coeff
good_size = size
iters_since_new_max = 0
else:
iters_since_new_max += 1
if iters_since_new_max > 4:
break
# finally cluster for with the good size we want
km_model = KMeans(n_clusters=good_size)
km_model.fit(tfidf_model)
clustering = collections.defaultdict(list)
for idx, label in enumerate(km_model.labels_):
clustering[label].append(idx)
return clustering
left as much comment as i can to help you all follow what i am going for can anyone help me improve this

You know KMeans if for numeric data only, right. I mean, don't expect it to work on labeled data. With KMeans, you calculate the distance to the nearest centroid (cluster center) and add this point to this cluster. What is the 'distance' between apple, banana, and watermelon? It doesn't make sense! So, just make sure you are running your KMeans over numerics.
import numpy as np
import pandas as pd
from pylab import plot,show
from numpy import vstack,array
from scipy.cluster.vq import kmeans,vq
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import seaborn as sns
df = pd.read_csv('foo.csv')
# get only numeric fields from your dataframe
df = df.sample(frac=0.1, replace=True, random_state=1)
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
newdf = df.select_dtypes(include=numerics)
for col in newdf.columns:
print(col)
# your independent variables
X = newdf[['NumericField1','NumericField2','NumericField3','list_price']]
# your dependent variable
y = newdf['DependentVariable']
# take all numeric features from the corr exercise, and turn into an array
# so we can feed it into a cluetering algorythm
data = np.asarray(newdf)
X = data
# computing K-Means with K = 100 (100 clusters)
centroids,_ = kmeans(data,100)
# assign each sample to a cluster
idx,_ = vq(data,centroids)
# some plotting using numpy's logical indexing
plot(data[idx==0,0],data[idx==0,1],'ob',
data[idx==1,0],data[idx==1,1],'oy',
data[idx==2,0],data[idx==2,1],'or',
data[idx==3,0],data[idx==3,1],'og',
data[idx==4,0],data[idx==4,1],'om')
plot(centroids[:,0],centroids[:,1],'sg',markersize=8)
show()
details = [(name,cluster) for name, cluster in zip(df.brand,idx)]
for detail in details:
print(detail)
I've found Affinity Propogation to produce much tighter clusters than KMeans can achieve. Here is an example.
# Run Affinity Propogation Experiment
af = AffinityPropagation(preference=20).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)
# plt.scatter(X[:, 0], X[:, 1], s=50)
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle
plt.close('all')
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
class_members = labels == k
cluster_center = X[cluster_centers_indices[k]]
plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
markeredgecolor='k', markersize=14)
for x in X[class_members]:
plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
Try these concepts and see how you get along.

My neural network only returns 0, 1 or other constant value

I'm trying to create a network, that would help predict stock prices the following day. My input data are: open, high, low and close stock values, volume, index values, a few technical indicators and exchange rate; the output is closing price from the next day. I'm using data uploaded from Excel file.
I wrote a program, that I will paste below, but it doesn't seem to be working correctly. Network always returns 1, 0 or other constant value (between 0 - 1).
I took the following steps so far:
tried to normalise the data like so: X_norm = X/(10 ** d) where d is the smallest number for which this conditon is met: abs(X_norm) < 1. I did that for the whole set in Excel before dividing it into training and test.
shuffled the data before dividing it into training/test, so that learning examples are not from consecutive days
running the network on a smaller data set and on example data set (I generated random numbers and did a simple math using them for an output and tried running network with that)
changing amount of hidden neurons
chaninging number of iterations (up to a 1000, which was a lot for my computer considering the data set, so I didn't try any more because it would take too much time)
changing learning rate.
No matter what steps I took the outcome was always the same. I think my problem could be that I don't have a bias, but perhaps I also have other mistakes in my code that are contributing to this error.
My program:
import numpy as np
import pandas as pd
df = pd.read_excel(r"path", sheet_name="DATA", index_col=0, header=0)
df = df.to_numpy()
np.random.shuffle(df)
X_data = df[:, 0:15]
X_data = X_data.reshape(1000, 1, 15)
print(f"X_data: {X_data}")
Y_data = df[:, 15]
Y_data = Y_data.reshape(1000, 1, 1)
print(f"Y_data: {Y_data}")
X = X_data[0:801]
x_test = X_data[801:]
y = Y_data[0:801]
y_test = Y_data[801:]
print(f"X_train: {X}")
print(f"x_test: {x_test}")
print(f"Y_train: {y}")
print(f"y_test: {y_test}")
rate = 0.2
class NeuralNetwork:
def __init__(self):
self.input_neurons = 15
self.hidden1_neurons = 10
self.hidden2_neurons = 5
self.output_neuron = 1
self.input_to_hidden1_w = (np.random.random((self.input_neurons, self.hidden1_neurons))) # 14x30
self.hidden1_to_hidden2_w = (np.random.random((self.hidden1_neurons, self.hidden2_neurons))) # 30x20
self.hidden2_to_output_w = (np.random.random((self.hidden2_neurons, self.output_neuron))) # 20x1
def activation(self, x):
sigmoid = 1/(1+np.exp(-x))
return sigmoid
def activation_d(self, x):
derivative = x * (1 - x)
return derivative
def feed_forward(self, X):
self.z1 = np.dot(X, self.input_to_hidden1_w)
self.z1_a = self.activation(self.z1)
self.z2 = np.dot(self.z1_a, self.hidden1_to_hidden2_w)
self.z2_a = self.activation(self.z2)
self.z3 = np.dot(self.z2_a, self.hidden2_to_output_w)
output = self.activation(self.z3)
return output
def backward(self, X, y, rate, output):
error = y - output
z3_error_delta = error * self.activation_d(output)
z2_error = np.dot(z3_error_delta, np.transpose(self.hidden2_to_output_w))
z2_error_delta = z2_error * self.activation_d(self.z2)
z1_error = np.dot(z2_error_delta, np.transpose(self.hidden1_to_hidden2_w))
z1_error_delta = z1_error * self.activation_d(self.z1)
self.input_to_hidden1_w += rate * np.dot(np.transpose(X), z1_error_delta)
self.hidden1_to_hidden2_w += rate * np.dot(np.transpose(self.z1), z2_error_delta)
self.hidden2_to_output_w += rate * np.dot(np.transpose(self.z2), z3_error_delta)
def train(self, X, y):
output = self.feed_forward(X)
self.backward(X, y, rate, output)
def save_weights(self):
np.savetxt("w1.txt", self.input_to_hidden1_w, fmt="%s")
np.savetxt("w2.txt", self.hidden1_to_hidden2_w, fmt="%s")
np.savetxt("w3.txt", self.hidden2_to_output_w, fmt="%s")
def check(self, x_test, y_test):
self.feed_forward(x_test)
np.mean(np.square((y_test - self.feed_forward(x_test))))
Net = NeuralNetwork()
for l in range(100):
for i, pattern in enumerate(X):
for j, outcome in enumerate(y):
print(f"#: {l}")
print(f'''
# {str(l)}
# {str(X[i])}
# {str(y[j])}''')
print(f"Predicted output: {Net.feed_forward(X[i])}")
Net.train(X[i], y[j])
print(f"Error training: {(np.mean(np.square(y - Net.feed_forward(X))))}")
Net.save_weights()
for i, pattern in enumerate(x_test):
for j, outcome in enumerate(y_test):
Net.check(x_test[i], y_test[j])
print(f"Error test: {(np.mean(np.square(y_test - Net.feed_forward(x_test))))}")

KNN distance and class vote

Can you please tell me how to calculate distance between every point in my testData properly.
For now I am getting only one single value, whereas I should get distance from each point in data set and be able to assign it a class. I have to use numpy for this.
========================================================================
Now the problem is that I am getting this error and don't know how to fix it.
KeyError: 0
I am trying to obtain accuracy of classified labels.
Any ideas, please?
import matplotlib.pyplot as plt
import random
import numpy as np
import operator
from sklearn.cross_validation import train_test_split
# In[1]
def readFile():
f = open('iris.data', 'r')
d = np.dtype([ ('features',np.float,(4,)),('class',np.str_,20)])
data = np.genfromtxt(f, dtype = d ,delimiter=",")
dataPoints = data['features']
labels = data['class']
return dataPoints, labels
# In[2]
def normalizeData(dataPoints):
#normalize the data so the values will be between 0 and 1
dataPointsNorm = (dataPoints - dataPoints.min())/(dataPoints.max() - dataPoints.min())
return dataPointsNorm
def crossVal(dataPointsNorm):
# spliting for train and test set for crossvalidation
trainData, testData = train_test_split(dataPointsNorm, test_size=0.20, random_state=25)
return trainData, testData
def calculateDistance(trainData, testData):
#Euclidean distance calculation on numpy arrays
distance = np.sqrt(np.sum((trainData - testData)**2, axis=-1))
# Argsort sorts indices from closest to furthest neighbor, in ascending order
sortDistance = distance.argsort()
return distance, sortDistance
# In[4]
def classifyKnn(testData, trainData, labels, k):
# Calculating nearest neighbours and based on majority vote assigning the class
classCount = {}
for i in range(k):
distance, sortedDistIndices = calculateDistance(trainData, testData[i])
voteLabel = labels[sortedDistIndices][i]
#print voteLabel
classCount[voteLabel] = classCount.get(voteLabel,0)+1
print 'Class Count: ', classCount
# Sorting dictionary to return voted class
sortedClassCount = sorted(classCount.iteritems(), key = operator.itemgetter(1), reverse=True)
return sortedClassCount[0][0], classCount
def testAccuracy(testData, classCount):
correct = 0
for x in range(len(testData)):
print 'HERE !!!!!!!!!!!!!!'
if testData[x][-1] is classCount[x]:
correct += 1
return (correct/float(len(testData))) * 100.0
def main():
dataPoints, labels = readFile()
dataPointsNorm = normalizeData(dataPoints)
trainData, testData = crossVal(dataPointsNorm)
result, classCount = classifyKnn(testData, trainData, labels, 5)
print result
accuracy = testAccuracy(testData, classCount)
print accuracy
main()
I have it normalized, split into train and test calc distance (wrong).
Thanks for any tips.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

sample X examples from each class label - python

Pure numpy solution: def sample(X, y, samples): unique_ys = np.unique(y, axis=0) result = [] for unique_y in unique_ys: val_indices = np.argwhere(y==unique_y).flatten() random_samples = np.random.choice(val_indices, samples, replace=False) ret.append(X[random_samples]) return np.concatenate(result)

Related

Accuracy of KNN algorithm very low

How to do Constrained Linear Regression - scikit learn?

my algorithm gives bad clusters while usingTF-IDF

My neural network only returns 0, 1 or other constant value

KNN distance and class vote

Categories

Resources