Simple neural network with MNIST stuck at 17% misclassification error - Python

Hello, I'm putting my code up for inspection because I've been playing with my neural network implementation in Python for a few weeks, and I can't seem to get the misclassification error below 17% (sometimes 16%). I've been trying different learning rate values and different numbers of hidden neurons, and still there's not a lot of improvement. I'm well aware my implementation is a basic, traditional neural network, but I was expecting better results based on other implementations I've seen on the internet. I hope this is of interest to you guys; it would be really cool if you could point me to new ideas about what could be the problem in my code. Or maybe you think this is the best I can do with a traditional implementation and I should add something new; that would be cool too.
In any case, here is my code. I hope it's readable enough; I tried to keep it as simple as possible, since that's my way of understanding how neural networks work.
Edit: Perhaps my question was not clear enough. Basically, what I would like, if it's of interest to you guys, is help finding details in my current implementation that could push the misclassification error below 17%, because apparently that's the best my implementation can do. I would be very thankful for any advice or ideas. I'm deeply interested in this topic, but I'm a beginner, and it would be great to get some smart ideas that help me improve my implementation.
File: mnist_dataset.py - Extract the mnist data
import numpy as np
from struct import unpack

train_input_file = open("dataset/train-images-idx3-ubyte", "rb")
train_output_file = open("dataset/train-labels-idx1-ubyte", "rb")
test_input_file = open("dataset/t10k-images-idx3-ubyte", "rb")
test_output_file = open("dataset/t10k-labels-idx1-ubyte", "rb")

def readData(f, labels=False, scale=1):
    # IDX format: big-endian int32 header (magic number and item count,
    # plus row and column counts for image files), then unsigned bytes.
    header = hex(unpack('>L', np.fromfile(f, dtype=np.int32, count=1)[0])[0])
    num = int(unpack('>L', np.fromfile(f, dtype=np.int32, count=1)[0])[0])
    col = 1
    row = 1
    if labels == False:
        row = int(unpack('>L', np.fromfile(f, dtype=np.int32, count=1)[0])[0])
        col = int(unpack('>L', np.fromfile(f, dtype=np.int32, count=1)[0])[0])
    data = np.zeros((int(num/scale), col*row))
    for i in range(0, int(num/scale), 1):
        data[i] = np.fromfile(f, dtype=np.ubyte, count=col*row)
    return data

def getMNISTData():
    def norm(v):
        return v/255
    train_input = readData(train_input_file, scale=1)/255.0
    train_out = readData(train_output_file, True, scale=1)
    test_input = readData(test_input_file)/255.0
    test_out = readData(test_output_file, True)
    print "Train input: " + str(train_input.shape)
    print "Train output: " + str(train_out.shape)
    print "Test input: " + str(test_input.shape)
    print "Test output: " + str(test_out.shape)
    train_input_file.close()
    train_output_file.close()
    test_input_file.close()
    test_output_file.close()
    return (train_input, train_out, test_input, test_out)
File: NN.py - neural network implementation
import mnist_dataset
import numpy as np
import random
import matplotlib.pyplot as plt

def encode_data_10(v):
    # One-hot encode a digit label as a 1x10 row.
    e = (0.0) * np.ones((1, 10), dtype=float)
    e[:, int(v)] = 1.0
    return e.tolist()

def encode_data_1(v):
    # Map a digit label onto a single scalar target.
    n = -1.0 + ((0.2)*v)
    return n
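# For example: encode_data_10(3) -> [[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
# and encode_data_1(3) -> -0.4 (labels 0..9 map linearly onto [-1.0, 0.8]).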
x_train, y_train, x_test, y_test = mnist_dataset.getMNISTData()

learning_rate = 1.0
iter = 3000
sample_size = 30
num_hidden_neurons = 500
num_output_neurons = 10

if num_output_neurons > 1:
    y_train = np.matrix(np.array(map(encode_data_10, y_train)))
    y_test = np.matrix(np.array(map(encode_data_10, y_test)))
else:
    y_train = np.matrix(map(encode_data_1, y_train))
    y_test = np.matrix(map(encode_data_1, y_test))

def getSample(sample_size, x, y):
    # Draw a random mini-batch of sample_size rows from (x, y).
    r = random.sample(xrange(1, len(y), 1), sample_size)
    x_r = np.zeros((sample_size, x.shape[1]))
    y_r = np.zeros((sample_size, y.shape[1]))
    for i, n in enumerate(r):
        x_r[i] = x[n]
        y_r[i] = y[n]
    return (x_r, y_r)

inputVector, targetVector = getSample(sample_size, x_train, y_train)

hiddenWeights = np.mat(np.random.random((num_hidden_neurons, x_train.shape[1])))
print "W0 shape: " + str(hiddenWeights.shape)
outputWeights = np.mat(np.random.random((num_output_neurons, num_hidden_neurons)))
print "W1 shape: " + str(outputWeights.shape)

def act_func_l1(a):
    return (1.0/(1 + np.exp(-a)))

def der_act_func_l1(a):
    return act_func_l1(a)*(1.0 - act_func_l1(a))

def feedforward(l0):
    global hiddenWeights
    global outputWeights
    Z1 = l0 * hiddenWeights.T
    layer1 = np.matrix(act_func_l1(np.asarray(Z1)))
    Z2 = layer1 * outputWeights.T
    layer2 = act_func_l1(np.asarray(Z2))
    return (layer1, layer2)

def miss(x, y):
    # Returns the number of misclassified rows of x (a count out of len(y), not a percentage).
    layer1, layer2 = feedforward(x)
    def c(n):
        if n > 0.5:
            return 1.0
        else:
            return 0.0
    layer2 = map(lambda v: map(c, v), layer2)
    def cc(t):
        return np.abs(cmp(np.array(y[t[0]]).tolist()[0], np.array(t[1]).tolist()))
    return (np.sum(map(cc, enumerate(layer2))))
miss_x = np.zeros((iter, 1))
for j in xrange(iter):
    hiddenActualInput = inputVector * hiddenWeights.T
    hiddenOutputVector = np.matrix(act_func_l1(np.asarray(hiddenActualInput)))
    outputActualInput = hiddenOutputVector * outputWeights.T
    outputVector = act_func_l1(np.asarray(outputActualInput))
    layer2_error2 = np.square(outputVector - targetVector)
    print "Error: " + str(np.mean(np.abs(layer2_error2)))
    m = miss(x_test, y_test)
    miss_x[j] = m
    print str(j) + " - Misses (%): " + str(m)
    if m <= 2000:
        learning_rate = 0.05
    outputDelta = np.mat(der_act_func_l1(np.asarray(outputVector))*np.asarray(outputVector - targetVector))
    hiddenDelta = np.mat(der_act_func_l1(np.asarray(hiddenOutputVector)) * np.asarray((outputDelta*outputWeights)))
    hiddenWeights = np.mat(hiddenWeights.T - (learning_rate*np.asarray(inputVector.T*hiddenDelta))).T
    outputWeights = np.mat(outputWeights.T - (learning_rate*np.asarray(hiddenOutputVector.T*outputDelta))).T
    inputVector, targetVector = getSample(sample_size, x_train, y_train)

plt.plot(xrange(iter), miss_x, label='Miss rate(%)')
plt.legend(loc='upper right')
plt.show()

Related

Increasing validation loss from the very beginning

I've been doing a very simple binary cat/dog classification project with machine learning. I understand the problem of overfitting, but what's strange in my case is that the validation loss begins to rise from the very beginning. I've tried many different sets of hyperparameters, with L2 regularization, learning rate decay and stochastic gradient descent, and a large training set, but the issue remained. Here is the learning graph from one of the trials (the horizontal axis should be per 10 epochs):
The hyperparameters are: two hidden layers with 50 and 10 units, initial alpha = 0.05, alpha decay rate = 0.95 per 50 epochs, mini-batch size = 64, lambda = 0.05
Here are other sample learning graphs:
I developed my model on the basis of what's provided in Andrew Ng's Deep Learning Specialization, so I didn't expect many bugs. My full code, as required, is attached below:
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
from scipy import special

# Data preprocessing (the same for the dev set, which I omit here)
path = '/Users/bobby/Downloads/kagglecatsanddogs_3367a/PetImages'
train_set = []
img_size = 80
categories = ['dogs_train', 'cats_train']
epsilon = 1e-8

for category in categories:
    path_animal = os.path.join(path, category)
    for img in os.listdir(path_animal):
        try:
            img_array = cv2.imread(os.path.join(path_animal, img), cv2.IMREAD_GRAYSCALE)
            new_img_array = cv2.resize(img_array, (img_size, img_size))
            flattened_img_array = new_img_array.reshape(img_size*img_size)
            train_set.append([flattened_img_array, categories.index(category)])
        except:
            continue

import random
random.shuffle(train_set)

X_train = []
Y_train = []
for sample in train_set:
    X_train.append(sample[0])
    Y_train.append(sample[1])
X_train = (np.array(X_train).T)/255
Y_train = np.array(Y_train).reshape((1, np.array(Y_train).shape[0]))
def create_mini_batches(X, Y, mini_batch_size):
    m = X.shape[1]
    mini_batches = []
    num_mini_batches = m // mini_batch_size
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation]
    for i in range(num_mini_batches):
        select_X = shuffled_X[:, mini_batch_size*i : mini_batch_size*(i+1)]
        select_Y = shuffled_Y[:, mini_batch_size*i : mini_batch_size*(i+1)]
        mini_batch = (select_X, select_Y)
        mini_batches.append(mini_batch)
    if m % mini_batch_size != 0:
        last_X = shuffled_X[:, mini_batch_size*num_mini_batches:m]
        last_Y = shuffled_Y[:, mini_batch_size*num_mini_batches:m]
        last_mini_batch = (last_X, last_Y)
        mini_batches.append(last_mini_batch)
    return mini_batches
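# Shape check (illustrative): with X_train of shape (6400, m) and
# mini_batch_size = 64, each mini-batch is a ((6400, 64), (1, 64)) pair,
# except possibly the last one, which holds the m % 64 leftover columns.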
def initialize_parameters(layers_dims):
    L = len(layers_dims) # number of layers (including input layer), in this case L=4.
    parameters = {}
    for l in range(1, L): # range(1,4).
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * np.sqrt(2/layers_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters
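# Note: the np.sqrt(2/layers_dims[l-1]) factor is the "He" scaling commonly
# paired with ReLU-family activations; biases start at zero.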
def sigmoid(Z):
    A = special.expit(Z)
    return A, Z

def relu(Z):
    # Note: this is actually leaky ReLU (slope 0.01 for negative inputs),
    # matched by relu_gradient below.
    A = np.maximum(0.01*Z, Z)
    return A, Z
def forward_propagation(X, parameters):
    caches = [] #list containing Z for every node
    A = X
    L = int(len(parameters)/2)
    for l in range(1, L):
        A_prev = A
        W = parameters['W'+str(l)]
        b = parameters['b'+str(l)]
        Z = np.dot(W, A_prev) + b
        A, activation_cache = relu(Z) #activation_cache contains z[l].
        linear_cache = (A_prev, W, b) #linear_cache contains A[l-1], W[l], b[l].
        cache = (linear_cache, activation_cache)
        caches.append(cache)
    W = parameters['W'+str(L)]
    b = parameters['b'+str(L)]
    Z = np.dot(W, A) + b
    AL, activation_cache = sigmoid(Z)
    linear_cache = (A, W, b)
    cache = (linear_cache, activation_cache)
    caches.append(cache)
    return AL, caches
def compute_cost(AL, Y, parameters, lambd):
    m = Y.shape[1] # number of examples
    L = int(len(parameters)/2) # [6400,100,20,1] L=3 (0,1,2)
    reg_cost = 0
    for l in range(L):
        W = parameters['W' + str(l+1)]
        reg_cost += np.sum(np.square(W))
    J = (-1/m)*(np.sum(Y*np.log(AL+epsilon)+(1-Y)*np.log(1-AL+epsilon))) + (1/m) * (lambd/2) * reg_cost
    J = np.squeeze(J)
    return J
def linear_backward(dZ, linear_cache, lambd):
    A_prev, W, b = linear_cache
    m = A_prev.shape[1]
    dW = (1/m) * np.dot(dZ, A_prev.T) + (lambd/m)*W
    db = (1/m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    return dA_prev, dW, db

def relu_gradient(Z):
    dZ = np.where(Z > 0, 1, 0.01)
    return dZ

def sigmoid_gradient(Z):
    dZ = special.expit(Z)*(1-special.expit(Z))
    return dZ
def linear_activation_backward(dA, cache, lambd, A, Y, activation):
    linear_cache, activation_cache = cache
    if activation == 'relu':
        dZ = dA * relu_gradient(activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
    elif activation == 'sigmoid':
        # Sigmoid output + cross-entropy shortcut: dZ = AL - Y (dA is unused here).
        dZ = A - Y
        dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
    return dA_prev, dW, db
def L_model_backward(AL, Y, caches, lambd):
    grads = {}
    L = len(caches)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    cache_final_layer = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(_, cache_final_layer, lambd, AL, Y, activation='sigmoid')
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = linear_activation_backward(grads['dA' + str(l+1)], current_cache, lambd, _, _, activation='relu')
    return grads
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
    return parameters
def Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, layers_dims, learning_rate, num_epoch, mini_batch_size, lambd, k):
    mini_batches = create_mini_batches(X_train, Y_train, mini_batch_size) # [(X{1},Y{1}),(X{2},Y{2}),...,(X{n},Y{n})]
    costs_train = []
    costs_dev = []
    parameters = initialize_parameters(layers_dims)
    AL_dev, caches_dev = forward_propagation(X_dev, parameters)
    J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
    costs_dev.append(J_dev)
    for i in range(num_epoch):
        for mini_batch in mini_batches:
            (minibatch_X, minibatch_Y) = mini_batch
            AL, caches = forward_propagation(minibatch_X, parameters)
            J_train = compute_cost(AL, minibatch_Y, parameters, lambd)
            grads = L_model_backward(AL, minibatch_Y, caches, lambd)
            parameters = update_parameters(parameters, grads, learning_rate)
        if i % 10 == 0:
            costs_train.append(J_train)
            AL_dev, caches_dev = forward_propagation(X_dev, parameters)
            J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
            costs_dev.append(J_dev)
        if i % 100 == 0:
            print("Cost after epoch %i: %f" % (i, J_train))
        learning_rate = learning_rate * (k**(i/50))
    plt.plot(np.squeeze(costs_train), 'r')
    plt.plot(np.squeeze(costs_dev), 'b')
    plt.ylabel('cost')
    plt.xlabel('epochs (per thirties)')
    plt.show()
    return parameters, costs_train, costs_dev

parameters_updated, costs_train, costs_dev = Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, [6400, 50, 10, 1], 0.05, 1000, 64, 0.05, 0.95)
I would really be grateful to anyone who is patient enough to read through my code. If the problem is still overfitting, could you offer some advice on how to address it? I'm at a loss here because the validation loss goes up at a very early stage, so early stopping would cause underfitting by preventing the model from learning more deeply. Any advice would be appreciated.
When the validation loss starts to increase from the very beginning, like in the images you added, it means that there is something wrong in the model.
It's not clear what it is, as you didn't show your model.
You could check the following links that will help you:
Basic Cats vs Dogs Detailed Example in Colab
Detailed explanation for Over-fitting in TF Tutorial
or add your full code

How to fix broken data in feature extraction/pre-processing in speech recognition?

I am very new to machine learning. I stumbled on this source code on GitHub that has no database, so I decided to use my own. The code recognizes speakers with MFCC and GMM-UBM. But when I try to run it, I get this error: "ValueError: Found array with 1 sample(s) (shape=(1, 13)) while a minimum of 2 is required". It seems that when the code tries to fit the GMM on the 68th dataset, the MFCC shape of the data is broken. I assume something is wrong in the feature extraction process.
Please help me! Thank you very much.
Here's the code
import python_speech_features as psf
from sklearn.mixture import GaussianMixture
from sklearn.externals import joblib
from scipy.io import wavfile
from functools import reduce
import numpy as np
from os import listdir
from os.path import isfile, join
import os
import re
DATA_PATH = 'dataCoba'
# Make a list of speakers from the newdata/data folder. The format for the files in the folder is
# name_1.wav for training and name_2.wav for testing
substring = "_2"
onlyfiles = [f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f))]
onlyfiles.sort()
onlyones = []
for filename in onlyfiles:
    dups = re.search('[\w]+_2.wav', filename)
    #dups = re.search('[\w].wav', filename)
    if dups is None:
        onlyones.append(''.join(filename.split('_')[0]))

print(onlyones)
SPEAKERS = onlyones
TOTAL_SPEAKERS = len(SPEAKERS)
MODEL_SPEAKERS = len(SPEAKERS)
print(len(SPEAKERS))
class SpeakerRecognition:
    # Create a GMM and UBM model for each speaker. The GMM is modelled after the speaker and the UBM for each
    # speaker is modelled after all the other speakers. A likelihood ratio test is used to verify the speaker.
    def setGMMUBM(self, no_components):
        self.GMM = []
        self.UBM = []
        for i in range(MODEL_SPEAKERS):
            self.GMM.append(GaussianMixture(n_components=no_components, covariance_type='diag'))
            self.UBM.append(GaussianMixture(n_components=no_components, covariance_type='diag'))
    # Load in data from .wav files in data/
    # Extract mfcc (first 13 coefficients) from each audio sample
    def load_data(self):
        # training
        self.spk = [wavfile.read(DATA_PATH + '/' + (str(i).replace('.wav', '')) + '_1.wav') for i in SPEAKERS]
        self.spk_mfcc = [psf.mfcc(self.spk[i][1], self.spk[i][0]) for i in range(0, TOTAL_SPEAKERS)]
        # testing
        self.p_spk = [wavfile.read(DATA_PATH + '/' + (str(i).replace('.wav', '')) + '_2.wav') for i in SPEAKERS]
        self.p_spk_mfcc = [psf.mfcc(self.p_spk[i][1], self.p_spk[i][0]) for i in range(0, TOTAL_SPEAKERS)]
        print(self.spk_mfcc)

        for i in range(TOTAL_SPEAKERS):
            self.spk_train_size.append(len(self.spk_mfcc[i]))
            self.spk_start.append(len(self.total_mfcc))
            print("Speaker Number(train) = ", i)
            print("self.spk_mfcc[i] = ", len(self.spk_mfcc[i]))
            for mfcc in self.spk_mfcc[i]:
                self.total_mfcc.append(mfcc)
                self.speaker_label.append(i)
            self.spk_end.append(len(self.total_mfcc))
        print("self.total_mfcc = ", len(self.total_mfcc))
        print("\n")

        for i in range(TOTAL_SPEAKERS):
            #print("self.p_spk_mfcc =", self.p_spk_mfcc)
            self.spk_test_size.append(len(self.p_spk_mfcc[i]))
            self.spk_start.append(len(self.p_total_mfcc))
            print("Speaker Num(test) = ", i)
            print("self.p_spk_mfcc = ", len(self.p_spk_mfcc[i]))
            print("MFCC Shape = ", self.spk_mfcc[i].shape)
            for mfcc in self.p_spk_mfcc[i]:
                self.p_total_mfcc.append(mfcc)
                self.p_speaker_label.append(i)
            self.p_spk_end.append(len(self.p_total_mfcc))
        print("self.total_mfcc = ", len(self.p_total_mfcc))
        print("\n")
    # A Gaussian mixture model is made of a number of Gaussian distribution components.
    # To model the data, a suitable number of Gaussian components has to be selected.
    # There is no closed-form method for finding this; it is done by trial and error. This runs
    # the program for different component counts and records the accuracy for each one.
    def find_best_params(self):
        best_no_components = 1
        maxacc = 0
        for i in range(100, 256):
            self.setGMMUBM(i)
            self.fit_model()
            _, acc, _ = self.predict()
            print("Accuracy for n = {} is {}".format(i, acc))
            if acc > maxacc:
                maxacc = acc
                best_no_components = i
        return best_no_components
    # Fit the GMM UBM models with training data
    # fit = N data samples * data dimension
    def fit_model(self):
        for i in range(MODEL_SPEAKERS):
            print("Fit start for {}".format(i))
            self.GMM[i].fit(self.spk_mfcc[i])
            print(self.spk_mfcc[i].shape)
            self.UBM[i].fit(self.total_mfcc[:self.spk_start[i]] + self.total_mfcc[self.spk_end[i]:])
            print("Fit end for {}".format(i))
            joblib.dump(self.UBM[i], 'dumps/new/ubm' + str(i) + '.pkl')
            joblib.dump(self.GMM[i], 'dumps/new/gmm' + str(i) + '.pkl')
    def model(self, no_components=244):
        self.setGMMUBM(no_components)
        self.fit_model()

    # Predict the output for each model for each speaker and produce confusion matrix
    def load_model(self):
        for i in range(0, MODEL_SPEAKERS):
            self.GMM.append(joblib.load('dumps/new/gmm' + str(i) + '.pkl'))
            self.UBM.append(joblib.load('dumps/new/ubm' + str(i) + '.pkl'))
    def predict(self):
        avg_accuracy = 0
        confusion = [[0 for y in range(MODEL_SPEAKERS)] for x in range(TOTAL_SPEAKERS)]
        for i in range(TOTAL_SPEAKERS):
            for j in range(MODEL_SPEAKERS):
                x = self.GMM[j].score_samples(self.p_spk_mfcc[i]) - self.UBM[j].score_samples(self.p_spk_mfcc[i])
                for score in x:
                    if score > 0:
                        confusion[i][j] += 1
        confusion_diag = [confusion[i][i] for i in range(MODEL_SPEAKERS)]
        diag_sum = 0
        for item in confusion_diag:
            diag_sum += item
        remain_sum = 0
        for i in range(MODEL_SPEAKERS):
            for j in range(MODEL_SPEAKERS):
                if i != j:
                    remain_sum += confusion[i][j]
        spk_accuracy = 0
        for i in range(MODEL_SPEAKERS):
            best_guess, _ = max(enumerate(confusion[i]), key=lambda p: p[1])
            print("For Accent {}, best guess is {}".format(SPEAKERS[i], SPEAKERS[best_guess]))
            if i == best_guess:
                spk_accuracy += 1
        #print(MODEL_SPEAKERS)
        spk_accuracy /= MODEL_SPEAKERS
        avg_accuracy = diag_sum/(remain_sum + diag_sum)
        return confusion, avg_accuracy, spk_accuracy
    def __init__(self):
        self.test_spk = []
        self.test_mfcc = []
        # Speaker data and corresponding mfcc
        self.spk = []
        self.spk_mfcc = []
        self.p_spk = []
        self.p_spk_mfcc = []
        # Holds all the training mfccs of all speakers and
        # speaker_label is the speaker label for the corresponding mfcc
        self.total_mfcc = []
        self.speaker_label = []
        self.spk_train_size = [] # Index up to which is training data for that speaker.
        self.p_total_mfcc = []
        self.p_speaker_label = []
        #print(self.p_speaker_label)
        self.spk_test_size = []
        # Since the lengths of the audio files differ, spk_start and spk_end hold
        # each speaker's start and end indices into total_mfcc
        self.spk_start = []
        self.spk_end = []
        self.p_spk_start = []
        self.p_spk_end = []
        self.GMM = []
        self.UBM = []
        self.load_data()
        self.cepstral_mean_subtraction()
    # Cepstral Mean Subtraction (feature normalization step)
    def cepstral_mean_subtraction(self):
        for i, speaker_mfcc in enumerate(self.spk_mfcc):
            average = reduce(lambda acc, ele: acc + ele, speaker_mfcc)
            average = list(map(lambda x: x/len(speaker_mfcc), average))
            for j, feature_vector in enumerate(speaker_mfcc):
                for k, feature in enumerate(feature_vector):
                    self.spk_mfcc[i][j][k] -= average[k]
        for i, speaker_mfcc in enumerate(self.p_spk_mfcc):
            average = reduce(lambda acc, ele: acc + ele, speaker_mfcc)
            average = list(map(lambda x: x / len(speaker_mfcc), average))
            for j, feature_vector in enumerate(speaker_mfcc):
                for k, feature in enumerate(feature_vector):
                    self.p_spk_mfcc[i][j][k] -= average[k]
    #TBD : Ten fold validation
    def ten_fold(self):
        #fold_size = 0.1 * self.n
        fold_offset = 0.0
        accuracy_per_fold = 0
        average_accuracy = 0
        for i in range(0, 10):
            print("Fold start is {} and fold end is {} ".format(fold_offset, fold_offset + fold_size))
            #accuracy = self.execute(int(fold_offset), int(fold_offset + fold_size))
            #print("Accuracy is of test {} is : {} ".format(i, accuracy))
            #average_accuracy += accuracy
            #fold_offset += fold_size
        average_accuracy /= 10.0
        print("Average accuracy " + str(100 * average_accuracy))
        return average_accuracy
# Final result is a confusion matrix which represents the accuracy of the fit of the model
if __name__ == '__main__':
    SR = SpeakerRecognition()
    #SR.load_model()
    SR.setGMMUBM(no_components=13)
    #SR.find_best_params()
    SR.fit_model()
    confusion, mfcc_accuracy, spk_accuracy = SR.predict()
    print("Confusion Matrix")
    print(np.matrix(confusion))
    print("Accuracy in predicting speakers : {}".format(spk_accuracy))
    print("Accuracy in testing for MFCC : {}".format(mfcc_accuracy))

What is the next step when having completed first neural network? [closed]

Closed. This question is opinion-based. It is not currently accepting answers.
I'm a big fan of the youtube channel 3Blue1Brown and his series on Neural networks really got me excited on the subject.
I decided to create my own neural network in Python from scratch, engaging deeply in the mathematics. So, with the help of the MNIST database of handwritten digits, I got started and succeeded with the task after 2 weeks.
I have since then been further developing my code so that I can adjust the number of neurons and hidden layers neatly within the code.
I also experimented with different activation functions.
The best accuracy I've gotten is about 95% with 2 hidden layers of 16 neurons and 5 minutes of training.
Now, my question is fairly vague but I am now looking for the next challenge within the area, do you guys have any suggestions?
I now have the framework set up, so I'd love some new type of problem with a bigger dataset or something; or maybe I should work more on my existing problem to increase the accuracy of the output further?
What do you guys think?
Yours,
Emil
(Here's the code if anyone is interested)
import pickle
import gzip
import numpy as np
import random
import time
class mnistClass:
    def __init__(self, inputAmount=784, layers=2, layerSize=16, outputSize=10, loops=1, sampleSize=100):
        with gzip.open('mnist.pkl.gz', 'rb') as f:
            train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
        self.A, self.y = train_set
        self.V, self.v2 = valid_set
        self.dataSize = len(self.A)
        self.inputAmount = inputAmount
        self.layers = layers
        self.layerSize = layerSize
        self.outputSize = outputSize
        self.loops = loops
        self.sampleSize = sampleSize
        self.iterations = int(self.dataSize/self.sampleSize)
        self.clock = time.time()
        self.Weights = []
        self.Biases = []
        self.initializeArrays()
        self.initializeTraining()
        print("Accuracy: " + str(self.getAccuracy()) + "%")
    def initializeArrays(self):
        for i in range(self.layers):
            if self.layers - i > 2: # Adding middle layers
                self.Weights.append(np.random.rand(self.layerSize, self.layerSize)-0.5)
            if self.layers - i > 1:
                self.Biases.append(np.random.rand(self.layerSize)-0.5)
        if self.layers > 1:
            self.Weights.insert(0, np.random.rand(self.layerSize, self.inputAmount)-0.5)
            self.Weights.insert(len(self.Weights), np.random.rand(self.outputSize, self.layerSize)-0.5)
        else:
            self.Weights.insert(len(self.Weights), np.random.rand(self.outputSize, self.inputAmount)-0.5)
        self.Biases.insert(len(self.Biases), np.random.rand(self.outputSize)-0.5)
    def sigmoid(self, x, shiftType):
        if shiftType == 0:
            result = 1/(1+np.exp(-x))
        elif shiftType == 1:
            result = 2 * (1/(1+np.exp(-x))) - 1
        return result
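    # shiftType 0 is the standard logistic (output in (0, 1), used on the final layer);
    # shiftType 1 rescales it to (-1, 1), i.e. 2*sigmoid(x) - 1 = tanh(x/2), used on hidden layers.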
    def sigmoidPrime(self, x, shiftType):
        if shiftType == 0:
            result = self.sigmoid(x, 0) - self.sigmoid(x, 0)**2
        elif shiftType == 1:
            result = 2*np.exp(-x)/(1+np.exp(-x))**2
        return result
    def Rdependance(self, Z, layer1, layer2, multi=False): # How R depends on a preceding R
        multi = layer1-layer2 > 1
        if not multi:
            if layer1 == self.layers-1:
                shiftType = 0
            else:
                shiftType = 1
            R1_R2_differential = np.multiply(self.Weights[layer1], self.sigmoidPrime(Z[layer1]+self.Biases[layer1], shiftType)[:, np.newaxis])
            result = R1_R2_differential
        else:
            chainRule = []
            for i in reversed(range(layer2, layer1)):
                chainRule.append(self.Rdependance(Z, i+1, i))
            result = chainRule[0]
            for i in range(len(chainRule)-1):
                result = np.dot(result, chainRule[i+1])
        return result
    def RWdependance(self, R, Z, dataCaseNo, layer): # How R depends on connecting Weights
        if layer == self.layers-1:
            shiftType = 0
        else:
            shiftType = 1
        R_W_differential = self.Weights[layer]/self.Weights[layer] # elementwise: a matrix of ones the same shape as the weights
        mergeW_Z = np.multiply(R_W_differential, self.sigmoidPrime(Z[layer]+self.Biases[layer], shiftType)[:, np.newaxis])
        if layer == 0:
            R_W_differential = np.multiply(mergeW_Z.T, self.A[dataCaseNo][:, np.newaxis]).T
        else:
            R_W_differential = np.multiply(mergeW_Z.T, R[layer-1][:, np.newaxis]).T
        return R_W_differential
    def RBdependance(self, Z, layer): # How R depends on internal Biases
        if layer == self.layers-1:
            shiftType = 0
        else:
            shiftType = 1
        R_B_differential = np.multiply(self.Rdependance(Z, self.layers-1, layer).T, self.sigmoidPrime(Z[layer]+self.Biases[layer], shiftType)[:, np.newaxis]).T
        return R_B_differential
    def integralWeightCost(self, R, Z, dataCaseNo, quadDifferential, layer): # Cost of system for weights
        if layer == self.layers-1:
            nodes = np.identity(self.outputSize)
        else:
            nodes = self.Rdependance(Z, self.layers-1, layer)
        cost_differential = np.multiply(nodes, quadDifferential[:, np.newaxis])
        cost_differential = np.sum(cost_differential, 0)
        result = np.multiply(self.RWdependance(R, Z, dataCaseNo, layer), cost_differential[:, np.newaxis])
        return result

    def integralBiasCost(self, Z, quadDifferential, layer): # Cost of system for biases
        if layer == self.layers-1:
            nodes = np.identity(self.outputSize)
        else:
            nodes = self.RBdependance(Z, layer)
        cost_differential = np.multiply(nodes, quadDifferential[:, np.newaxis])
        result = np.sum(cost_differential, 0)
        return result
    def initializeTraining(self):
        for loop in range(self.loops):
            for iteration in range(self.iterations):
                avg_cost = 0
                avg_deltaWeights = []
                avg_deltaBiases = []
                for i in range(len(self.Weights)): # Creating zeros of weight arrays
                    avg_deltaWeights.append(self.Weights[i]*0)
                for i in range(len(self.Biases)):
                    avg_deltaBiases.append(self.Biases[i]*0)
                for dataCaseNo in range(iteration*self.sampleSize, iteration*self.sampleSize + self.sampleSize):
                    if self.layers == 1:
                        shiftType = 0
                    else:
                        shiftType = 1
                    Y1 = np.zeros(self.outputSize)
                    Y1[self.y[dataCaseNo]] = 1
                    Z = []
                    Z.append(np.dot(self.Weights[0], self.A[dataCaseNo]))
                    R = []
                    R.append(self.sigmoid(Z[0]+self.Biases[0], shiftType))
                    for i in range(1, self.layers):
                        if i == self.layers-1:
                            shiftType = 0
                        else:
                            shiftType = 1
                        Z.append(np.dot(self.Weights[i], R[i-1]))
                        R.append(self.sigmoid(Z[i]+self.Biases[i], shiftType))
                    C = np.sum((R[-1] - Y1)**2)
                    avg_cost += C
                    quadDifferential = 2 * (R[-1]-Y1)
                    for i in range(self.layers):
                        avg_deltaWeights[i] += self.integralWeightCost(R, Z, dataCaseNo, quadDifferential, i)
                        avg_deltaBiases[i] += self.integralBiasCost(Z, quadDifferential, i)
                avg_cost = avg_cost/self.sampleSize
                for i in range(self.layers):
                    self.Weights[i] = self.Weights[i] - avg_deltaWeights[i]/self.sampleSize
                    self.Biases[i] = self.Biases[i] - avg_deltaBiases[i]/self.sampleSize
                print("Average cost: " + str(round(avg_cost, 4)))
            print("\n" + "*"*25 + " " + str(loop+1) + " " + "*"*25 + "\n")
        executionEndTime = round((time.time() - self.clock), 2)
        print("Completed " + str(self.loops) + " rounds of " + str(self.sampleSize*self.iterations) + " samples (sampleSize: " + str(self.sampleSize) + "), " + " in " + str(executionEndTime) + " seconds..")
        print("Layers: " + str(self.layers))
        print("Middle layer nodes: " + str(self.layerSize))
        print("Input amount: " + str(self.inputAmount))
        amountVariables = 0
        for i in range(self.layers):
            amountVariables += self.Weights[i].size
            amountVariables += self.Biases[i].size
        print("Variables: " + str(amountVariables))
        print("Output size: " + str(self.outputSize))
        time.sleep(2)
    def getAccuracy(self):
        runs = 10000
        correct = 0
        print("Testing validation set accuracy over " + str(runs) + " samples...\n")
        for i in range(runs):
            if self.layers == 1:
                shiftType = 0
            else:
                shiftType = 1
            ran = i
            Y1 = np.zeros(self.outputSize)
            Y1[self.v2[ran]] = 1
            Z = []
            Z.append(np.dot(self.Weights[0], self.V[ran]))
            R = []
            R.append(self.sigmoid(Z[0]+self.Biases[0], shiftType))
            for j in range(1, self.layers):
                if j == self.layers-1:
                    shiftType = 0
                else:
                    shiftType = 1
                Z.append(np.dot(self.Weights[j], R[j-1]))
                R.append(self.sigmoid(Z[j]+self.Biases[j], shiftType))
            result = np.where(R[-1] == np.amax(R[-1]))
            maxNum = result[0][0]
            if int(self.v2[ran]) == int(maxNum):
                correct += 1
        accuracy = correct*100/runs
        return accuracy
instance = mnistClass(784, 3, 16, 10, 2, 100)
#(input, layers, layer size, output, loops, sample subsize)
#input - amount of nodes in input data
#layers - amount of layers including last output layer but not first input layer
#layer size - amount of nodes in hidden layers
#output - amount of nodes in output layer
#loops - how many times to train through the entire data set
#sample subsize - what quantity of data samples to average the gradient on
I'm so glad to hear about new faces joining the field of ML (specifically DL).
That's quite an accomplishment you've described, so first of all, salute.
Now as for your question, I'd suggest you take a step back and understand the concepts of data exploration and feature extraction, and why they are important. The way I suggest you do it is by exploring some Kaggle tutorials about machine learning and trying some basic classification of datasets from there, like the Titanic dataset, etc.
https://www.kaggle.com/learn/overview
Go for the "Intro to Machine Learning" course.
Best of luck!

Build prediction model from tensors in tensorflow without inputing data

I am trying to interface CasADi and TensorFlow. CasADi is a toolbox that uses symbolic variables and does automatic differentiation. It is often used for dynamic/static optimization problems.
I found an example where GPflow is used (https://web.casadi.org/blog/tensorflow/). In this case, the GP model is first trained on data as follows:
data = np.random.normal(loc=0.5,scale=1,size=(N,nd))
value = np.random.random((N,1))
model = gpflow.models.GPR(data, value, gpflow.kernels.Constant(nd) + gpflow.kernels.Linear(nd) + gpflow.kernels.White(nd) + gpflow.kernels.RBF(nd))
gpflow.train.ScipyOptimizer().minimize(model)
Then the prediction model is built, passing a tensor instead of the real values:
X = tf.placeholder(shape=(1,nd),dtype=np.float64)
[mean,_] = model._build_predict(X)
CasADi can then substitute real values by using a callback function that calls TensorFlow.
I want to use a tf.keras.Sequential() model instead of a GPflow model, since I want to implement a recurrent neural network. But for the Sequential model the method _build_predict(X) does not exist. I tried to use just predict, but I got the following error:
InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype double and shape [35039,1,8]
[[{{node Placeholder}}]]
Do you know what the equivalent is in this case?
Here is the complete code using GPflow:
from casadi import *
T = 10. # Time horizon
N = 20 # number of control intervals
# Declare model variables
x1 = MX.sym('x1')
x2 = MX.sym('x2')
x = vertcat(x1, x2)
u = MX.sym('u')
# Model equations
xdot = vertcat((1-x2**2)*x1 - x2 + u, x1)
# Formulate discrete time dynamics
if False:
    # CVODES from the SUNDIALS suite
    dae = {'x': x, 'p': u, 'ode': xdot}
    opts = {'tf': T/N}
    F = integrator('F', 'cvodes', dae, opts)
else:
    # Fixed step Runge-Kutta 4 integrator
    M = 4 # RK4 steps per interval
    DT = T/N/M
    f = Function('f', [x, u], [xdot])
    X0 = MX.sym('X0', 2)
    U = MX.sym('U')
    X = X0
    Q = 0
    for j in range(M):
        k1 = f(X, U)
        k2 = f(X + DT/2 * k1, U)
        k3 = f(X + DT/2 * k2, U)
        k4 = f(X + DT * k3, U)
        X = X + DT/6*(k1 + 2*k2 + 2*k3 + k4)
    F = Function('F', [X0, U], [X], ['x0', 'p'], ['xf'])
# Start with an empty NLP
w=[]
w0 = []
lbw = []
ubw = []
g=[]
lbg = []
ubg = []
# "Lift" initial conditions
Xk = MX.sym('X0', 2)
w += [Xk]
lbw += [0, 1]
ubw += [0, 1]
w0 += [0, 1]
# Formulate the NLP
for k in range(N):
    # New NLP variable for the control
    Uk = MX.sym('U_' + str(k))
    w += [Uk]
    lbw += [-1]
    ubw += [1]
    w0 += [0]

    # Integrate till the end of the interval
    Fk = F(x0=Xk, p=Uk)
    Xk_end = Fk['xf']

    # New NLP variable for state at end of interval
    Xk = MX.sym('X_' + str(k+1), 2)
    w += [Xk]
    lbw += [-0.25, -inf]
    ubw += [inf, inf]
    w0 += [0, 0]

    # Add equality constraint
    g += [Xk_end-Xk]
    lbg += [0, 0]
    ubg += [0, 0]
nd = N+1
import gpflow
import time
from tensorflow_casadi import TensorFlowEvaluator
class GPR(TensorFlowEvaluator):
    def __init__(self, model, session, opts={}):
        X = tf.placeholder(shape=(1, nd), dtype=np.float64)
        [mean, _] = model._build_predict(X)
        mean = tf.reshape(mean, (1, 1))
        TensorFlowEvaluator.__init__(self, [X], [mean], session, opts)
        self.counter = 0
        self.time = 0

    def eval(self, arg):
        self.counter += 1
        t0 = time.time()
        ret = TensorFlowEvaluator.eval(self, arg)
        self.time += time.time()-t0
        return [ret]
# Create
np.random.seed(0)
data = np.random.normal(loc=0.5,scale=1,size=(N,nd))
value = np.random.random((N,1))
model = gpflow.models.GPR(data, value, gpflow.kernels.Constant(nd) + gpflow.kernels.Linear(nd) + gpflow.kernels.White(nd) + gpflow.kernels.RBF(nd))
gpflow.train.ScipyOptimizer().minimize(model)
import tensorflow as tf
with tf.Session() as session:
    model.initialize()
    GPR = GPR(model, session)

    w = vertcat(*w)

    # Create an NLP solver
    prob = {'f': GPR(w[0::3]), 'x': w, 'g': vertcat(*g)}
    options = {"ipopt": {"hessian_approximation": "limited-memory"}}
    solver = nlpsol('solver', 'ipopt', prob, options)

    # Solve the NLP
    sol = solver(x0=w0, lbx=lbw, ubx=ubw, lbg=lbg, ubg=ubg)
    print("Ncalls", GPR.counter)
    print("Total time [s]", GPR.time)
w_opt = sol['x'].full().flatten()
# Plot the solution
x1_opt = w_opt[0::3]
x2_opt = w_opt[1::3]
u_opt = w_opt[2::3]
tgrid = [T/N*k for k in range(N+1)]
import matplotlib.pyplot as plt
plt.figure(1)
plt.clf()
plt.plot(tgrid, x1_opt, '--')
plt.plot(tgrid, x2_opt, '-')
plt.step(tgrid, vertcat(DM.nan(1), u_opt), '-.')
plt.xlabel('t')
plt.legend(['x1','x2','u'])
plt.grid()
plt.show()
And the class TensorFlowEvaluator:
import casadi
import tensorflow as tf
class TensorFlowEvaluator(casadi.Callback):
    def __init__(self, t_in, t_out, session, opts={}):
        """
        t_in: list of inputs (tensorflow placeholders)
        t_out: list of outputs (tensors dependent on those placeholders)
        session: a tensorflow session
        """
        casadi.Callback.__init__(self)
        assert isinstance(t_in, list)
        self.t_in = t_in
        assert isinstance(t_out, list)
        self.t_out = t_out
        self.construct("TensorFlowEvaluator", opts)
        self.session = session
        self.refs = []

    def get_n_in(self): return len(self.t_in)
    def get_n_out(self): return len(self.t_out)

    def get_sparsity_in(self, i):
        return casadi.Sparsity.dense(*self.t_in[i].get_shape().as_list())

    def get_sparsity_out(self, i):
        return casadi.Sparsity.dense(*self.t_out[i].get_shape().as_list())

    def eval(self, arg):
        # Associate each tensorflow input with the numerical argument passed by CasADi
        d = dict((v, arg[i].toarray()) for i, v in enumerate(self.t_in))
        # Evaluate the tensorflow expressions
        ret = self.session.run(self.t_out, feed_dict=d)
        return ret

    # Vanilla tensorflow offers just the reverse mode AD
    def has_reverse(self, nadj): return nadj == 1

    def get_reverse(self, nadj, name, inames, onames, opts):
        # Construct tensorflow placeholders for the reverse seeds
        adj_seed = [tf.placeholder(shape=self.sparsity_out(i).shape, dtype=tf.float64) for i in range(self.n_out())]
        # Construct the reverse tensorflow graph through 'gradients'
        grad = tf.gradients(self.t_out, self.t_in, grad_ys=adj_seed)
        # Create another TensorFlowEvaluator object
        callback = TensorFlowEvaluator(self.t_in+adj_seed, grad, self.session)
        # Make sure you keep a reference to it
        self.refs.append(callback)
        # Package it in the nominal_in+nominal_out+adj_seed form that CasADi expects
        nominal_in = self.mx_in()
        nominal_out = self.mx_out()
        adj_seed = self.mx_out()
        return casadi.Function(name, nominal_in+nominal_out+adj_seed, callback.call(nominal_in+adj_seed), inames, onames)
if __name__ == "__main__":
    from casadi import *

    a = tf.placeholder(shape=(2, 2), dtype=tf.float64)
    b = tf.placeholder(shape=(2, 1), dtype=tf.float64)
    y = tf.matmul(tf.sin(a), b)

    with tf.Session() as session:
        f_tf = TensorFlowEvaluator([a, b], [y], session)

        a = MX.sym("a", 2, 2)
        b = MX.sym("a", 2, 1)
        y = f_tf(a, b)
        yref = mtimes(sin(a), b)

        f = Function('f', [a, b], [y])
        fref = Function('f', [a, b], [yref])
        print(f(DM([[1, 2], [3, 4]]), DM([[1], [3]])))
        print(fref(DM([[1, 2], [3, 4]]), DM([[1], [3]])))

        f = Function('f', [a, b], [jacobian(y, a)])
        fref = Function('f', [a, b], [jacobian(yref, a)])
        print(f(DM([[1, 2], [3, 4]]), DM([[1], [3]])))
        print(fref(DM([[1, 2], [3, 4]]), DM([[1], [3]])))
And here is my attempt:
# design network
model = tf.keras.Sequential()
LSTM = tf.keras.layers.LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2]))
model.add(LSTM) #, input_shape=(train_X.shape[1], train_X.shape[2]))
model.add(tf.keras.layers.Dense(1))
model.compile(loss='mae', optimizer='adam')

# fit network
history = model.fit(train_X, train_y, epochs=50, batch_size=72, validation_data=(test_X, test_y), verbose=0, shuffle=False)

with tf.Session() as session:
    testXshape = test_X.shape
    GPR = GPR(model, session, testXshape)
Thanks!
I've left the TensorFlowEvaluator the same and created the GPR class this way:
class ValFcn(TensorFlowEvaluator):
    import tensorflow as tf

    def __init__(self, NN, session, opts={}):
        self.X = self.tf.placeholder(shape=(1, 4), dtype=self.tf.float32)
        self.output = NN(self.X)
        TensorFlowEvaluator.__init__(self, [self.X], [self.output], session, opts)

    def eval(self, arg):
        ret = TensorFlowEvaluator.eval(self, arg)
        return ret
I was working with float32, so I had to change it there and in the TensorFlowEvaluator.
I'm actually using this model as a cost function term for an OCP.
Hope it works!
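For reference, a minimal usage sketch of this class, mirroring the GPR construction earlier in the thread (the session handling and the (1, 4) input shape are assumptions carried over from the class above; variable names are hypothetical):
# Sketch (TF1-style graph mode, as in the rest of the thread): wrap the
# trained Keras model and use it as a symbolic CasADi cost term.
import tensorflow as tf
from casadi import MX

session = tf.keras.backend.get_session() # session that holds the trained Keras weights
V = ValFcn(model, session)               # model: the trained tf.keras network
x = MX.sym('x', 1, 4)                    # matches the (1, 4) placeholder above
J = V(x)                                 # symbolic expression usable in an NLP/OCP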

Error in Naive Bayes classifier

I'm a beginner in machine learning and I'm trying to implement my first Naive Bayes classifier by myself for better understanding. So, I have a dataset from http://archive.ics.uci.edu/ml/datasets/Adult (American census data; the classes are '<=50K' and '>50K').
Here is my Python code:
#!/usr/bin/python
import sys
import csv
words_stats = {} # {'word': {'class1': cnt, 'class2': cnt'}}
words_cnt = 0
targets_stats = {} # {'class1': 3234, 'class2': 884} how many words in each class
class_stats = {} # {'class1': 7896, 'class2': 3034} how many lines in each class
items_cnt = 0
def train(dataset, targets):
    global words_stats, words_cnt, targets_stats, items_cnt, class_stats
    num = len(dataset)
    for item in xrange(num):
        class_stats[targets[item]] = class_stats.get(targets[item], 0) + 1
        for i in xrange(len(dataset[item])):
            word = dataset[item][i]
            if not words_stats.has_key(word):
                words_stats[word] = {}
            tgt = targets[item]
            cnt = words_stats[word].get(tgt, 0)
            words_stats[word][tgt] = cnt + 1
            targets_stats[tgt] = targets_stats.get(tgt, 0) + 1
            words_cnt += 1
    items_cnt = num
def classify(doc, tgt_set):
    global words_stats, words_cnt, targets_stats, items_cnt
    probs = {} # the probability itself P(c|W) = P(W|c) * P(c) / P(W)
    pc = {}    # probability of the class in the document set P(c)
    pwc = {}   # probability of the word set in a particular class P(W|c)
    pw = 1     # probability of the word set in the document set
    for word in doc:
        if word not in words_stats:
            continue # dirty, very dirty
        pw = pw * float(sum(words_stats[word].values())) / words_cnt
    for tgt in tgt_set:
        pc[tgt] = class_stats[tgt] / float(items_cnt)
        for word in doc:
            if word not in words_stats:
                continue # dirty, very dirty
            tgt_wrd_cnt = words_stats[word].get(tgt, 0)
            pwc[tgt] = pwc.get(tgt, 1) * float(tgt_wrd_cnt) / targets_stats[tgt]
        probs[tgt] = (pwc[tgt] * pc[tgt]) / pw
    l = sorted(probs.items(), key=lambda i: i[1], reverse=True)
    print probs
    return l[0][0]
def check_results(dataset, targets):
    num = len(dataset)
    tgt_set = set(targets)
    correct = 0
    incorrect = 0
    for item in xrange(num):
        res = classify(dataset[item], tgt_set)
        if res == targets[item]:
            correct = correct + 1
        else:
            incorrect = incorrect + 1
    print 'correct:', float(correct) / num, ' incorrect:', float(incorrect) / num
def load_data(fil):
    data = []
    tgts = []
    reader = csv.reader(fil)
    for line in reader:
        d = [x.strip() for x in line]
        if '?' in d:
            continue
        if not len(d):
            continue
        data.append(d[:-1])
        tgts.append(d[-1:][0])
    return data, tgts
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print './program train_data.txt test_data.txt'
        sys.exit(1)
    filename = sys.argv[1]
    fil = open(filename, 'r')
    data, tgt = load_data(fil)
    train(data, tgt)
    test_file = open(sys.argv[2], 'r')
    test_data, test_tgt = load_data(test_file)
    check_results(test_data, tgt)
It gives ~61% correct results. When I print the probabilities, I get the following:
{'<=50K': 0.07371606889800396, '>50K': 15.325378327213354}
But in the case of a correct classifier I would expect to see the sum of both probabilities equal to 1.
At first I thought the problem was float underflow and tried to do all the calculations in logarithms, but the results were similar.
I understand that omitting some words will affect accuracy, but the probabilities are just so wrong.
What do I do wrong or not understand?
For your convenience I've uploaded the dataset and the Python script here:
https://dl.dropboxusercontent.com/u/36180992/adult.tar.gz
Thank you for your help.
Naive Bayes doesn't compute a probability directly; rather, it computes a "raw score" that is compared against the other labels' scores in order to classify an instance. This score can easily be converted to a "probability" in the range [0, 1]:
total = sum(probs.itervalues())
for label, score in probs.iteritems():
    probs[label] = score / total
However, keep in mind this still doesn't represent a true probability, as mentioned in this answer:
naive Bayes tends to predict probabilities that are almost always either very close to zero or very close to one.
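Applied to the scores quoted in the question, this normalization gives:
probs = {'<=50K': 0.07371606889800396, '>50K': 15.325378327213354}
total = sum(probs.itervalues())
for label, score in probs.iteritems():
    probs[label] = score / total
print probs # {'<=50K': ~0.00479, '>50K': ~0.99521}; the two scores now sum to 1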
