Hi, I've been working on a neural network to tackle the MNIST dataset. When I run the code, the accuracy begins to increase but eventually drops to 0.098, and I also encounter an overflow error in exp when calculating the softmax values. I have tried to debug my code but I don't understand where I'm going wrong. If anyone can point me in the right direction that would be great, and if you can't find an error, could you give me any tips on techniques to debug this? Thanks in advance.
import numpy as np
import pandas as pd
df = pd.read_csv('../input/digit-recognizer/train.csv')
data = np.array(df.values)
data = data.T
data
Y = data[0,:]
X = data[1:,:]
Y_train = Y[:41000]
X_train = X[:,:41000]
X_train = X_train/255
Y_val = Y[41000:]
X_val = X[:,41000:]
X_val = X_val/255
print(np.max(X_train))
class NeuralNetwork:
    def __init__(self, n_in, n_out):
        self.w1, self.b1 = self.Generate_Weights_Biases(10, 784)
        self.w2, self.b2 = self.Generate_Weights_Biases(10, 10)

    def Generate_Weights_Biases(self, n_in, n_out):
        weights = 0.01*np.random.randn(n_in, n_out)
        biases = np.zeros((n_in, 1))
        return weights, biases

    def forward(self, X):
        self.Z1 = self.w1.dot(X) + self.b1
        self.a1 = self.ReLu(self.Z1)
        self.z2 = self.w2.dot(self.a1) + self.b2
        y_pred = self.Softmax(self.z2)
        return y_pred

    def ReLu(self, Z):
        return np.maximum(Z, 0)

    def Softmax(self, Z):
        #exponentials = np.exp(Z)
        #sumexp = np.sum(np.exp(Z), axis=0)
        #print(Z)
        return np.exp(Z)/np.sum(np.exp(Z))

    def ReLu_Derv(self, x):
        return np.greaterthan(x, 0).astype(int)

    def One_hot_encoding(self, Y):
        one_hot = np.zeros((Y.size, 10))
        rows = np.arange(Y.size)
        one_hot[rows, Y] = 1
        one_hot = one_hot.T
        return one_hot

    def Get_predictions(self, y_pred):
        return np.argmax(y_pred, 0)

    def accuracy(self, pred, Y):
        return np.sum(pred == Y)/Y.size

    def BackPropagation(self, X, Y, y_pred, lr=0.01):
        m = Y.size
        one_hot_y = self.One_hot_encoding(Y)
        e2 = y_pred - one_hot_y
        derW2 = (1/m) * e2.dot(self.a1.T)
        derB2 = (1/m) * e2
        #derB2 = derB2.reshape(10,1)
        e1 = self.w2.T.dot(e2) * self.ReLu(self.a1)
        derW1 = (1/m) * e1.dot(X.T)
        derB1 = (1/m) * e1
        #derB1 = derB1.reshape(10,1)
        self.w1 = self.w1 - lr*derW1
        self.b1 = self.b1 - lr*np.sum(derB1, axis=1, keepdims=True)
        self.w2 = self.w2 - lr*derW2
        self.b2 = self.b2 - lr*np.sum(derB2, axis=1, keepdims=True)

    def train(self, X, Y, epochs=1000):
        for i in range(epochs):
            y_pred = self.forward(X)
            predict = self.Get_predictions(y_pred)
            accuracy = self.accuracy(predict, Y)
            print(accuracy)
            self.BackPropagation(X, Y, y_pred)
        return self.w1, self.b1, self.w2, self.b2
NN = NeuralNetwork(X_train, Y_train)
w1,b1,w2,b2 = NN.train(X_train,Y_train)
I found the following errors:
Your softmax implementation doesn't work because of the severe numerical errors you get when exponentiating potentially large numbers to obtain something between 0 and 1 (that is your overflow in exp). Besides, you forgot to specify the summation axis in the denominator. Here is a working implementation:
def Softmax(self, Z):
    e = np.exp(Z - Z.max(axis=0, keepdims=True))
    return e/e.sum(axis=0, keepdims=True)
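To see the difference, here is a quick check with made-up logits (shape (classes, samples), as in this network). The naive version overflows for large values, while the shifted version stays finite and every column still sums to 1:

import numpy as np

Z = np.array([[1000.0, -5.0],
              [ 990.0,  3.0]])                 # hypothetical large logits
naive = np.exp(Z) / np.sum(np.exp(Z), axis=0)  # exp overflows to inf, first column becomes nan
e = np.exp(Z - Z.max(axis=0, keepdims=True))
stable = e / e.sum(axis=0, keepdims=True)
print(stable.sum(axis=0))                      # [1. 1.]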
(Here and below I skip coding-style remarks that are not essential in this context, such as the fact that this could just as well be a class method or a stand-alone function.)
Your ReLu derivative implementation doesn't work for me at all: np.greaterthan is not a NumPy function (you probably meant np.greater). This one works:
def ReLu_Derv(self, x):
    return (x > 0).astype(int)
You need to actually use this implementation in BackPropagation:
e1 = self.w2.T.dot(e2) * self.ReLu_Derv(self.a1)
With these amendments, I managed to achieve 91.0% accuracy after 100 iterations with LR=0.1. I loaded MNIST from Keras with this code:
import tensorflow as tf

(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()
X = train_images.reshape(-1, 28*28).T
Y = train_labels
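For reference, here is a minimal sketch of how that run could be wired up end to end. It assumes the pixels are scaled to [0, 1] as in the question's own preprocessing, and that train/BackPropagation are adjusted to use 100 iterations and lr=0.1 instead of the defaults:

X = X / 255.0                               # scale pixel values to [0, 1]
NN = NeuralNetwork(X, Y)
# assumes train() passes the learning rate through, e.g. self.BackPropagation(X, Y, y_pred, lr=0.1)
w1, b1, w2, b2 = NN.train(X, Y, epochs=100)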
Hi, I'm trying to train my own neural network design on the MNIST handwritten digit data set. Every time I run this code, the accuracy starts to increase and then decreases, and I get an overflow warning. Can someone explain whether my code is just poor and messy, or whether I have missed some small thing? Thanks in advance.
import numpy as np
import pandas as pd
df = pd.read_csv('../input/digit-recognizer/train.csv')
data = np.array(df.values)
data = data.T
data
Y = data[0,:]
X = data[1:,:]
Y_train = Y[:41000]
X_train = X[:,:41000]
X_train = X_train/255
Y_val = Y[41000:]
X_val = X[:,41000:]
X_val = X_val/255
print(np.max(X_train))
class NeuralNetwork:
    def __init__(self, n_in, n_out):
        self.w1, self.b1 = self.Generate_Weights_Biases(10, 784)
        self.w2, self.b2 = self.Generate_Weights_Biases(10, 10)

    def Generate_Weights_Biases(self, n_in, n_out):
        weights = 0.01*np.random.randn(n_in, n_out)
        biases = np.zeros((n_in, 1))
        return weights, biases

    def forward(self, X):
        self.Z1 = self.w1.dot(X) + self.b1
        self.a1 = self.ReLu(self.Z1)
        self.z2 = self.w2.dot(self.a1) + self.b1
        y_pred = self.Softmax(self.z2)
        return y_pred

    def ReLu(self, Z):
        return np.maximum(Z, 0)

    def Softmax(self, Z):
        #exponentials = np.exp(Z)
        #sumexp = np.sum(np.exp(Z), axis=0)
        #print(Z)
        return np.exp(Z)/np.sum(np.exp(Z))

    def ReLu_Derv(self, x):
        return np.greaterthan(x, 0).astype(int)

    def One_hot_encoding(self, Y):
        one_hot = np.zeros((Y.size, 10))
        rows = np.arange(Y.size)
        one_hot[rows, Y] = 1
        one_hot = one_hot.T
        return one_hot

    def Get_predictions(self, y_pred):
        return np.argmax(y_pred, 0)

    def accuracy(self, pred, Y):
        return np.sum(pred == Y)/Y.size

    def BackPropagation(self, X, Y, y_pred, lr=0.01):
        m = Y.size
        one_hot_y = self.One_hot_encoding(Y)
        e2 = y_pred - one_hot_y
        derW2 = (1/m) * e2.dot(self.a1.T)
        derB2 = (1/m) * np.sum(e2, axis=1)
        derB2 = derB2.reshape(10, 1)
        e1 = self.w2.T.dot(e2) * self.ReLu(self.a1)
        derW1 = (1/m) * e1.dot(X.T)
        derB1 = (1/m) * np.sum(e1, axis=1)
        derB1 = derB1.reshape(10, 1)
        self.w1 = self.w1 - lr*derW1
        self.b1 = self.b1 - lr*derB1
        self.w2 = self.w2 - lr*derW2
        self.b2 = self.b2 - lr*derB2

    def train(self, X, Y, epochs=1000):
        for i in range(epochs):
            y_pred = self.forward(X)
            predict = self.Get_predictions(y_pred)
            accuracy = self.accuracy(predict, Y)
            print(accuracy)
            self.BackPropagation(X, Y, y_pred)
        return self.w1, self.b1, self.w2, self.b2
NN = NeuralNetwork(X_train, Y_train)
w1,b1,w2,b2 = NN.train(X_train,Y_train)
You should use a different bias for the second layer
self.z2 = self.w2.dot(self.a1) + self.b1 # not b1
self.z2 = self.w2.dot(self.a1) + self.b2 # but b2
When doing something like this
derB2 = (1/m) * np.sum(e2, axis=1)
you should pass keepdims=True, so that derB2.shape is (something, 1) rather than (something,). It makes your code more rigorous.
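For example (with a random array standing in for e2):

import numpy as np

e2 = np.random.randn(10, 41000)                   # same shape as the error term in this code
print(np.sum(e2, axis=1).shape)                   # (10,)   -> broadcasts in surprising ways
print(np.sum(e2, axis=1, keepdims=True).shape)    # (10, 1) -> matches the (10, 1) bias shape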
I'm working on signal compression and reconstruction with a VAE. I've trained on 1600 fragments, but the values of the 1600 reconstructed signals are all very similar. Moreover, results from the same batch are almost identical. Since I'm using a VAE, the loss function of the model contains binary cross-entropy (BCE), and the output of the trained model should lie between 0 and 1 (the input data is also normalized to 0~1).
VAE model (LSTM):
class LSTM_VAE(nn.Module):
    def __init__(self,
                 input_size=3000,
                 hidden=[1024, 512, 256, 128, 64],
                 latent_size=64,
                 num_layers=8,
                 bidirectional=True):
        super().__init__()
        self.input_size = input_size
        self.hidden = hidden
        self.latent_size = latent_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.actv = nn.LeakyReLU()
        self.encode = nn.LSTM(input_size=self.input_size,
                              hidden_size=self.hidden[0],
                              num_layers=self.num_layers,
                              batch_first=True,
                              bidirectional=True)
        self.bn_encode = nn.BatchNorm1d(1)
        self.decode = nn.LSTM(input_size=self.latent_size,
                              hidden_size=self.hidden[2],
                              num_layers=self.num_layers,
                              batch_first=True,
                              bidirectional=True)
        self.bn_decode = nn.BatchNorm1d(1)
        self.fc1 = nn.Linear(self.hidden[0]*2, self.hidden[1])
        self.fc2 = nn.Linear(self.hidden[1], self.hidden[2])
        self.fc31 = nn.Linear(self.hidden[2], self.latent_size)
        self.fc32 = nn.Linear(self.hidden[2], self.latent_size)
        self.bn1 = nn.BatchNorm1d(1)
        self.bn2 = nn.BatchNorm1d(1)
        self.bn3 = nn.BatchNorm1d(1)
        self.fc4 = nn.Linear(self.hidden[2]*2, self.hidden[1])
        self.fc5 = nn.Linear(self.hidden[1], self.hidden[0])
        self.fc6 = nn.Linear(self.hidden[0], self.input_size)
        self.bn4 = nn.BatchNorm1d(1)
        self.bn5 = nn.BatchNorm1d(1)
        self.bn6 = nn.BatchNorm1d(1)

    def encoder(self, x):
        x = torch.unsqueeze(x, 1)
        x, _ = self.encode(x)
        x = self.actv(x)
        x = self.fc1(x)
        x = self.actv(x)
        x = self.fc2(x)
        x = self.actv(x)
        mu = self.fc31(x)
        log_var = self.fc32(x)
        return mu, log_var

    def decoder(self, z):
        z, _ = self.decode(z)
        z = self.bn_decode(z)
        z = self.actv(z)
        z = self.fc4(z)
        z = self.bn4(z)
        z = self.fc5(z)
        z = self.bn5(z)
        z = self.fc6(z)
        z = self.bn6(z)
        z = torch.sigmoid(z)
        return torch.squeeze(z)

    def sampling(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        mu, log_var = self.encoder(x.view(-1, self.input_size))
        z = self.sampling(mu, log_var)
        z = self.decoder(z)
        return z, mu, log_var
Loss function and training code:
def lossF(recon_x, x, mu, logvar, input_size):
    BCE = F.binary_cross_entropy(recon_x, x.view(-1, input_size), reduction='sum')
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE + KLD

optim = torch.optim.Adam(model.parameters(), lr=opt.lr)

for epoch in range(opt.epoch):
    for batch_idx, data in enumerate(train_set):
        data = data.to(device)
        optim.zero_grad()
        recon_x, mu, logvar = model(data)
        loss = lossF(recon_x, data, mu, logvar, opt.input_size)
        loss.backward()
        train_loss += loss.item()
        optim.step()
I built the code by referring to other people's example code and only changed a few parameters. I rebuilt the code, changed the dataset, and updated parameters, but nothing worked. If you have any suggestion to solve this problem, PLEASE let me know.
I've found out the reason for the issue. It turns out that the decoder learns to keep its output values in the range of 0.4 to 0.6 to stabilize the BCE loss. BCE loss can't be 0 even if the prediction matches the target exactly, and the loss value is non-linear across the range of the output. The easiest way to lower the loss is to output 0.5 everywhere, and that is exactly what my model did.
To avoid this, I standardized my data and added some outlier data to work around the BCE issue. A VAE is such a complicated network, for sure.
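A tiny illustration of that BCE "floor" (with a made-up target vector, not my data): even a perfect reconstruction has a clearly non-zero loss, and a constant 0.5 output is not much worse, which is why the decoder can get stuck there:

import torch
import torch.nn.functional as F

target = torch.tensor([0.2, 0.5, 0.7, 0.9])                      # hypothetical normalized signal values
perfect = F.binary_cross_entropy(target, target)                 # prediction exactly equals the target
constant = F.binary_cross_entropy(torch.full_like(target, 0.5), target)
print(perfect.item(), constant.item())                           # roughly 0.53 vs 0.69, both well above 0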
I have created a multi-layer model, and now I would like to train it with hundreds of values so it can predict outputs from different inputs. But how should I feed in those inputs? For now I tried to make an array of arrays and feed inputs and outputs one by one using the training function. But it seems that the second time around it re-teaches itself and then only predicts the second answer correctly. Maybe I don't understand the concept?
import tensorflow as tf
import numpy as np

print("TensorFlow version: {}".format(tf.__version__))
print("Eager execution: {}".format(tf.executing_eagerly()))

x = np.array([[[10, 10, 30, 20], [20, 10, 20, 10],]])
y = np.array([[[10, 10, 100, 10], [100, 10, 10, 10],]])

class Model(object):
    def __init__(self, x, y):
        # get random values.
        self.W = tf.Variable(tf.random.normal((len(x), len(x[0][0]))))
        self.b = tf.Variable(tf.random.normal((len(y),)))
        self.W1 = tf.Variable(tf.random.normal((len(x), len(x[0][0]))))
        self.b1 = tf.Variable(tf.random.normal((len(y),)))
        self.W2 = tf.Variable(tf.random.normal((len(x), len(x[0][0]))))
        self.b2 = tf.Variable(tf.random.normal((len(y),)))

    def __call__(self, x):
        out1 = tf.multiply(x, self.W) + self.b
        out2 = tf.multiply(out1, self.W1) + self.b1
        last_layer = tf.multiply(out2, self.W2) + self.b2
        # Input_Leyer = self.W * x + self.b
        return last_layer

def loss(predicted_y, desired_y):
    return tf.reduce_sum(tf.square(predicted_y - desired_y))

optimizer = tf.optimizers.Adam(0.1)

# noinspection PyPep8Naming
def train(model, inputs, outputs):
    with tf.GradientTape() as t:
        current_loss = loss(model(inputs), outputs)
    grads = t.gradient(current_loss, [model.W, model.b, model.W1, model.b1, model.W2, model.b2])
    optimizer.apply_gradients(zip(grads, [model.W, model.b, model.W1, model.b1, model.W2, model.b2]))
    print(current_loss)

model = Model(x, y)

for i in range(5000):
    train(model, x[0][0], y[0][0])

for i in range(10000):
    train(model, x[0][1], y[0][1])

for i in range(3):
    InputX = np.array([
        [input(), input(), input(), input()],
    ])
    #returning = tf.math.multiply(InputX, model.W, name=None )
    first = tf.multiply(InputX, model.W)
    second = tf.multiply(first, model.W1)
    returning = tf.multiply(second, model.W2)
    print("I predict:", returning)
You have to feed the data mixed, i.e. alternate between the two samples instead of training on each one separately:
for i in range(5000):
    train(model, x[0][0], y[0][0])
    train(model, x[0][1], y[0][1])
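Alternatively, a sketch assuming the shapes broadcast the same way they do for the single samples above: pass both input/output pairs as one mini-batch, so every gradient step sees all of the data at once.

for i in range(5000):
    train(model, x[0], y[0])   # x[0] has shape (2, 4): both samples in one batch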
Given data in the form x, y such that y = A sin(B(x) + C) + D, identify A, B, C, and D using Tensorflow.
I have written the following code to do so, but unfortunately it does not learn. Note that here the problem is not to predict the sine curve correctly, but to identify the variables. Bonus points if it is possible to change the function's form to y = A * X_2 * sin(B(X_1) + C) + D.
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

x = np.linspace(0, 100, 1000)
A = np.random.normal(1)
B = np.random.normal(.5)
C = np.random.normal(1)
D = np.random.normal(1)
y = A*np.sin((B*x) + C) + D

x = tf.constant([x.astype('float32')])
y = tf.constant([y.astype('float32')])
class Addition(tf.Module):
    def __init__(self, inputs, name=None):
        super().__init__(name=name)
        self.b_1 = tf.Variable(tf.random.normal([inputs]), name='b1')
        self.b_2 = tf.Variable(tf.random.normal([inputs]), name='b2')

    def __call__(self, x):
        out = tf.math.multiply(x, self.b_1) + self.b_2
        return out

class Sinusoid(tf.Module):
    def __init__(self, inputs, name=None):
        super().__init__(name=name)

    def __call__(self, x):
        sine = tf.math.sin(x)
        return sine

class Sine_Model(tf.Module):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.add_1 = Addition(inputs=1)
        self.sin_1 = Sinusoid(inputs=1)
        self.add_2 = Addition(inputs=1)

    def __call__(self, x):
        x = self.add_1(x)
        x = self.sin_1(x)
        x = self.add_2(x)
        return x
model = Sine_Model(name='sine')

loss_object = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate=.1)
train_loss = tf.keras.metrics.Mean(name='train_loss')

#tf.function
def train_step(x, y):
    with tf.GradientTape() as tape:
        predictions = model(x)
        loss = loss_object(y, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
EPOCHS = 200

for epoch in range(EPOCHS):
    # Reset the metrics at the start of the next epoch
    train_loss.reset_states()

    train_step(x, y)

    template = 'Epoch {}, Loss: {}'
    #print(template.format(epoch + 1,
    #                      train_loss.result()))

y_predicted = model(x)

plt.scatter(x, y_predicted.numpy()[0])
plt.scatter(x, y, c='r')
I did see an answer to this question using scipy here. But I would like to see if it is possible to do it using TensorFlow specifically, as I am interested in modularity and would like to be able to solve the problem noted as a bonus above (y = A * X_2 * sin(B(X_1) + C) + D).
Thanks!
I am trying to write a DCGAN network using Google's JAX machine learning library. To do this, I created objects to serve as the discriminator and generator; however, as I was testing the discriminator, I got the error:
TypeError: Argument '<__main__.Discriminator object at 0x7fdfa5c6ffd0>' of type <class '__main__.Discriminator'> is not a valid JAX type
I looked through the examples on the JAX GitHub page and, from what I saw, none of them use objects, which leads me to hypothesize that it is probably just not possible to use objects with JAX. But if this is the case, I don't really understand why the use of objects wouldn't be possible. Is this something that will be implemented in the future? Am I just naively overlooking something?
Here is my Discriminator object:
class Discriminator():
    def __init__(self):
        self.step_size = 0.0001
        self.image_shape = (256, 256, 3)
        self.params = []
        num_layers = 6
        num_filters = 64
        filter_size = 4
        self.params.append(create_conv_layer(3,
                                             num_filters,
                                             filter_size,
                                             filter_size,
                                             random.PRNGKey(0)))
        for l in range(1, num_layers):
            self.params.append(create_conv_layer(64*2**(l-1),
                                                 64*2**l,
                                                 filter_size,
                                                 filter_size,
                                                 random.PRNGKey(0)))
        self.params.append(create_conv_layer(64*2**num_filters,
                                             1,
                                             filter_size,
                                             filter_size,
                                             random.PRNGKey(0)))

    def predict(self):
        activations = image
        for w, b in params[:-1]:
            outputs = conv_forward(activations, w, b, stride=2)
            outputs = batch_normalization(outputs)
            activations = leaky_relu(outputs)
        final_w, final_b = params[-1]
        return sigmoid(conv_forward(activations, final_w, final_b,))

    def batched_predict(self, images):
        shape = [None] + list(self.image_shape)
        return vmap(self.predict, in_axes=shape)(self.params, images)

    def loss(self, params, images, targets):
        preds = self.batched_predict(params, images)
        return -np.sum(preds * targets)

    def accuracy(self, images, targets):
        predicted_class = np.round(np.ravel(batched_predict(images)))
        return np.mean(predicted_class == target_class)

    #jit
    def update(self, params, x, y):
        grads = grad(self.loss)(params, x, y)
        return [(w - self.step_size * dw, b - self.step_size * db)
                for (w, b), (dw, db) in zip(params, grads)]
And I update the parameters here:
num_epochs = 5
batch_size = 64
steps_per_epoch = train_images.shape[0] // batch_size

discrim = Discriminator()
params = discrim.params

print("lets-a-go!")
for epoch in range(num_epochs):
    start_time = time.time()
    for step in range(steps_per_epoch):
        x, y = simple_data_generator(batch_size)
        params = discrim.update(params, x, y)
    epoch_time = time.time() - start_time

    train_acc = discrim.accuracy(train_images, train_labels)
    test_acc = discrim.accuracy(test_images, test_labels)
    print("Epoch {} in {:0.2f} sec".format(epoch, epoch_time))
    print("Training set accuracy {}".format(train_acc))
    print("Test set accuracy {}".format(test_acc))