I'm new to ML and have tried to use this GitHub repository to build an MNIST Machine Learning Model.
Since I have to import the dataset from my computer, I had to change things a bit. My imported dataset also doesn't include all 10 digits, but only 5.
The calculated accuracy is 96%, however when I cross check the .png files on my computer with the outcome txt, the labels make zero sense. It labels some 4's as 7's, some 2's as 5's and so on.
This is what the folder structure looks like on my computer:
2
-->001.png
-->002.png
-->003.png
-->...
3
-->001.png
-->002.png
-->003.png
-->...
4
-->001.png
-->002.png
-->003.png
-->...
5
-->001.png
-->002.png
-->003.png
-->...
7
-->001.png
-->002.png
-->003.png
-->...
Question 1:
I previously had the error that it expected 8 different categories since 7 is the highest digit label. I didn't know how to fix this, so I rename the folders from 0 to 4. Any idea how to fix this, without having to rename all folders?
Question 2:
Do you know why the outcome doesn't make any sense? It doesn't seem to be an overfitting issue, I've tried adjusting the training-test split, which didn't have any impact.
from sklearn.datasets import fetch_openml
from keras.utils.np_utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
import time
#x, y = fetch_openml('mnist_784', version=1, return_X_y=True)
import os
from os import listdir
from os.path import isfile, join
import cv2
label_folder_training = []
label_files_training = []
total_size_training = 0
total_size_testing = 0
data_path_training = r"Training_data"
data_path_testing = r"Testing_data"
for root, dirs, files in os.walk(data_path_training):
for dir in dirs:
label_folder_training.append(dir)
total_size_training += len(files)
for file in files:
label_files_training.append(file)
for root, dirs, files in os.walk(data_path_testing):
total_size_testing += len(files)
#to ignore .DS_Store file
total_size_training = total_size_training - 1
total_size_testing = total_size_testing
print("found", total_size_training, "training files and", total_size_testing, "testing files.")
print("folder Training:",label_folder_training)
# Print returns the following:
#found 20000 training files and 5000 testing files.
#folder Training: ['0', '1', '4', '3', '2']
x = []
y = []
for i in range(len(label_folder_training)):
labelPath_training = os.path.join(data_path_training,label_folder_training[i])
FileName = [f for f in listdir(labelPath_training) if isfile(join(labelPath_training, f))]
for j in range(len(FileName)):
path = os.path.join(labelPath_training,FileName[j])
img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
x.append(img)
y.append(label_folder_training[i])
x = np.array(x)
x = np.reshape(x, (20000, 784))
x = (x/255).astype('float32')
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
import pandas as pd
class Module:
def __init__(self):
self._train = True
def forward(self, input):
raise NotImplementedError
def backward(self, input, grad_output):
raise NotImplementedError
def parameters(self):
"""
Returns list of its parameters
"""
return []
def grad_parameters(self):
"""
Returns list of tensors gradients of its parameters
"""
return []
def train(self):
self._train = True
def eval(self):
self._train = False
class Criterion:
def forward(self, input, target):
raise NotImplementedError
def backward(self, input, target):
raise NotImplementedError
class Linear(Module):
def __init__(self, dim_in, dim_out):
super().__init__()
self.W = np.random.randn(dim_in, dim_out)
self.b = np.random.randn(1, dim_out)
def forward(self, input):
self.output = (np.dot(input, self.W) + self.b)
return self.output
def backward(self, input, grad_output):
self.grad_b = np.mean(grad_output, axis=0)
self.grad_W = np.dot(input.T, grad_output)
self.grad_W /= input.shape[0]
grad_input = np.dot(grad_output, self.W.T)
return grad_input
def parameters(self):
return [self.W, self.b]
def grad_parameters(self):
return [self.grad_W, self.grad_b]
def softmax(xs):
xs = np.subtract(xs, xs.max(axis=1, keepdims=True))
xs = np.exp(xs) / np.sum(np.exp(xs), axis=1, keepdims=True)
return xs
class CrossEntropy(Criterion):
def __init__(self):
super().__init__()
def forward(self, input, target):
eps = 1e-9
predictions = np.clip(input, eps, 1. - eps)
N = predictions.shape[0]
ce = -np.sum(target * np.log(predictions))
return ce / N
def backward(self, input, target):
eps = 1e-9
input_clamp = np.clip(input, eps, 1 - eps)
return softmax(input_clamp) - target
class Sequential(Module):
def __init__(self, *layers):
super().__init__()
self.layers = layers
def forward(self, input):
for layer in self.layers:
input = layer.forward(input)
self.output = input
return self.output
def backward(self, input, grad_output):
for i in range(len(self.layers) - 1, 0, -1):
grad_output = self.layers[i].backward(self.layers[i-1].output, grad_output)
grad_output = self.layers[0].backward(input, grad_output)
return grad_output
def parameters(self):
res = []
for l in self.layers:
res += l.parameters()
return res
def grad_parameters(self):
res = []
for l in self.layers:
res += l.grad_parameters()
return res
def train(self):
for layer in self.layers:
layer.train()
def eval(self):
for layer in self.layers:
layer.eval()
def sigmoid(x):
return 1 / (1 + np.exp(-x))
class Sigmoid(Module):
def __init__(self):
super().__init__()
def forward(self, input):
self.output = sigmoid(input)
return self.output
def backward(self, input, grad_output):
grad_input = sigmoid(input) * (1 - sigmoid(input)) * grad_output
return grad_input
class SoftMax(Module):
def __init__(self):
super().__init__()
def forward(self, input):
self.output = np.subtract(input, input.max(axis=1, keepdims=True))
self.output = np.exp(self.output) / np.sum(np.exp(self.output), axis=1, keepdims=True)
return self.output
def backward(self, input, grad_output):
return grad_output
def DataLoader(X, Y, batch_size=32):
n = X.shape[0]
indices = np.arange(n)
np.random.shuffle(indices)
for start in range(0, n, batch_size):
end = min(start + batch_size, n)
batch_idx = indices[start:end]
yield X[batch_idx], Y[batch_idx]
def accuracy_score(y_true, y_pred):
a = np.argmax(y_true, axis=1)
b = np.argmax(y_pred, axis=1)
return np.count_nonzero(a == b) / y_true.shape[0]
class Adam:
def __init__(self, model):
self.prev_m = None
self.prev_v = None
self.model = model
self.t = 1
def step(self, lr, beta1, beta2):
prev_m_tmp = []
prev_v_tmp = []
eps = 1e-7
for i, (weights, gradient) in enumerate(zip(self.model.parameters(), self.model.grad_parameters())):
if self.prev_m and self.prev_v:
m = beta1 * self.prev_m[i] + (1 - beta1) * gradient
v = beta2 * self.prev_v[i] + (1 - beta2) * gradient ** 2
m_hat = m / (1 - beta1 ** self.t)
v_hat = v / (1 - beta2 ** self.t)
else:
m = beta1 * 0 + (1 - beta1) * gradient
v = beta2 * 0 + (1 - beta2) * gradient ** 2
m_hat = m / (1 - beta1 ** self.t)
v_hat = v / (1 - beta2 ** self.t)
weights -= lr * m_hat / (np.sqrt(v_hat) + eps)
prev_m_tmp.append(m)
prev_v_tmp.append(v)
self.prev_m = prev_m_tmp
self.prev_v = prev_v_tmp
self.t += 1
model = Sequential(
Linear(784, 512),
Sigmoid(),
Linear(512, 256),
Sigmoid(),
Linear(256, 128),
Sigmoid(),
Linear(128, 64),
Sigmoid(),
Linear(64, 5),
SoftMax(),
)
epochs = 20
eval_every = 1
batch_size = 1024
criterion = CrossEntropy()
optimizer = Adam(model)
for epoch in range(epochs):
for x, y in DataLoader(X_train, y_train, batch_size=batch_size):
model.train()
y_pred = model.forward(x)
grad = criterion.backward(y_pred, y)
model.backward(x, grad)
optimizer.step(lr=0.003, beta1=0.9, beta2=0.999)
if (epoch + 1) % eval_every == 0:
model.eval()
y_train_pred = model.forward(X_train)
y_test_pred = model.forward(X_test)
loss_train = criterion.forward(y_train_pred, y_train)
loss_test = criterion.forward(y_test_pred, y_test)
print(f'Epoch: {epoch + 1}/{epochs}')
print(f'Train Loss: {loss_train} Train Accuracy: {accuracy_score(y_train, y_train_pred)}')
print(f'Test Loss: {loss_test} Test Accuracy: {accuracy_score(y_test, y_test_pred)} \n')
# Returns the following in epoch 20/20:
# Epoch: 20/20
# Train Loss: 0.151567557756849 Train Accuracy: 0.9905
# Test Loss: 0.706321046620394 Test Accuracy: 0.9563333333333334
test_x=[]
FileName = [f for f in listdir(data_path_testing) if isfile(join(data_path_testing, f))]
for j in range(len(FileName)):
path = os.path.join(data_path_testing,FileName[j])
img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
test_x.append(img)
x_val = np.array(test_x)
x_val = np.reshape(x_val, (5000, 784))
x_val = (x_val/255).astype('float32')
df_test = pd.DataFrame(x_val,columns=range(784)).add_prefix('pixels_')
output = model.forward(df_test)
output_arg = np.argmax(output, axis=1)
ImageId = df_test.index +1
submission = pd.DataFrame({'ImageId': ImageId, 'Label': output})
submission['ImageId'] = submission['ImageId'].apply('{:0>4}'.format)
submission.to_csv('export.txt', sep=' ', index=False, header=False)
Found the answer to my problem. The output didn't make any sense since python was importing the testing files in a random order. All I had to do was to sort FileName before letting the model run.
I changed this
test_x=[]
FileName = [f for f in listdir(data_path_testing) if isfile(join(data_path_testing, f))]
for j in range(len(FileName)):
path = os.path.join(data_path_testing,FileName[j])
img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
test_x.append(img)
to this:
test_x=[]
FileName = sorted( filter( lambda x: os.path.isfile(os.path.join(data_path_testing, x)),
os.listdir(data_path_testing) ) )
for j in range(len(FileName)):
path = os.path.join(data_path_testing,FileName[j])
img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
test_x.append(img)
Related
I have a Unet model written in TensorFlow that I would like to train on a dataset of cloud images. To do so I created a DataGenerator like this
class CloudDataGen(tf.keras.utils.Sequence):
def __init__(
self,
list_ids,
batch_size,
input_size=(512, 512),
num_channels=3,
shuffle=True,
mode=None):
self.list_ids = list_ids
self.batch_size = batch_size
self.input_size = input_size
self.num_channels = num_channels
self.shuffle = shuffle
self.mode = mode
self.on_epoch_end()
def on_epoch_end(self):
self.indexes = np.arange(len(self.list_ids))
if self.shuffle:
np.random.shuffle(self.indexes)
def __len__(self):
return int(np.floor(len(self.list_ids) / self.batch_size))
def __data_generation(self, ids):
x = np.empty((self.batch_size, *self.input_size, self.num_channels))
y = np.empty((self.batch_size, *self.input_size), dtype=int)
# Generate data
for i, ID in enumerate(ids):
if self.mode == 'train':
image_dir = TRAIN_SOURCES + TRAIN_SOURCE_NAME + ID + '/image.npy'
else:
image_dir = TEST_SOURCES + TEST_SOURCE_NAME + ID + '/image.npy'
x[i, ] = np.load(image_dir)
# Store class
if self.mode == 'train':
y[i] = np.asarray(Image.open(TRAIN_LABELS + TRAIN_LABEL_NAME + ID + '/labels.tif'))
else:
y[i] = np.asarray(Image.open(TEST_LABELS + TEST_LABEL_NAME + ID + '/labels.tif'))
return x, y
def __getitem__(self, index):
indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
list_ids_current_batch = [self.list_ids[k] for k in indexes]
# Generate data
x, y = self.__data_generation(list_ids_current_batch)
return x, y
Then to compile and train I use the following code
traingen, validationgen = get_data_gens()
unet_model = build_unet_model()
unet_model.compile(optimizer='adam',
loss='mse',
metrics=[tf.keras.metrics.MeanIoU(num_classes=2)]
)
NUM_EPOCHS = 2
model_history = unet_model.fit(traingen,
validation_data=validationgen,
epochs=NUM_EPOCHS
)
Unfortunately if I try to run the code I get this error
tensorflow.python.framework.errors_impl.InvalidArgumentError: buffer_size must be greater than zero. [Op:ShuffleDatasetV2] for which I can't find a solution.
Does anyone know how to fix it?
Hi I'm trying to train my own designed neural network on the MNIST handwritten data set and every time I run this code the accuracy starts to increase then decreases and I get an overflow warning. Can someone explain whether my code is just poor and messy or whether I have just missed something little out. Thanks in advance
import numpy as np
import pandas as pd
df = pd.read_csv('../input/digit-recognizer/train.csv')
data = np.array(df.values)
data = data.T
data
Y = data[0,:]
X = data[1:,:]
Y_train = Y[:41000]
X_train = X[:,:41000]
X_train = X_train/255
Y_val = Y[41000:]
X_val = X[:,41000:]
X_val = X_val/255
print(np.max(X_train))
class NeuralNetwork:
def __init__(self, n_in, n_out):
self.w1, self.b1 = self.Generate_Weights_Biases(10,784)
self.w2, self.b2 = self.Generate_Weights_Biases(10,10)
def Generate_Weights_Biases(self, n_in, n_out):
weights = 0.01*np.random.randn(n_in, n_out)
biases = np.zeros((n_in,1))
return weights, biases
def forward(self, X):
self.Z1 = self.w1.dot(X) + self.b1
self.a1 = self.ReLu(self.Z1)
self.z2 = self.w2.dot(self.a1) + self.b1
y_pred = self.Softmax(self.z2)
return y_pred
def ReLu(self, Z):
return np.maximum(Z,0)
def Softmax(self, Z):
#exponentials = np.exp(Z)
#sumexp = np.sum(np.exp(Z), axis=0)
#print(Z)
return np.exp(Z)/np.sum(np.exp(Z))
def ReLu_Derv(self, x):
return np.greaterthan(x, 0).astype(int)
def One_hot_encoding(self, Y):
one_hot = np.zeros((Y.size, 10))
rows = np.arange(Y.size)
one_hot[rows, Y] = 1
one_hot = one_hot.T
return one_hot
def Get_predictions(self, y_pred):
return np.argmax(y_pred, 0)
def accuracy(self, pred, Y):
return np.sum(pred == Y)/Y.size
def BackPropagation(self, X, Y, y_pred, lr=0.01):
m = Y.size
one_hot_y = self.One_hot_encoding(Y)
e2 = y_pred - one_hot_y
derW2 = (1/m)* e2.dot(self.a1.T)
derB2 =(1/m) * np.sum(e2,axis=1)
derB2 = derB2.reshape(10,1)
e1 = self.w2.T.dot(e2) * self.ReLu(self.a1)
derW1 = (1/m) * e1.dot(X.T)
derB1 = (1/m) * np.sum(e1, axis=1)
derB1 = derB1.reshape(10,1)
self.w1 = self.w1 - lr*derW1
self.b1 = self.b1 - lr*derB1
self.w2 = self.w2 - lr*derW2
self.b2 = self.b2 - lr*derB2
def train(self, X, Y, epochs = 1000):
for i in range(epochs):
y_pred = self.forward(X)
predict = self.Get_predictions(y_pred)
accuracy = self.accuracy(predict, Y)
print(accuracy)
self.BackPropagation(X, Y, y_pred)
return self.w1, self.b1, self.w2, self.b2
NN = NeuralNetwork(X_train, Y_train)
w1,b1,w2,b2 = NN.train(X_train,Y_train)
You should use a different bias for the second layer
self.z2 = self.w2.dot(self.a1) + self.b1 # not b1
self.z2 = self.w2.dot(self.a1) + self.b2 # but b2
When doing something like this
derB2 =(1/m) * np.sum(e2,axis=1)
you would like to use (keepdims = True) to make sure that derB2.shape is (something,1) but not (something, ). It makes your code more rigorous.
I tried to implement a class based convolutional neural network for face expression recognition data on kaggle using tensorflow. However, for some reason my network does not train and I keep getting the same cost and error rates at each iteration.
I tried using one hot vectors for labels, changing hyperparameters but they did not have any effect on the result.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.utils import shuffle
def get_data():
df = pd.read_csv('../large_files/fer2013/fer2013.csv')
Y = df.emotion.to_numpy()
XX = df.pixels
X = []
for i in range(len(XX)):
X.append(XX[i].split())
X = np.array(X).astype(np.float)
Z = df.Usage
train = (Z == 'Training').to_list()
test = [not i for i in train]
Xtrain = X[train].astype(np.float32)
Xtrain = Xtrain.reshape((Xtrain.shape[0], int(np.sqrt(Xtrain.shape[1])), int(np.sqrt(Xtrain.shape[1])), 1))
Xtest = X[test].astype(np.float32)
Xtest = Xtest.reshape((Xtest.shape[0], int(np.sqrt(Xtest.shape[1])), int(np.sqrt(Xtest.shape[1])), 1))
Ytrain = Y[train].astype(np.int32)
Ytest = Y[test].astype(np.int32)
return Xtrain / 255, Xtest / 255, Ytrain, Ytest
def convpool(X, W, b,poolsz):
conv_out = tf.nn.conv2d(X, W, strides = [1,1,1,1], padding = 'SAME')
conv_out = tf.nn.bias_add(conv_out, b)
pool_out = tf.nn.max_pool(conv_out, ksize=[1,poolsz,poolsz,1], strides=[1,poolsz,poolsz,1], padding = 'SAME')
return tf.nn.relu(pool_out)
def init_filter(shape):
w = np.random.rand(*shape) * np.sqrt(2 / np.prod(shape[:-1]))
return w.astype(np.float32)
def error_rate(Y,T):
return np.mean(Y != T)
class FullyConnectedLayer():
def __init__(self, M1, M2, activation = tf.nn.relu):
W = np.random.randn(M1,M2) / np.sqrt(M1 + M2)
self.W = tf.Variable(W.astype(np.float32))
b = np.zeros(M2)
self.b = tf.Variable(b.astype(np.float32))
self.activation = activation
def forward(self, X):
if self.activation == None:
return tf.matmul(X, self.W) + self.b
else:
return self.activation(tf.matmul(X, self.W) + self.b)
class ConvolutionLayer():
def __init__(self, filter_shape, b, poolsz = 2):
W = init_filter(filter_shape)
self.W = tf.Variable(W)
self.b = tf.Variable(b.astype(np.float32))
self.poolsize = poolsz
def forward(self, X):
return convpool(X, self.W, self.b, self.poolsize)
class CNN():
def __init__(self, filter_shapes, dense_layer_sizes):
self.filter_shapes = filter_shapes #List of shapes
self.dense_layer_sizes = dense_layer_sizes # List of hidden units for dense layers
def fit(self, trainset, testset, learning_rate = 0.001, momentum = 0.9, decay = 0.99, batch_sz = 200, poolsize = 2):
learning_rate = np.float32(learning_rate)
momentum = np.float32(momentum)
decay = np.float32(decay)
Xtrain = trainset[0]
Ytrain = trainset[1]
Xtest = testset[0]
Ytest = testset[1]
K = len(set(Ytrain))
# Crop Train and Test sets for divisibility to batch size
Ntrain = len(Ytrain)
Ntrain = Ntrain // batch_sz * batch_sz
Xtrain = Xtrain[:Ntrain,]
Ytrain = Ytrain[:Ntrain]
Ntest = len(Ytest)
Ntest = Ntest//batch_sz * batch_sz
Xtest = Xtest[:Ntest,]
Ytest = Ytest[:Ntest]
X_shape = Xtrain.shape
width = X_shape[1]
height = X_shape[2]
# Create Convolution Layers and Store Them
self.convolutionlayers = []
for shape in self.filter_shapes:
b = np.zeros(shape[-1], dtype = np.float32)
conv = ConvolutionLayer(shape, b, poolsz = poolsize)
self.convolutionlayers.append(conv)
# Size of both width and height is halved in each max pooling so in input size of first fully connected layer is found like this
final_filter_shape = self.filter_shapes[-1]
num_convs = len(self.convolutionlayers)
M1 = int((width/(2**num_convs)) * (height/(2**num_convs)) * final_filter_shape[-1])
# Create Fully Connected Layers and Store Them
self.vanillalayers = []
for M2 in self.dense_layer_sizes:
layer = FullyConnectedLayer(M1,M2)
self.vanillalayers.append(layer)
M1 = M2
final_layer = FullyConnectedLayer(M1, K, activation = None)
self.vanillalayers.append(final_layer)
self.AllLayers = self.convolutionlayers + self.vanillalayers
tfX = tf.placeholder(dtype=tf.float32, shape= (batch_sz, width, height, 1))
tfT = tf.placeholder(dtype=tf.int32, shape = (batch_sz,))
Yish = self.forward(tfX)
cost = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = Yish, labels=tfT))
train_op = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=decay, momentum=momentum).minimize(cost)
predict_op = self.predict(tfX)
max_epoch = 10
print_period = 20
num_batches = Ntrain // batch_sz
TestCosts = []
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
for i in range(max_epoch):
Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
for j in range(num_batches):
Xbatch = Xtrain[j * batch_sz: (j + 1)*batch_sz,]
Ybatch = Ytrain[j * batch_sz: (j + 1)*batch_sz,]
sess.run(train_op, feed_dict = {tfX : Xbatch, tfT : Ybatch})
if j % print_period == 0:
test_cost = 0
prediction = np.zeros(Ntest)
for k in range(Ntest // batch_sz):
Xtestbatch = Xtest[k*batch_sz:(k*batch_sz + batch_sz),]
Ytestbatch = Ytest[k*batch_sz:(k*batch_sz + batch_sz),]
test_cost += sess.run(cost, feed_dict={tfX: Xtestbatch, tfT: Ytestbatch})
prediction[k*batch_sz:(k*batch_sz + batch_sz)] = sess.run(
predict_op, feed_dict={tfX: Xtestbatch})
err = error_rate(prediction, Ytest)
print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
TestCosts.append(test_cost)
plt.plot(TestCosts)
plt.show()
def forward(self, X):
Z = X
count = 0
for layer in self.AllLayers:
# If next layer is fully connected layer, reshape Z
if count >= len(self.convolutionlayers):
Z_shape = Z.get_shape().as_list()
Z = tf.reshape(Z, [Z_shape[0], np.prod(Z_shape[1:])])
Z = layer.forward(Z)
count += 1
return Z
def predict(self, X):
out = self.forward(X)
return tf.math.argmax(out, axis = 1)
def main():
Xtrain, Xtest, Ytrain, Ytest = get_data()
trainset = [Xtrain, Ytrain]
testset = [Xtest, Ytest]
filtershapes = [(5,5,1,10), (5,5,10,20), (5,5,20,40)]
fullylayers = [500,500]
cnn = CNN(filtershapes, fullylayers)
cnn.fit(trainset, testset)
if __name__ == '__main__':
main()
I'm following the guide to Transformers and the colab project https://colab.research.google.com/drive/1XBP0Zh8K4g_n0A2p1UlGFf3dij0EX_Kt
but when I run the cell with the line multi_head = build_model() I get the error.
this is the output from the console:
NameError Traceback (most recent call
last) in ()
----> 1 multi_head = build_model()
5 frames in (x)
40 self.dropout = Dropout(attn_dropout)
41 def call(self, q, k, v, mask):
---> 42 attn = Lambda(lambda x:K.batch_dot(x[0],x[1],axes=[2,2])/self.temper)([q, k])
43 if mask is not None:
44 mmask = Lambda(lambda x:(-1e+10)*(1-x))(mask)
NameError: name 'K' is not defined
It just runs after the model architecture code, which the error refers to.
Can you see where this Kshould be defined?
import random, os, sys
import numpy as np
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.initializers import *
import tensorflow as tf
from tensorflow.python.keras.layers import Layer
try:
from dataloader import TokenList, pad_to_longest
# for transformer
except: pass
embed_size = 60
class LayerNormalization(Layer):
def __init__(self, eps=1e-6, **kwargs):
self.eps = eps
super(LayerNormalization, self).__init__(**kwargs)
def build(self, input_shape):
self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:],
initializer=Ones(), trainable=True)
self.beta = self.add_weight(name='beta', shape=input_shape[-1:],
initializer=Zeros(), trainable=True)
super(LayerNormalization, self).build(input_shape)
def call(self, x):
mean = K.mean(x, axis=-1, keepdims=True)
std = K.std(x, axis=-1, keepdims=True)
return self.gamma * (x - mean) / (std + self.eps) + self.beta
def compute_output_shape(self, input_shape):
return input_shape
class ScaledDotProductAttention():
def __init__(self, d_model, attn_dropout=0.1):
self.temper = np.sqrt(d_model)
self.dropout = Dropout(attn_dropout)
def __call__(self, q, k, v, mask):
attn = Lambda(lambda x:K.batch_dot(x[0],x[1],axes=[2,2])/self.temper)([q, k])
if mask is not None:
mmask = Lambda(lambda x:(-1e+10)*(1-x))(mask)
attn = Add()([attn, mmask])
attn = Activation('softmax')(attn)
attn = self.dropout(attn)
output = Lambda(lambda x:K.batch_dot(x[0], x[1]))([attn, v])
return output, attn
class MultiHeadAttention():
# mode 0 - big martixes, faster; mode 1 - more clear implementation
def __init__(self, n_head, d_model, d_k, d_v, dropout, mode=0, use_norm=True):
self.mode = mode
self.n_head = n_head
self.d_k = d_k
self.d_v = d_v
self.dropout = dropout
if mode == 0:
self.qs_layer = Dense(n_head*d_k, use_bias=False)
self.ks_layer = Dense(n_head*d_k, use_bias=False)
self.vs_layer = Dense(n_head*d_v, use_bias=False)
elif mode == 1:
self.qs_layers = []
self.ks_layers = []
self.vs_layers = []
for _ in range(n_head):
self.qs_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
self.ks_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
self.vs_layers.append(TimeDistributed(Dense(d_v, use_bias=False)))
self.attention = ScaledDotProductAttention(d_model)
self.layer_norm = LayerNormalization() if use_norm else None
self.w_o = TimeDistributed(Dense(d_model))
def __call__(self, q, k, v, mask=None):
d_k, d_v = self.d_k, self.d_v
n_head = self.n_head
if self.mode == 0:
qs = self.qs_layer(q) # [batch_size, len_q, n_head*d_k]
ks = self.ks_layer(k)
vs = self.vs_layer(v)
def reshape1(x):
s = tf.shape(x) # [batch_size, len_q, n_head * d_k]
x = tf.reshape(x, [s[0], s[1], n_head, d_k])
x = tf.transpose(x, [2, 0, 1, 3])
x = tf.reshape(x, [-1, s[1], d_k]) # [n_head * batch_size, len_q, d_k]
return x
qs = Lambda(reshape1)(qs)
ks = Lambda(reshape1)(ks)
vs = Lambda(reshape1)(vs)
if mask is not None:
mask = Lambda(lambda x:K.repeat_elements(x, n_head, 0))(mask)
head, attn = self.attention(qs, ks, vs, mask=mask)
def reshape2(x):
s = tf.shape(x) # [n_head * batch_size, len_v, d_v]
x = tf.reshape(x, [n_head, -1, s[1], s[2]])
x = tf.transpose(x, [1, 2, 0, 3])
x = tf.reshape(x, [-1, s[1], n_head*d_v]) # [batch_size, len_v, n_head * d_v]
return x
head = Lambda(reshape2)(head)
elif self.mode == 1:
heads = []; attns = []
for i in range(n_head):
qs = self.qs_layers[i](q)
ks = self.ks_layers[i](k)
vs = self.vs_layers[i](v)
head, attn = self.attention(qs, ks, vs, mask)
heads.append(head); attns.append(attn)
head = Concatenate()(heads) if n_head > 1 else heads[0]
attn = Concatenate()(attns) if n_head > 1 else attns[0]
outputs = self.w_o(head)
outputs = Dropout(self.dropout)(outputs)
if not self.layer_norm: return outputs, attn
# outputs = Add()([outputs, q]) # sl: fix
return self.layer_norm(outputs), attn
class PositionwiseFeedForward():
def __init__(self, d_hid, d_inner_hid, dropout=0.1):
self.w_1 = Conv1D(d_inner_hid, 1, activation='relu')
self.w_2 = Conv1D(d_hid, 1)
self.layer_norm = LayerNormalization()
self.dropout = Dropout(dropout)
def __call__(self, x):
output = self.w_1(x)
output = self.w_2(output)
output = self.dropout(output)
output = Add()([output, x])
return self.layer_norm(output)
class EncoderLayer():
def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1):
self.self_att_layer = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
self.pos_ffn_layer = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout)
def __call__(self, enc_input, mask=None):
output, slf_attn = self.self_att_layer(enc_input, enc_input, enc_input, mask=mask)
output = self.pos_ffn_layer(output)
return output, slf_attn
def GetPosEncodingMatrix(max_len, d_emb):
pos_enc = np.array([
[pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)]
if pos != 0 else np.zeros(d_emb)
for pos in range(max_len)
])
pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2]) # dim 2i
pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2]) # dim 2i+1
return pos_enc
def GetPadMask(q, k):
ones = K.expand_dims(K.ones_like(q, 'float32'), -1)
mask = K.cast(K.expand_dims(K.not_equal(k, 0), 1), 'float32')
mask = K.batch_dot(ones, mask, axes=[2,1])
return mask
def GetSubMask(s):
len_s = tf.shape(s)[1]
bs = tf.shape(s)[:1]
mask = K.cumsum(tf.eye(len_s, batch_shape=bs), 1)
return mask
class Transformer():
def __init__(self, len_limit, embedding_matrix, d_model=embed_size, \
d_inner_hid=512, n_head=10, d_k=64, d_v=64, layers=2, dropout=0.1, \
share_word_emb=False, **kwargs):
self.name = 'Transformer'
self.len_limit = len_limit
self.src_loc_info = False # True # sl: fix later
self.d_model = d_model
self.decode_model = None
d_emb = d_model
pos_emb = Embedding(len_limit, d_emb, trainable=False, \
weights=[GetPosEncodingMatrix(len_limit, d_emb)])
i_word_emb = Embedding(max_features, d_emb, weights=[embedding_matrix]) # Add Kaggle provided embedding here
self.encoder = Encoder(d_model, d_inner_hid, n_head, d_k, d_v, layers, dropout, \
word_emb=i_word_emb, pos_emb=pos_emb)
def get_pos_seq(self, x):
mask = K.cast(K.not_equal(x, 0), 'int32')
pos = K.cumsum(K.ones_like(x, 'int32'), 1)
return pos * mask
def compile(self, active_layers=999):
src_seq_input = Input(shape=(None, ))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(src_seq_input)
# LSTM before attention layers
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x, slf_attn = MultiHeadAttention(n_head=3, d_model=300, d_k=64, d_v=64, dropout=0.1)(x, x, x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
conc = concatenate([avg_pool, max_pool])
conc = Dense(64, activation="relu")(conc)
x = Dense(1, activation="sigmoid")(conc)
self.model = Model(inputs=src_seq_input, outputs=x)
self.model.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics=['accuracy'])
If you look at where K is being used you will see:
K.expand_dims
K.cumsum
K.batch_dot
These are Keras backend functions. The code is missing a from keras import backend as K, which I think is a standard abbreviation.
Here is the code. I think the class mylstm has problem but I can not find it... The input is simple, which is just 7 columns data.
I tried to print out all the tensors but did not find what was wrong. Thanks for help!
class mylstm(nn.Module):
def __init__(self, input_size, hidden_size, T, logger):
super(mylstm, self).__init__()
self.T = T
self.input_size = input_size
self.hidden_size = hidden_size
self.logger = logger
self.lstm_layer = nn.LSTM(input_size = 7, hidden_size = hidden_size)
self.fc = nn.Linear(hidden_size, 1)
#self.fc.weight.data.normal_()
def forward(self, input_data):
hidden = self.init_hidden(input_data)
cell = self.init_hidden(input_data)
for t in range(self.T - 1):
if t < self.T - 1:
self.lstm_layer.flatten_parameters()
_, lstm_output = self.lstm_layer(input_data[:,t,:].unsqueeze(0), (hidden, cell))
hidden = lstm_output[0]
cell = lstm_output[1]
y_pred = self.fc(hidden[0])
return y_pred
def init_hidden(self, x):
return Variable(x.data.new(1, x.size(0), self.hidden_size).zero_())
# Train the model
class rnn:
def __init__(self, file_data, logger, input_size = 7, hidden_size = 64, T = 10,
learning_rate = 0.01, batch_size = 128, parallel = True, debug = False):
self.T = T
dat = pd.read_csv(file_data, nrows = 100 if debug else None)
self.logger = logger
self.logger.info("Shape of data: %s.\nMissing in data: %s.", dat.shape, dat.isnull().sum().sum())
self.X = dat.loc[:, [x for x in dat.columns.tolist()]].values
self.y = np.array(dat.rtm_spp)
self.batch_size = batch_size
self.lstm1 = mylstm(input_size = input_size,
hidden_size = hidden_size,
T = T, logger = logger)
if parallel:
self.lstm1 = nn.DataParallel(self.lstm1)
self.lstm1_optimizer = optim.Adam(params = filter(lambda p: p.requires_grad, self.lstm1.parameters()),
lr = learning_rate)
self.train_size = 20000
self.y = self.y - np.mean(self.y[:self.train_size]) # Question: why Adam requires data to be normalized?
self.logger.info("Training size: %d.", self.train_size)
def train(self, n_epochs = 10):
iter_per_epoch = int(np.ceil(self.train_size * 1. / self.batch_size))
logger.info("Iterations per epoch: %3.3f ~ %d.", self.train_size * 1. / self.batch_size, iter_per_epoch)
self.iter_losses = np.zeros(n_epochs * iter_per_epoch)
self.epoch_losses = np.zeros(n_epochs)
self.loss_func = nn.MSELoss()
n_iter = 0
learning_rate = 1.
for i in range(n_epochs):
perm_idx = np.random.permutation(self.train_size - self.T-1)
j = 0
while j < self.train_size:
batch_idx = perm_idx[j:(j + self.batch_size)]
X = np.zeros((len(batch_idx), self.T - 1, self.X.shape[1]))
#y_history = np.zeros((len(batch_idx), self.T - 1))
y_target = self.y[batch_idx + self.T]
for k in range(len(batch_idx)):
X[k, :, :] = self.X[batch_idx[k] : (batch_idx[k] + self.T - 1), :]
loss = self.train_iteration(X, y_target)
self.iter_losses[i * iter_per_epoch + j // self.batch_size] = loss
#if (j / self.batch_size) % 50 == 0:
j += self.batch_size
n_iter += 1
if n_iter % 10000 == 0 and n_iter > 0:
for param_group in self.lstm1_optimizer.param_groups:
param_group['lr'] = param_group['lr'] * 0.9
self.epoch_losses[i] = np.mean(self.iter_losses[range(i * iter_per_epoch, (i + 1) * iter_per_epoch)])
if i % 10 == 0:
self.logger.info("Epoch %d, loss: %3.3f.", i, self.epoch_losses[i])
y_train_pred = self.predict(on_train = True)
y_test_pred = self.predict(on_train = False)
def train_iteration(self, X,y_target):
self.lstm1_optimizer.zero_grad()
y_pred = self.lstm1(Variable(torch.from_numpy(X).type(torch.FloatTensor)))
y_true = Variable(torch.from_numpy(y_target).type(torch.FloatTensor))
y_true = y_true.view(y_true.shape[0],1)
y_pred=y_pred.squeeze(0)
print(y_pred)
loss = self.loss_func(y_pred, y_true)
loss.backward()
self.lstm1_optimizer.step()
return loss.data[0]
def predict(self, on_train = False):
if on_train:
y_pred = np.zeros(self.train_size - self.T +1)
else:
y_pred = np.zeros(self.X.shape[0] - self.train_size)
i = 0
while i < len(y_pred):
batch_idx = np.array(range(len(y_pred)))[i : (i + self.batch_size)]
X = np.zeros((len(batch_idx), self.T - 1, self.X.shape[1]))
#y_history = np.zeros((len(batch_idx), self.T - 1))
for j in range(len(batch_idx)):
if on_train:
X[j, :, :] = self.X[range(batch_idx[j], batch_idx[j] + self.T - 1), :]
else:
X[j, :, :] = self.X[range(batch_idx[j] + self.train_size - self.T, batch_idx[j] + self.train_size - 1), :]
input_data = Variable(torch.from_numpy(X).type(torch.FloatTensor))
# print(self.lstm1(torch.randn(128,9,7)))
#print(self.lstm1(X).data.numpy())
y_pred[i:(i + self.batch_size)] = self.lstm1(input_data).data.numpy()[:,0]
i += self.batch_size
return y_pred
model = rnn(file_data = 'L.csv', logger = logger, parallel = False,
learning_rate = .001)
model.train(n_epochs = 1000)
y_pred = model.predict()
It might be good if you can reduce your codes into the simplest form which still reproduce your problem. Asking people to debug over 200 lines of codes may be too big an ask. If you can give a small example of your problem, using a very simple NN model instead of the current one, many others will be willing to look into your codes and help identify the issue.