When creating custom loss and metric functions in a Keras model, both are assumed to take inputs of the form (y_true, y_pred):

def custom_loss(y_true, y_pred):
    ...
    return loss

def custom_metric(y_true, y_pred):
    ...
    return metric

And the y_pred input is the output of the model. Example:

model = Model(inputs=[input1, ..., inputN], outputs=loss)
model.compile(loss=custom_loss, metrics=[custom_metric])

In the case above, y_pred will be the loss for both the loss and the metric. What if I want a different input for custom_loss and a different one for custom_metric? Is there a way to do it?
Edit:
More specifically, I want my loss to be:

def warp_loss(X):
    z, positive_entity, negatives_entities = X
    positiveSim = Lambda(lambda x: similarity(x[0], x[1]), output_shape=(1,), name="positive_sim")([z, positive_entity])
    z_reshaped = Reshape((1, z.shape[1].value))(z)
    negativeSim = Lambda(lambda x: similarity(x[0], x[1]), output_shape=(negatives_titles.shape[1].value, 1,), name="negative_sim")([z_reshaped, negatives_entities])
    loss = Lambda(lambda x: max_margin_loss(x[0], x[1]), output_shape=(1,), name="max_margin")([positiveSim, negativeSim])
    return loss

def mean_loss(y_true, y_pred):
    return K.mean(y_pred - 0 * y_true)
and the metric:

def metric(X):
    z, positive_entity, negatives_entities = X
    positiveSim = Lambda(lambda x: similarity(x[0], x[1]), output_shape=(1,), name="positive_sim")([z, positive_entity])
    z_reshaped = Reshape((1, z.shape[1].value))(z)
    negativeSim = Lambda(lambda x: similarity(x[0], x[1]), output_shape=(negatives_titles.shape[1].value, 1,), name="negative_sim")([z_reshaped, negatives_entities])
    position = K.sum(K.cast(K.greater(positiveSim, negativeSim), dtype="int32"), axis=1, keepdims=True)
    accuracy = Lambda(lambda x: x / _NUMBER_OF_NEGATIVE_EXAMPLES)(position)
    return accuracy

def mean_acc(y_true, y_pred):
    return K.mean(y_pred - 0 * y_true)
The first four lines of the two functions are the same, and they diverge after that. Would it be possible to use a Callback to print mean_acc?
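For instance, would something along the lines of the sketch below work? (I'm assuming here that the compiled metric shows up in the logs dict under the name mean_acc.)

from keras.callbacks import Callback

class PrintMeanAcc(Callback):
    # Hypothetical callback: read the compiled metric from the logs at the end of each epoch.
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        if 'mean_acc' in logs:
            print("epoch %d: mean_acc = %.4f" % (epoch, logs['mean_acc']))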
You don't need the loss to be part of your model; you can make the model output its own outputs and apply the loss afterwards.
Here is working code (it could be optimized to avoid repeating operations in both the metric and the loss by adding the common part to the model).
I had some issues with your shapes, so I used arbitrary shapes instead; your original lines are left commented.
This code works for Keras 2.0.8 with TensorFlow 1.3.0. I suspect you're using Theano, right?
from keras.layers import *
from keras.models import *
import keras.backend as K

def get_divisor(x):
    return K.sqrt(K.sum(K.square(x), axis=-1))

def similarity(a, b):
    numerator = K.sum(a * b, axis=-1)
    denominator = get_divisor(a) * get_divisor(b)
    denominator = K.maximum(denominator, K.epsilon())
    return numerator / denominator

def max_margin_loss(positive, negative):
    #loss_matrix = K.maximum(0.0, 1.0 + negative - Reshape((1,))(positive))
    loss_matrix = K.maximum(0.0, 1.0 + negative - positive)
    loss = K.sum(loss_matrix, axis=-1, keepdims=True)
    return loss
def warp_loss(X):
    z = X[0]
    positive_entity = X[1]
    negative_entities = X[2]
    positiveSim = similarity(z, positive_entity)
    #z_reshaped = Reshape((1, z.shape[1].value))(z)
    z_reshaped = K.expand_dims(z, axis=1)
    negativeSim = similarity(z_reshaped, negative_entities)
    #negativeSim = Reshape((negatives_titles.shape[1].value, 1,))
    negativeSim = K.expand_dims(negativeSim, axis=-1)
    loss = max_margin_loss(positiveSim, negativeSim)
    return loss

def warp_metricsX(X):
    z = X[0]
    positive_entity = X[1]
    negative_entities = X[2]
    positiveSim = similarity(z, positive_entity)
    #z_reshaped = Reshape((1, z.shape[1].value))(z)
    z_reshaped = K.expand_dims(z, axis=1)
    negativeSim = similarity(z_reshaped, negative_entities)
    #Reshape((negatives_titles.shape[1].value, 1,))
    negativeSim = K.expand_dims(negativeSim, axis=-1)
    position = K.sum(K.cast(K.greater(positiveSim, negativeSim), dtype="int32"), axis=1, keepdims=True)
    #accuracy = position / _NUMBER_OF_NEGATIVE_EXAMPLES
    accuracy = position / 30
    return accuracy

def mean_loss(yTrue, yPred):
    return K.mean(warp_loss(yPred))

def warp_metrics(yTrue, yPred):
    return warp_metricsX(yPred)
def build_nn_model():
    #wl, tl = load_vector_lookups()
    #embedded_layer_1 = initialize_embedding_matrix(wl)
    #embedded_layer_2 = initialize_embedding_matrix(tl)
    embedded_layer_1 = Embedding(200, 25)
    embedded_layer_2 = Embedding(200, 25)

    #sequence_input_1 = Input(shape=(_NUMBER_OF_LENGTH,), dtype='int32', name="text")
    sequence_input_1 = Input(shape=(30,), dtype='int32', name="text")
    sequence_input_positive = Input(shape=(1,), dtype='int32', name="positive")
    sequence_input_negatives = Input(shape=(10,), dtype='int32', name="negatives")

    embedded_sequences_1 = embedded_layer_1(sequence_input_1)
    #embedded_sequences_positive = Reshape((tl.shape[1],))(embedded_layer_2(sequence_input_positive))
    embedded_sequences_positive = Reshape((25,))(embedded_layer_2(sequence_input_positive))
    embedded_sequences_negatives = embedded_layer_2(sequence_input_negatives)

    conv_step1 = Convolution1D(
        filters=1000,
        kernel_size=5,
        activation="tanh",
        name="conv_layer_mp",
        padding="valid")(embedded_sequences_1)
    conv_step2 = GlobalMaxPooling1D(name="max_pool_mp")(conv_step1)
    conv_step3 = Activation("tanh")(conv_step2)
    conv_step4 = Dropout(0.2, name="dropout_mp")(conv_step3)
    #z = Dense(wl.shape[1], name="predicted_vec")(conv_step4)  # activation="linear"
    z = Dense(25, name="predicted_vec")(conv_step4)  # activation="linear"

    model = Model(
        inputs=[sequence_input_1, sequence_input_positive, sequence_input_negatives],
        outputs=[z, embedded_sequences_positive, embedded_sequences_negatives]
    )
    model.compile(loss=mean_loss, optimizer='adam', metrics=[warp_metrics])
    return model
Hi, I've been working on a neural network to tackle the MNIST dataset, but when I run the code the accuracy begins to increase and then eventually ends up at 0.098. I also encounter an overflow error in exp when calculating the softmax values. I have tried to debug my code but I don't understand where I'm going wrong. If anyone can point me in the right direction that would be great, and if you can't find an error, any tips on debugging techniques would be appreciated. Thanks in advance.
import numpy as np
import pandas as pd
df = pd.read_csv('../input/digit-recognizer/train.csv')
data = np.array(df.values)
data = data.T
data
Y = data[0,:]
X = data[1:,:]
Y_train = Y[:41000]
X_train = X[:,:41000]
X_train = X_train/255
Y_val = Y[41000:]
X_val = X[:,41000:]
X_val = X_val/255
print(np.max(X_train))
class NeuralNetwork:
    def __init__(self, n_in, n_out):
        self.w1, self.b1 = self.Generate_Weights_Biases(10, 784)
        self.w2, self.b2 = self.Generate_Weights_Biases(10, 10)
    def Generate_Weights_Biases(self, n_in, n_out):
        weights = 0.01*np.random.randn(n_in, n_out)
        biases = np.zeros((n_in, 1))
        return weights, biases
    def forward(self, X):
        self.Z1 = self.w1.dot(X) + self.b1
        self.a1 = self.ReLu(self.Z1)
        self.z2 = self.w2.dot(self.a1) + self.b2
        y_pred = self.Softmax(self.z2)
        return y_pred
    def ReLu(self, Z):
        return np.maximum(Z, 0)
    def Softmax(self, Z):
        #exponentials = np.exp(Z)
        #sumexp = np.sum(np.exp(Z), axis=0)
        #print(Z)
        return np.exp(Z)/np.sum(np.exp(Z))
    def ReLu_Derv(self, x):
        return np.greaterthan(x, 0).astype(int)
    def One_hot_encoding(self, Y):
        one_hot = np.zeros((Y.size, 10))
        rows = np.arange(Y.size)
        one_hot[rows, Y] = 1
        one_hot = one_hot.T
        return one_hot
    def Get_predictions(self, y_pred):
        return np.argmax(y_pred, 0)
    def accuracy(self, pred, Y):
        return np.sum(pred == Y)/Y.size
    def BackPropagation(self, X, Y, y_pred, lr=0.01):
        m = Y.size
        one_hot_y = self.One_hot_encoding(Y)
        e2 = y_pred - one_hot_y
        derW2 = (1/m) * e2.dot(self.a1.T)
        derB2 = (1/m) * e2
        #derB2 = derB2.reshape(10,1)
        e1 = self.w2.T.dot(e2) * self.ReLu(self.a1)
        derW1 = (1/m) * e1.dot(X.T)
        derB1 = (1/m) * e1
        #derB1 = derB1.reshape(10,1)
        self.w1 = self.w1 - lr*derW1
        self.b1 = self.b1 - lr*np.sum(derB1, axis=1, keepdims=True)
        self.w2 = self.w2 - lr*derW2
        self.b2 = self.b2 - lr*np.sum(derB2, axis=1, keepdims=True)
    def train(self, X, Y, epochs=1000):
        for i in range(epochs):
            y_pred = self.forward(X)
            predict = self.Get_predictions(y_pred)
            accuracy = self.accuracy(predict, Y)
            print(accuracy)
            self.BackPropagation(X, Y, y_pred)
        return self.w1, self.b1, self.w2, self.b2

NN = NeuralNetwork(X_train, Y_train)
w1, b1, w2, b2 = NN.train(X_train, Y_train)
I found the following errors:
Your softmax implementation doesn't work because of the severe numeric errors you get when exponentiating potentially large numbers to obtain something between 0 and 1. Besides that, you forgot to specify the summation axis in the denominator. Here is a working implementation:
def Softmax(self, Z):
    e = np.exp(Z - Z.max(axis=0, keepdims=True))
    return e/e.sum(axis=0, keepdims=True)
(Here and below I skip coding-style remarks that are not essential in this context, such as whether this should be a class method or a stand-alone function, etc.)
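To see why subtracting the maximum matters, here is a quick illustration (not part of the fix, just a sanity check with made-up logits):

import numpy as np

Z = np.array([[1000.0], [1001.0]])                 # large logits in one column
naive = np.exp(Z) / np.exp(Z).sum(axis=0)          # exp(1000) overflows to inf, so inf/inf gives nan
shifted = np.exp(Z - Z.max(axis=0)) / np.exp(Z - Z.max(axis=0)).sum(axis=0)
print(naive.ravel())                               # [nan nan]
print(shifted.ravel())                             # [0.26894142 0.73105858]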
Your ReLu derivative implementation doesn't work for me at all. Maybe I have a different NumPy version. This one works:
def ReLu_Derv(self, x):
    return (x > 0).astype(int)
You need to actually use this implementation in BackPropagation:
e1 = self.w2.T.dot(e2) * self.ReLu_Derv(self.a1)
With these amendments, I managed to reach 91.0% accuracy after 100 iterations with LR=0.1. I loaded MNIST from Keras with this code:
import tensorflow as tf

(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()
X = train_images.reshape(-1, 28*28).T
Y = train_labels
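From there, to match the rest of your pipeline (samples in columns, pixel values scaled to [0, 1]), something along these lines should slot in (a sketch, not tested against your exact script):

X = X / 255.0
X_train, X_val = X[:, :41000], X[:, 41000:]
Y_train, Y_val = Y[:41000], Y[41000:]

NN = NeuralNetwork(784, 10)   # the constructor arguments are ignored by your current __init__
w1, b1, w2, b2 = NN.train(X_train, Y_train, epochs=100)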
I'm working on signal compression and reconstruction with a VAE. I've trained on 1600 fragments, but the values of the 1600 reconstructed signals are all very similar. Moreover, results from the same batch are almost identical. Since I'm using a VAE, the loss function of the model contains binary cross entropy (BCE), and the output of the trained model should lie between 0 and 1 (the input data is also normalized to 0~1).
VAE model (LSTM):
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTM_VAE(nn.Module):
    def __init__(self,
                 input_size=3000,
                 hidden=[1024, 512, 256, 128, 64],
                 latent_size=64,
                 num_layers=8,
                 bidirectional=True):
        super().__init__()
        self.input_size = input_size
        self.hidden = hidden
        self.latent_size = latent_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.actv = nn.LeakyReLU()
        self.encode = nn.LSTM(input_size=self.input_size,
                              hidden_size=self.hidden[0],
                              num_layers=self.num_layers,
                              batch_first=True,
                              bidirectional=True)
        self.bn_encode = nn.BatchNorm1d(1)
        self.decode = nn.LSTM(input_size=self.latent_size,
                              hidden_size=self.hidden[2],
                              num_layers=self.num_layers,
                              batch_first=True,
                              bidirectional=True)
        self.bn_decode = nn.BatchNorm1d(1)
        self.fc1 = nn.Linear(self.hidden[0]*2, self.hidden[1])
        self.fc2 = nn.Linear(self.hidden[1], self.hidden[2])
        self.fc31 = nn.Linear(self.hidden[2], self.latent_size)
        self.fc32 = nn.Linear(self.hidden[2], self.latent_size)
        self.bn1 = nn.BatchNorm1d(1)
        self.bn2 = nn.BatchNorm1d(1)
        self.bn3 = nn.BatchNorm1d(1)
        self.fc4 = nn.Linear(self.hidden[2]*2, self.hidden[1])
        self.fc5 = nn.Linear(self.hidden[1], self.hidden[0])
        self.fc6 = nn.Linear(self.hidden[0], self.input_size)
        self.bn4 = nn.BatchNorm1d(1)
        self.bn5 = nn.BatchNorm1d(1)
        self.bn6 = nn.BatchNorm1d(1)

    def encoder(self, x):
        x = torch.unsqueeze(x, 1)
        x, _ = self.encode(x)
        x = self.actv(x)
        x = self.fc1(x)
        x = self.actv(x)
        x = self.fc2(x)
        x = self.actv(x)
        mu = self.fc31(x)
        log_var = self.fc32(x)
        return mu, log_var

    def decoder(self, z):
        z, _ = self.decode(z)
        z = self.bn_decode(z)
        z = self.actv(z)
        z = self.fc4(z)
        z = self.bn4(z)
        z = self.fc5(z)
        z = self.bn5(z)
        z = self.fc6(z)
        z = self.bn6(z)
        z = torch.sigmoid(z)
        return torch.squeeze(z)

    def sampling(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        mu, log_var = self.encoder(x.view(-1, self.input_size))
        z = self.sampling(mu, log_var)
        z = self.decoder(z)
        return z, mu, log_var
Loss function and training code:

def lossF(recon_x, x, mu, logvar, input_size):
    BCE = F.binary_cross_entropy(recon_x, x.view(-1, input_size), reduction='sum')
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE + KLD

optim = torch.optim.Adam(model.parameters(), lr=opt.lr)

for epoch in range(opt.epoch):
    for batch_idx, data in enumerate(train_set):
        data = data.to(device)
        optim.zero_grad()
        recon_x, mu, logvar = model(data)
        loss = lossF(recon_x, data, mu, logvar, opt.input_size)
        loss.backward()
        train_loss += loss.item()
        optim.step()
I built the code by referring to others' example code and only changed a few parameters. I have rebuilt the code, changed the dataset, and updated parameters, but nothing has worked. If you have any suggestion to solve this problem, PLEASE let me know.
I've found out the reason for the issue. It turns out that the decoder produces output values in the range of 0.4 to 0.6 to stabilize the BCE loss. BCE loss can't be 0 even if the prediction exactly matches the target, and the loss value is non-linear with respect to the range of the output. The easiest way to lower the loss is to output 0.5 everywhere, and that is what my model did.
To avoid this, I standardized my data and added some outlier data to work around the BCE issue. VAE is such a complicated network, for sure.
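For what it's worth, a quick illustration of the first point: binary cross entropy stays positive even for a perfect reconstruction whenever the targets lie strictly between 0 and 1 (toy values, just for illustration):

import torch
import torch.nn.functional as F

x = torch.tensor([0.3, 0.5, 0.7])
# The reconstruction equals the target exactly, yet BCE is not zero.
print(F.binary_cross_entropy(x, x, reduction='sum').item())  # about 1.91, not 0.0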
I am trying to build an implicit quantile network. I built a custom loss function but cannot get it working. I get the error 'No gradients provided', but I believe I only use functions that should provide gradients, like tf.tile and such. I don't explicitly cast anything in my loss_kv_iq() function.
Below I provide the code for my custom layer (IQNlayer), the network I use (IQN), and my custom loss function, plus a small piece of code in the main that should reproduce the error.
TF version: 2.1.0
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np

class IQN(keras.Model):
    def __init__(self, quantile_dims, fc_dims, n_actions, n_quantiles):
        super(IQN, self).__init__()
        self.n_quantiles = n_quantiles
        initializer = keras.initializers.he_uniform()
        self.iq = IQNlayer(quantile_dims, n_quantiles)
        self.dense = keras.layers.Dense(fc_dims, activation='relu', kernel_initializer=initializer)
        self.out = keras.layers.Dense(n_actions, activation=None)

    def call(self, state, tau):
        batch_size, state_size = state.shape
        x = self.iq(state, tau)
        x = self.dense(x)
        x = self.out(x)
        x = tf.transpose(tf.split(x, batch_size, axis=0), perm=[0, 2, 1])
        return x

class IQNlayer(keras.layers.Layer):
    def __init__(self, quantile_dims, n_quantiles):
        super(IQNlayer, self).__init__()
        self.quantile_dims = quantile_dims
        self.n_quantiles = n_quantiles
        self.fc1 = keras.layers.Dense(self.quantile_dims, activation=tf.nn.selu)
        self.fc2 = keras.layers.Dense(self.quantile_dims, activation=tf.nn.relu)

    def call(self, state, tau):
        batch_size, state_size = state.shape
        state_tile = tf.tile(state, [1, self.n_quantiles])
        state_reshape = tf.reshape(state_tile, [-1, state_size])
        state_net = self.fc1(state_reshape)
        tau = tf.reshape(tau, [-1, 1])
        pi_mtx = tf.constant(np.expand_dims(np.pi * np.arange(0, 64), axis=0), dtype=tf.float32)
        cos_tau = tf.cos(tf.matmul(tau, pi_mtx))
        phi = self.fc2(cos_tau)
        net = tf.multiply(state_net, phi)
        return net

def loss_kv_iq(x, tau, action_hot, theta_target):
    expand_dim_action = tf.expand_dims(action_hot, -1)
    main_support = tf.reduce_sum(x * expand_dim_action, axis=1)
    theta_loss_tile = tf.tile(tf.expand_dims(main_support, axis=2), [1, 1, N_QUANTILES])
    logit_valid_tile = tf.tile(tf.expand_dims(theta_target, axis=1), [1, N_QUANTILES, 1])
    Huber_loss = hloss(logit_valid_tile, theta_loss_tile)
    inv_tau = 1 - tau
    tau = tf.tile(tf.expand_dims(tau, axis=1), [1, N_QUANTILES, 1])
    inv_tau = tf.tile(tf.expand_dims(inv_tau, axis=1), [1, N_QUANTILES, 1])
    error_loss = logit_valid_tile - theta_loss_tile
    Loss = tf.where(tf.less(error_loss, 0.0), inv_tau * Huber_loss, tau * Huber_loss)
    loss = tf.reduce_mean(tf.reduce_sum(tf.reduce_mean(Loss, axis=2), axis=1))
    return loss
if __name__ == '__main__':
    hloss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.NONE)
    N_QUANTILES = 10
    BATCH_SIZE = 2
    ACTION_SIZE = 5
    STATE_SIZE = 16

    # FOR EXAMPLE: RANDOM BATCH
    cs = np.random.rand(BATCH_SIZE, STATE_SIZE)
    a = np.random.randint(0, 5, size=(2))
    r = np.random.randint(0, 500, size=(2))
    ns = np.random.rand(BATCH_SIZE, STATE_SIZE)
    tau = np.random.uniform(size=(BATCH_SIZE, N_QUANTILES))
    tau = tau.astype('float32')

    iq = IQN(128, 128, ACTION_SIZE, N_QUANTILES)
    action_hot = np.zeros((BATCH_SIZE, ACTION_SIZE), dtype=np.float32)
    action_hot[np.arange(BATCH_SIZE), a] = 1
    Q = iq(ns, tau)
    theta_target = np.random.rand(BATCH_SIZE, N_QUANTILES)
    theta_target = theta_target.astype('float32')

    optimizer = tf.keras.optimizers.Adam(lr=1e-3)
    with tf.GradientTape() as tape:
        loss = loss_kv_iq(Q, tau, action_hot, theta_target)
    grads = tape.gradient(loss, iq.trainable_weights)
    optimizer.apply_gradients(zip(grads, iq.trainable_weights))
Error:
Traceback (most recent call last):
  File "C:\Users\rensj\.spyder-py3\Thesis\test.py", line 106, in <module>
    optimizer.apply_gradients(zip(grads,iq.trainable_weights))
  File "C:\Users\rensj\Anaconda3\envs\tfnew\lib\site-packages\tensorflow_core\python\keras\optimizer_v2\optimizer_v2.py", line 426, in apply_gradients
    grads_and_vars = _filter_grads(grads_and_vars)
  File "C:\Users\rensj\Anaconda3\envs\tfnew\lib\site-packages\tensorflow_core\python\keras\optimizer_v2\optimizer_v2.py", line 1039, in _filter_grads
    ([v.name for _, v in grads_and_vars],))
ValueError: No gradients provided for any variable: ['iqn_4/iq_nlayer_4/dense_16/kernel:0', 'iqn_4/iq_nlayer_4/dense_16/bias:0', 'iqn_4/iq_nlayer_4/dense_17/kernel:0', 'iqn_4/iq_nlayer_4/dense_17/bias:0', 'iqn_4/dense_18/kernel:0', 'iqn_4/dense_18/bias:0', 'iqn_4/dense_19/kernel:0', 'iqn_4/dense_19/bias:0'].
EDIT:
As Mr. Agrawal pointed out, I was using numpy operations in pi_mtx. I changed these to their TensorFlow counterparts, and together with some other small changes to the same line, it becomes:

pi_mtx = tf.constant(tf.expand_dims(tf.constant(np.pi) * tf.range(0, 64, dtype=tf.float32), axis=0), dtype=tf.float32)

However, I keep getting the same ValueError: No gradients provided.
In the line

pi_mtx = tf.constant(np.expand_dims(np.pi * np.arange(0, 64), axis=0), dtype=tf.float32)

you're using numpy functions. Change them to their TensorFlow counterparts:

np.expand_dims -> tf.expand_dims
np.arange -> tf.keras.backend.arange OR tf.range

You can keep np.pi, since that is a constant, not an operation.
I tried to implement a class-based convolutional neural network for the facial expression recognition data on Kaggle using TensorFlow. However, for some reason my network does not train, and I keep getting the same cost and error rates at each iteration.
I tried using one-hot vectors for the labels and changing hyperparameters, but they did not have any effect on the result.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.utils import shuffle
def get_data():
    df = pd.read_csv('../large_files/fer2013/fer2013.csv')
    Y = df.emotion.to_numpy()
    XX = df.pixels
    X = []
    for i in range(len(XX)):
        X.append(XX[i].split())
    X = np.array(X).astype(np.float)
    Z = df.Usage
    train = (Z == 'Training').to_list()
    test = [not i for i in train]
    Xtrain = X[train].astype(np.float32)
    Xtrain = Xtrain.reshape((Xtrain.shape[0], int(np.sqrt(Xtrain.shape[1])), int(np.sqrt(Xtrain.shape[1])), 1))
    Xtest = X[test].astype(np.float32)
    Xtest = Xtest.reshape((Xtest.shape[0], int(np.sqrt(Xtest.shape[1])), int(np.sqrt(Xtest.shape[1])), 1))
    Ytrain = Y[train].astype(np.int32)
    Ytest = Y[test].astype(np.int32)
    return Xtrain / 255, Xtest / 255, Ytrain, Ytest
def convpool(X, W, b, poolsz):
    conv_out = tf.nn.conv2d(X, W, strides=[1,1,1,1], padding='SAME')
    conv_out = tf.nn.bias_add(conv_out, b)
    pool_out = tf.nn.max_pool(conv_out, ksize=[1,poolsz,poolsz,1], strides=[1,poolsz,poolsz,1], padding='SAME')
    return tf.nn.relu(pool_out)

def init_filter(shape):
    w = np.random.rand(*shape) * np.sqrt(2 / np.prod(shape[:-1]))
    return w.astype(np.float32)

def error_rate(Y, T):
    return np.mean(Y != T)
class FullyConnectedLayer():
    def __init__(self, M1, M2, activation=tf.nn.relu):
        W = np.random.randn(M1, M2) / np.sqrt(M1 + M2)
        self.W = tf.Variable(W.astype(np.float32))
        b = np.zeros(M2)
        self.b = tf.Variable(b.astype(np.float32))
        self.activation = activation

    def forward(self, X):
        if self.activation == None:
            return tf.matmul(X, self.W) + self.b
        else:
            return self.activation(tf.matmul(X, self.W) + self.b)

class ConvolutionLayer():
    def __init__(self, filter_shape, b, poolsz=2):
        W = init_filter(filter_shape)
        self.W = tf.Variable(W)
        self.b = tf.Variable(b.astype(np.float32))
        self.poolsize = poolsz

    def forward(self, X):
        return convpool(X, self.W, self.b, self.poolsize)
class CNN():
    def __init__(self, filter_shapes, dense_layer_sizes):
        self.filter_shapes = filter_shapes  # List of shapes
        self.dense_layer_sizes = dense_layer_sizes  # List of hidden units for dense layers

    def fit(self, trainset, testset, learning_rate=0.001, momentum=0.9, decay=0.99, batch_sz=200, poolsize=2):
        learning_rate = np.float32(learning_rate)
        momentum = np.float32(momentum)
        decay = np.float32(decay)
        Xtrain = trainset[0]
        Ytrain = trainset[1]
        Xtest = testset[0]
        Ytest = testset[1]
        K = len(set(Ytrain))

        # Crop Train and Test sets for divisibility to batch size
        Ntrain = len(Ytrain)
        Ntrain = Ntrain // batch_sz * batch_sz
        Xtrain = Xtrain[:Ntrain,]
        Ytrain = Ytrain[:Ntrain]
        Ntest = len(Ytest)
        Ntest = Ntest // batch_sz * batch_sz
        Xtest = Xtest[:Ntest,]
        Ytest = Ytest[:Ntest]

        X_shape = Xtrain.shape
        width = X_shape[1]
        height = X_shape[2]

        # Create Convolution Layers and Store Them
        self.convolutionlayers = []
        for shape in self.filter_shapes:
            b = np.zeros(shape[-1], dtype=np.float32)
            conv = ConvolutionLayer(shape, b, poolsz=poolsize)
            self.convolutionlayers.append(conv)

        # Size of both width and height is halved in each max pooling, so the input size of the first fully connected layer is found like this
        final_filter_shape = self.filter_shapes[-1]
        num_convs = len(self.convolutionlayers)
        M1 = int((width/(2**num_convs)) * (height/(2**num_convs)) * final_filter_shape[-1])

        # Create Fully Connected Layers and Store Them
        self.vanillalayers = []
        for M2 in self.dense_layer_sizes:
            layer = FullyConnectedLayer(M1, M2)
            self.vanillalayers.append(layer)
            M1 = M2
        final_layer = FullyConnectedLayer(M1, K, activation=None)
        self.vanillalayers.append(final_layer)
        self.AllLayers = self.convolutionlayers + self.vanillalayers

        tfX = tf.placeholder(dtype=tf.float32, shape=(batch_sz, width, height, 1))
        tfT = tf.placeholder(dtype=tf.int32, shape=(batch_sz,))
        Yish = self.forward(tfX)
        cost = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=Yish, labels=tfT))
        train_op = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=decay, momentum=momentum).minimize(cost)
        predict_op = self.predict(tfX)

        max_epoch = 10
        print_period = 20
        num_batches = Ntrain // batch_sz
        TestCosts = []

        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            for i in range(max_epoch):
                Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
                for j in range(num_batches):
                    Xbatch = Xtrain[j * batch_sz: (j + 1)*batch_sz,]
                    Ybatch = Ytrain[j * batch_sz: (j + 1)*batch_sz,]
                    sess.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch})
                    if j % print_period == 0:
                        test_cost = 0
                        prediction = np.zeros(Ntest)
                        for k in range(Ntest // batch_sz):
                            Xtestbatch = Xtest[k*batch_sz:(k*batch_sz + batch_sz),]
                            Ytestbatch = Ytest[k*batch_sz:(k*batch_sz + batch_sz),]
                            test_cost += sess.run(cost, feed_dict={tfX: Xtestbatch, tfT: Ytestbatch})
                            prediction[k*batch_sz:(k*batch_sz + batch_sz)] = sess.run(
                                predict_op, feed_dict={tfX: Xtestbatch})
                        err = error_rate(prediction, Ytest)
                        print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
                        TestCosts.append(test_cost)
        plt.plot(TestCosts)
        plt.show()

    def forward(self, X):
        Z = X
        count = 0
        for layer in self.AllLayers:
            # If next layer is fully connected layer, reshape Z
            if count >= len(self.convolutionlayers):
                Z_shape = Z.get_shape().as_list()
                Z = tf.reshape(Z, [Z_shape[0], np.prod(Z_shape[1:])])
            Z = layer.forward(Z)
            count += 1
        return Z

    def predict(self, X):
        out = self.forward(X)
        return tf.math.argmax(out, axis=1)
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_data()
    trainset = [Xtrain, Ytrain]
    testset = [Xtest, Ytest]
    filtershapes = [(5,5,1,10), (5,5,10,20), (5,5,20,40)]
    fullylayers = [500, 500]
    cnn = CNN(filtershapes, fullylayers)
    cnn.fit(trainset, testset)

if __name__ == '__main__':
    main()
Hi everybody. I'm trying to build a custom co-attention layer for a matching task, and there is an error that has me very confused.

model = Model(inputs=[ans_input, ques_input], outputs=output)

My program shuts down while running the code above and throws this error:

AttributeError: 'Tensor' object has no attribute '_keras_history'

I guess it means my model cannot form a complete graph. I have tried lots of methods that I found on Stack Overflow and other blogs, but none of them work. :(
I will paste my model below. Thank you for helping me :)
import time
from keras.layers import Embedding, LSTM, TimeDistributed, Lambda
from keras.layers.core import *
from keras.layers.merge import concatenate
from keras.layers.pooling import GlobalMaxPooling1D
from keras.models import *
from keras.optimizers import *
from dialog.keras_lstm.k_call import *
from dialog.model.keras_himodel import ZeroMaskedEntries, logger
class Co_AttLayer(Layer):
    def __init__(self, **kwargs):
        # self.input_spec = [InputSpec(ndim=3)]
        super(Co_AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 2
        assert len(input_shape[0]) == len(input_shape[1])
        super(Co_AttLayer, self).build(input_shape)

    def cosine_sim(self, x):
        ans_ss = K.sum(K.square(x[0]), axis=2, keepdims=True)
        ans_norm = K.sqrt(K.maximum(ans_ss, K.epsilon()))
        ques_ss = K.sum(K.square(x[1]), axis=2, keepdims=True)
        ques_norm = K.sqrt(K.maximum(ques_ss, K.epsilon()))
        tr_ques_norm = K.permute_dimensions(ques_norm, (0, 2, 1))
        tr_ques = K.permute_dimensions(x[1], (0, 2, 1))
        ss = K.batch_dot(x[0], tr_ques, axes=[2, 1])
        den = K.batch_dot(ans_norm, tr_ques_norm, axes=[2, 1])
        return ss / den

    def call(self, x, mask=None):
        cosine = Lambda(self.cosine_sim)(x)
        coqWij = K.softmax(cosine)
        print(x[1].shape, coqWij.shape)
        ai = K.dot(coqWij, x[1])  # (N A Q) (N Q L)
        coaWij = K.softmax(K.permute_dimensions(cosine, (0, 2, 1)))
        qj = K.dot(coaWij, x[0])
        print(qj.shape, ai.shape)
        return concatenate([ai, qj], axis=2)

    def compute_output_shape(self, input_shape):
        return input_shape
def build_QAmatch_model(opts, vocab_size=0, maxlen=300, embedd_dim=50, init_mean_value=None):
    ans_input = Input(shape=(maxlen,), dtype='int32', name='ans_input')
    ques_input = Input(shape=(maxlen,), dtype='int32', name='ques_input')
    embedding = Embedding(output_dim=embedd_dim, input_dim=vocab_size, input_length=maxlen,
                          mask_zero=True, name='embedding')
    dropout = Dropout(opts.dropout, name='dropout')
    lstm = LSTM(opts.lstm_units, return_sequences=True, name='lstm')
    hidden_layer = Dense(units=opts.hidden_units, name='hidden_layer')
    output_layer = Dense(units=1, name='output_layer')
    zme = ZeroMaskedEntries(name='maskedout')

    ans_maskedout = zme(embedding(ans_input))
    ques_maskedout = zme(embedding(ques_input))
    ans_lstm = lstm(dropout(ans_maskedout))  # (A V)
    ques_lstm = lstm(dropout(ques_maskedout))  # (Q V)
    co_att = Co_AttLayer()([ans_lstm, ques_lstm])

    def slice(x, index):
        return x[:, :, index, :]

    ans_att = Lambda(slice, output_shape=(maxlen, embedd_dim), arguments={'index': 0})(co_att)
    ques_att = Lambda(slice, output_shape=(maxlen, embedd_dim), arguments={'index': 1})(co_att)

    merged_ques = concatenate([ques_lstm, ques_att, ques_maskedout], axis=2)
    merged_ans = concatenate([ans_lstm, ans_att, ans_maskedout], axis=2)
    ans_vec = GlobalMaxPooling1D(name='ans_pooling')(merged_ans)
    ques_vec = GlobalMaxPooling1D(name='ques_pooling')(merged_ques)
    ans_hid = hidden_layer(ans_vec)
    ques_hid = hidden_layer(ques_vec)
    merged_hid = concatenate([ans_hid, ques_hid], axis=-1)
    merged_all = concatenate([merged_hid, ans_hid + ques_hid, ans_hid - ques_hid, K.abs(ans_hid - ques_hid)], axis=-1)
    output = output_layer(merged_all)
    model = Model(inputs=[ans_input, ques_input], outputs=output)

    if init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) - np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)
    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='rmsprop')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)
    return model
I can't reproduce your code, but I presume the error happens here:

merged_all = concatenate([merged_hid, ans_hid + ques_hid, ans_hid - ques_hid,
                          K.abs(ans_hid - ques_hid)], axis=-1)

The backend operations +, - and K.abs are not wrapped within a Lambda layer, so the resulting tensors are not Keras tensors and therefore lack attributes such as _keras_history. You could wrap them as follows:
l1 = Lambda(lambda x: x[0] + x[1])([ans_hid, ques_hid])
l2 = Lambda(lambda x: x[0] - x[1])([ans_hid, ques_hid])
l3 = Lambda(lambda x: K.abs(x[0] - x[1]))([ans_hid, ques_hid])
merged_all = concatenate([merged_hid, l1, l2, l3], axis=-1)
NOTE: Not tested.
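As a side note, if your Keras version ships the built-in merge layers, the first two Lambdas could equivalently be replaced with Add and Subtract, which also produce proper Keras tensors (again, a sketch I haven't run against your model):

from keras.layers import Add, Subtract

l1 = Add()([ans_hid, ques_hid])       # element-wise sum as a Keras layer
l2 = Subtract()([ans_hid, ques_hid])  # element-wise difference as a Keras layer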