I am writing a custom layer in Tensorflow 2.0 and I ran to a problem as follow:
I want to transform a 1D weight array (5x1) to a 2D array (10x10). Suppose I have the index to transform from 1D to 2D as follow, weight_index_lst:
weight_id, row, col
1,5,6
2,6,7
3,7,8
4,8,9
5,9,10
The others location of the 2D array will just get a value of 0. Here's my script for the custom layers. My input is in (10x1) shape. For the w_mat, it receives 0 anywhere else that self.w is not assigned
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
class mylayer(layers.Layer):
def __init__(self, weight_index_lst, **kwargs):
super(mylayer, self).__init__(**kwargs)
self.weight_index_lst= weight_index_lst
def build(self):
self.w = self.add_weight(shape = (5,1),
initializer = 'he_normal',
trainable = True)
def call(self, inputs):
ct = 0
w_mat = tf.Variable(np.zeros((21, 21)),dtype='float32',trainable=False)
for i in range(20):
i1 = self.weight_index_lst[i,1] #row index
i2 = self.weight_index_lst[i,2] #column index
w_mat[i1,i2].assign(self.w[ct,0]) #problem with no gradient provided
#or w_mat[i1,i2] = self.w[ct,0] #resource variable cannot be assigned
ct = ct+1
y = tf.matmul(w_mat,inputs)
return y
I could have declared a (10x10) weight array but my deep learning wants the others weight to be 0 and cannot be trained.
If you want to specifically create a new layer with the weights and such then the resolution to your problem (no gradients propagating through assign) is to change all of your operations to be symbolic tensor operations - then TF will be able to propagate the gradients. One way to do so is to create 1d tensor of weights you want to train, append non-trainable const tensor with 0.0 value and then use tf.gather to select the needed weights and/or constant zero for each of n**2 elements of the matrix you want to use to multiply the layer's input by. Since all operations are symbolic tensor operations TF will be able to propagate gradients with no problems. Code of such approach below:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
class mylayer(layers.Layer):
def __init__(self, n, weight_index_lst, **kwargs):
super(mylayer, self).__init__(**kwargs)
self.weight_index_lst = weight_index_lst
self.n = n
def build(self, input_shape):
self.w = self.add_weight(shape = (len(self.weight_index_lst),),
initializer = 'he_normal',
trainable = True)
def call(self, inputs):
const_zero = tf.constant([0.], dtype=tf.float32)
const_zero_and_weights = tf.concat([const_zero, self.w], axis=0)
ct = 1 # start with 1 since 0 means take the non-trainable 0. from const_zero_and_weights
selector = np.zeros((self.n ** 2), dtype=np.int32) # indicies
for i, j in self.weight_index_lst:
selector[i * self.n + j] = ct
ct = ct+1
t_ind = tf.constant(selector, dtype=tf.int32)
w_flattened = tf.gather(const_zero_and_weights, t_ind)
w_matrix = tf.reshape(w_flattened, (self.n, self.n))
y = tf.matmul(w_matrix, inputs)
return y
m = tf.keras.Sequential([
layers.Dense(21**2, input_shape=(45,)),
layers.Reshape(target_shape=(21,21)),
mylayer(21, [(4,5), (5,6), (6,7), (7,8), (8,9)]),
])
m.summary()
You don't need to create a trainable layer for this. Consider just using non-trainable lambda layer:
def select_as_needed(x, wrc, n):
selector = np.zeros(n * n, dtype=np.int32) # tensor with the index of input element we want to select in each cell (0 otherwise)
mask = np.zeros(n * n, dtype=np.float32) # 0./1. tensor with ones only on the positions where we put some selected element
for w, r, c in wrc:
selector[r * n + c] = w
mask[r * n + c] = 1.0
t_ind = tf.constant(selector, dtype=tf.int32)
t_mask = tf.constant(mask, dtype=tf.float32)
return tf.gather(x, t_ind, axis=1) * mask # if we don't multiply by mask the 0-index value of input will go to all positions for which we didn't select anything
wrc = [(0,4,5), (1,5,6), (2,6,7), (3,7,8), (4,8,9)] # same as your table, but 0-based
n = 10
model = tf.keras.models.Sequential([
# ... your stuff
tf.keras.layers.Dense(5, 'linear'), # output of 5 neurons (or replace with whatever else you have which is producing 5 outputs per sample)
tf.keras.layers.Lambda(select_as_needed, arguments={'wrc': wrc, 'n':n}),
tf.keras.layers.Reshape(target_shape=(n, n)),
])
Related
I wanna implement the backward propagation concept in python with the next code
class MLP(object):
def __init__(self, num_inputs=3, hidden_layers=[3, 3], num_outputs=2):
self.num_inputs = num_inputs
self.hidden_layers = hidden_layers
self.num_outputs = num_outputs
layers = [num_inputs] + hidden_layers + [num_outputs]
weights = []
bias = []
for i in range(len(layers) - 1):
w = np.random.rand(layers[i], layers[i + 1])
b=np.random.randn(layers[i+1]).reshape(1, layers[i+1])
weights.append(w)
bias.append(b)
self.weights = weights
self.bias = bias
activations = []
for i in range(len(layers)):
a = np.zeros(layers[i])
activations.append(a)
self.activations = activations
def forward_propagate(self, inputs):
activations = inputs
self.activations[0] = activations
for i, w in enumerate(self.weights):
for j, b in enumerate(self.bias):
net_inputs = self._sigmoid((np.dot(activations, w)+b))
self.activations[i + 1] = net_inputs
return activations
def train(self, inputs, targets, epochs, learning_rate):
for i in range(epochs):
sum_errors = 0
for j, input in enumerate(inputs):
target = targets[j]
output = self.forward_propagate(input)
def _sigmoid(self, x):
y = 1.0 / (1 + np.exp(-x))
return y
So I created the next dummy data in order to verify everything is correct
items = np.array([[random()/2 for _ in range(2)] for _ in range(1000)])
targets = np.array([[i[0] + i[1]] for i in items])
mlp = MLP(2, [5], 1)
mlp.train(items, targets, 2, 0.1)
but when I run the code I have the next error
ValueError: shapes (2,) and (5,1) not aligned: 2 (dim 0) != 5 (dim 0)
I understand the error, but how to solve it?
a couple of major problems with forward_propagate:
change net_inputs to activations - otherwise you always compute and return the activations from the first layer
remove for j, b in enumerate(self.bias): - biases from other layers have no business here
use matmul instead of dot
so, something like
for i, w in enumerate(self.weights):
activations = self._sigmoid((np.matmul(activations, w)+self.bias[i]))
self.activations[i + 1] = activations
return activations
Also, be careful to note that this method receives 1D array, which converts to a matrix after the first matmul. Matrixes are stored in self.activations and a matrix is returned from the method.
This might or might not be what you want.
I have a custom loss where I want to apply Gaussian filter to a predicted label to manipulate it a little. Using max or average pooling is simple as it is predefined in keras, but I had to make my own class for Gaussian pooling:
import numpy as np
from keras.layers import DepthwiseConv2D
from keras.layers import Input
from keras.models import Model
import tensorflow as tf
class Gaussian():
def __init__(self,shape, f = 3):
self.filt = f
self.g = self.gaussFilter(shape)
def doFilter(self, data):
return self.g.predict(data, steps=1) #steps are for predicting on const tensor, I change it when predicting on predictions
def gauss2D(self,shape=(3,3),sigma=0.5):
m,n = [(ss-1.)/2. for ss in shape]
y,x = np.ogrid[-m:m+1,-n:n+1]
h = np.exp( -(x*x + y*y) / (2.*sigma*sigma) )
h[ h < np.finfo(h.dtype).eps*h.max() ] = 0
sumh = h.sum()
if sumh != 0:
h /= sumh
return h
def gaussFilter(self, size=256):
kernel_weights = self.gauss2D(shape=(self.filt,self.filt))
in_channels = 1 # the number of input channels
kernel_weights = np.expand_dims(kernel_weights, axis=-1)
kernel_weights = np.repeat(kernel_weights, in_channels, axis=-1) # apply the same filter on all the input channels
kernel_weights = np.expand_dims(kernel_weights, axis=-1) # for shape compatibility reasons
inp = Input(shape=(size,size,1))
g_layer = DepthwiseConv2D(self.filt, use_bias=False, padding='same')(inp)
model_network = Model(input=inp, output=g_layer)
print(model_network.summary())
model_network.layers[1].set_weights([kernel_weights])
model_network.trainable= False
return model_network
This works as expected when feeding a constant tensor to the doFilter function, an example of simple data:
a = np.array([[[1, 2, 3], [4, 5, 6], [4, 5, 6]]])
filt = Gaussian(3)
print(filt.doFilter(tf.constant(a.reshape(1,3,3,1))))
However, if I try to use this in a custom loss :
def custom_loss_no_true(input_tensor, length):
def loss(y_true, y_pred):
gaus_pooler = Gaussian(256, length//8)
a = gaus_pooler.doFilter(y_pred)
...more stuff comes after
I get an error:
ValueError: When feeding symbolic tensors to a model, we expect the
tensors to have a static batch size. Got tensor with shape: (None,
256, 256, 1)
This is as I have found caused by the fact, that I am feeding a tensor that is an output of other model, a symbolic data, not actual values (source). Thus I need to change the logic of my approach, because evaluating the tensor to feed my class would break the graph and lead to no gradient propagation within the loss (or am I incorrect?). How can I apply such convolution operation on a tensor that is an output of other model? Is it even possible? Or maybe there is a way to use it without adding the layer to the model, such as MaxPooling?
You don't really need a complex keras Model nor a keras Layer if what you want to do is just convolve your input with a Gaussian kernel. Here is a port of your code with simple tensorflow ops :
import tensorflow as tf
def get_gaussian_kernel(shape=(3,3), sigma=0.5):
"""build the gaussain filter"""
m,n = [(ss-1.)/2. for ss in shape]
x = tf.expand_dims(tf.range(-n,n+1,dtype=tf.float32),1)
y = tf.expand_dims(tf.range(-m,m+1,dtype=tf.float32),0)
h = tf.exp(tf.math.divide_no_nan(-((x*x) + (y*y)), 2*sigma*sigma))
h = tf.math.divide_no_nan(h,tf.reduce_sum(h))
return h
def gaussian_blur(inp, shape=(3,3), sigma=0.5):
"""Convolve using tf.nn.depthwise_conv2d"""
in_channel = tf.shape(inp)[-1]
k = get_gaussian_kernel(shape,sigma)
k = tf.expand_dims(k,axis=-1)
k = tf.repeat(k,in_channel,axis=-1)
k = tf.reshape(k, (*shape, in_channel, 1))
# using padding same to preserve size (H,W) of the input
conv = tf.nn.depthwise_conv2d(inp, k, strides=[1,1,1,1],padding="SAME")
return conv
You can use it simply in your custom loss (assuming a 4D y_pred [batch, height width, channel]) :
a = gaussian_blur(y_pred)
I'm working on a school project and am stuck on how to implement backpropagation in Numpy with the current forward prop structure I have. The aim of this script is to make a simple dynamic (meaning any number of layers and nodes) fully connected network using only numpy.
I think that I have to find the derivatives of the activation functions and multipliy it by the original error as well as the derivative of each activation function I encounter moving backward.
However, I'm having trouble figuring out how to implement this correctly in my script.
It'd be a great help if someone could explain in English what exactly I have to do given the complexities of the setup here, or even give a recommendation for a video/post that deals w dynamic size backprop.
Right now all the weights and biases are being stored in lists for future backprop, and I'm able to get the error for each output with the small amount of code currently in the backprop function.
This code block
#initialize a test model w/ 128 bacth and lr of 0.01
model = Model(128, 0.01)
#simple x data input
X = np.array([[1,1],[0,0],[12,5]])
Y = np.array([[1],[0],[-1]])
#adding 4 layers
z = model.add(X, 3, "sigmoid")
z = model.add(z, 1, "sigmoid", output=True)
#this is a full forward pass through the layers
z = model.predict(X)
print(z)
#this is the error of the predictions
print(model.backprop(z, Y))
Outputs the following vectors:
[[0.50006457]
[0.50006459]
[0.50006431]]
[[0.24993544]
[0.2500646 ]
[2.25019293]]
Like I said, not sure how to move forward ( or backward ;) ) from here.
Below is the full script needed to run the example:
import math
import numpy as np
#everything below is defining activation functions
#--------------------------------------------------------------------------------------------
def b_relu(input):
return max((0, max(input)))
def bd_relu(input):
if(input < 0 or input == 0):
return 0
else:
return 1
def b_sigmoid(x):
return 1 / (1 + math.exp(-x))
def bd_sigmoid(input):
return sigmoid(input) * (1 - sigmoid(input))
def b_tanh(input):
top = (math.exp(input) - math.exp(-input))
bottom = (math.exp(input) + math.exp(-input))
return (top/bottom)
#helper functions for tanh
def cosh(input):
return ((math.exp(input) + math.exp(-input)) / 2)
def sinh(input):
return ((math.exp(input) - math.exp(-input)) / 2)
def bd_tanh(input):
top = (math.pow(cosh(input), 2) - math.pow(sinh(input), 2))
bottom = math.pow(input, 2)
return (top / bottom)
def b_softmax(z):
# subracting the max adds numerical stability
shiftx = z - np.max(z,axis=1)[:,np.newaxis]
exps = np.exp(shiftx)
return exps / np.sum(exps,axis=1)[:,np.newaxis]
def bd_softmax(Y_hat, Y):
return Y_hat - Y
def b_linear(input):
return input
def bd_linear(input):
return 1
#vectorizing the activation and deriv. activation functions
relu = np.vectorize(b_relu)
d_relu = np.vectorize(bd_relu)
sigmoid = np.vectorize(b_sigmoid)
d_sigmoid = np.vectorize(bd_sigmoid)
tanh = np.vectorize(b_tanh)
d_tanh = np.vectorize(bd_tanh)
softmax = np.vectorize(b_softmax)
d_softmax = np.vectorize(bd_softmax)
linear = np.vectorize(b_linear)
d_linear = np.vectorize(bd_linear)
class Model:
def __init__(self, batch, lr):
#initializing self lists to keep track of stuff for bacthes, forward prop & backporp
self.batch = batch
self.lr = lr
self.W = []
self.B = []
self.A = []
self.Z = []
self.X = []
self.layers = []
self.tempW = []
self.tempB = []
#store error for backprop
self.output_error = []
#initialize the weights during 'model.add' so we can test our network shapes dynamically w/out model.compile
#added an output bool here so we can make sure the shape of the output network is (1,n)
def initial_weights(self, input_data, output_shape, output=False):
B = np.zeros((1, output_shape))
#assigning the shape
W = np.random.uniform(-1e-3, 1e-3, size = (input_data.shape[len(input_data.shape) - 1], output_shape))
self.B.append(B)
self.W.append(W)
def add(self, input_data, output_shape, activation, output=False):
#append to layers so we have a correct index value
self.layers.append(69)
#making sure our data in a numpy array
if (type(input_data) == np.ndarray):
X = input_data
else:
X = np.asarray(input_data)
#adding data and activations to self lists
self.X.append(X)
self.A.append(activation)
#keep track of our index & initializing random weights for dynamic comatibility testing
index = len(self.layers)-1
self.initial_weights(input_data, output_shape, output=False)
X2 = self.forward(input_data, index)
#printing layer info
print("Layer:", index)
print("Input Shape: ", X.shape)
print("Weight Shape: ", self.W[index].shape)
print("Output Shape: ", X2.shape)
print(" ")
return(X2)
def forward(self, input_data, index):
#pulling weights and biases from main lists for operations
B = self.B[index]
W = self.W[index]
#matmul of data # weights + bias
Z = np.matmul(input_data, W) + B
#summing each row of inputs to activation node
for x in Z:
x = sum(x)
#pulling activation from index
act = str(self.A[index])
#activating
Z = activate(Z, act)
#keeping track of Z i guess
self.Zappend = Z
return(Z)
def predict(self, input_data):
for x in range(len(self.layers)):
z = model.forward(input_data, x)
input_data = z
return z
def backprop(self, model_output, ground_truth):
#------------------------------
#now begins the backprop portion
#let's start with finding the error between predictions and actual values
#gonna do MSE to keep it simple
self.output_error = (ground_truth - model_output) ** 2
#so now we have the error of the output layer, this tells us two things, how wrong we were, and in which direction we should update
#the outputs of these nodes
'''
What to do if this was linear regression (for m & b)
1. Take the error and multiply it by the transpose of the last layer weights
(I think the error in this case is where the prime activation function should be if we had activations)
2. The last layer bias is just the error
3. The second to last layer inputs is the bias times the transpose of second layers weights
3. Then I have no idea
'''
return self.output_error
I try to reproduce results generated by the LSTMCell from TensorFlow to be sure that I know what it does.
Here is my TensorFlow code:
num_units = 3
lstm = tf.nn.rnn_cell.LSTMCell(num_units = num_units)
timesteps = 7
num_input = 4
X = tf.placeholder("float", [None, timesteps, num_input])
x = tf.unstack(X, timesteps, 1)
outputs, states = tf.contrib.rnn.static_rnn(lstm, x, dtype=tf.float32)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
x_val = np.random.normal(size = (1, 7, num_input))
res = sess.run(outputs, feed_dict = {X:x_val})
for e in res:
print e
Here is its output:
[[-0.13285545 -0.13569424 -0.23993783]]
[[-0.04818152 0.05927373 0.2558436 ]]
[[-0.13818116 -0.13837864 -0.15348436]]
[[-0.232219 0.08512601 0.05254192]]
[[-0.20371495 -0.14795329 -0.2261929 ]]
[[-0.10371902 -0.0263292 -0.0914975 ]]
[[0.00286371 0.16377522 0.059478 ]]
And here is my own implementation:
n_steps, _ = X.shape
h = np.zeros(shape = self.hid_dim)
c = np.zeros(shape = self.hid_dim)
for i in range(n_steps):
x = X[i,:]
vec = np.concatenate([x, h])
#vec = np.concatenate([h, x])
gs = np.dot(vec, self.kernel) + self.bias
g1 = gs[0*self.hid_dim : 1*self.hid_dim]
g2 = gs[1*self.hid_dim : 2*self.hid_dim]
g3 = gs[2*self.hid_dim : 3*self.hid_dim]
g4 = gs[3*self.hid_dim : 4*self.hid_dim]
I = vsigmoid(g1)
N = np.tanh(g2)
F = vsigmoid(g3)
O = vsigmoid(g4)
c = c*F + I*N
h = O * np.tanh(c)
print h
And here is its output:
[-0.13285543 -0.13569425 -0.23993781]
[-0.01461723 0.08060743 0.30876374]
[-0.13142865 -0.14921292 -0.16898363]
[-0.09892188 0.11739943 0.08772941]
[-0.15569218 -0.15165766 -0.21918869]
[-0.0480604 -0.00918626 -0.06084118]
[0.0963612 0.1876516 0.11888081]
As you might notice I was able to reproduce the first hidden vector, but the second one and all the following ones are different. What am I missing?
i examined this link and your code is almost perfect but you forgot to add forget_bias value(default 1.0) in this line F = vsigmoid(g3) its actualy F = vsigmoid(g3+self.forget_bias) or in your case its 1 F = vsigmoid(g3+1)
here is my imp with numpy:
import numpy as np
import tensorflow as tf
num_units = 3
lstm = tf.nn.rnn_cell.LSTMCell(num_units = num_units)
batch=1
timesteps = 7
num_input = 4
X = tf.placeholder("float", [batch, timesteps, num_input])
x = tf.unstack(X, timesteps, 1)
outputs, states = tf.contrib.rnn.static_rnn(lstm, x, dtype=tf.float32)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
x_val = np.reshape(range(28),[batch, timesteps, num_input])
res = sess.run(outputs, feed_dict = {X:x_val})
for e in res:
print(e)
print("\nmy imp\n")
#my impl
def sigmoid(x):
return 1/(1+np.exp(-x))
kernel,bias=sess.run([lstm._kernel,lstm._bias])
f_b_=lstm._forget_bias
c,h=np.zeros([batch,num_input-1]),np.zeros([batch,num_input-1])
for step in range(timesteps):
inpt=np.split(x_val,7,1)[step][0]
lstm_mtrx=np.matmul(np.concatenate([inpt,h],1),kernel)+bias
i,j,f,o=np.split(lstm_mtrx,4,1)
c=sigmoid(f+f_b_)*c+sigmoid(i)*np.tanh(j)
h=sigmoid(o)*np.tanh(c)
print(h)
output:
[[ 0.06964055 -0.06541953 -0.00682676]]
[[ 0.005264 -0.03234607 0.00014838]]
[[ 1.617855e-04 -1.316892e-02 8.596722e-06]]
[[ 3.9425286e-06 -5.1347450e-03 7.5078127e-08]]
[[ 8.7508155e-08 -1.9560163e-03 6.3853928e-10]]
[[ 1.8867894e-09 -7.3784427e-04 5.8551406e-12]]
[[ 4.0385355e-11 -2.7728223e-04 5.3957669e-14]]
my imp
[[ 0.06964057 -0.06541953 -0.00682676]]
[[ 0.005264 -0.03234607 0.00014838]]
[[ 1.61785520e-04 -1.31689185e-02 8.59672610e-06]]
[[ 3.94252745e-06 -5.13474567e-03 7.50781122e-08]]
[[ 8.75080644e-08 -1.95601574e-03 6.38539112e-10]]
[[ 1.88678843e-09 -7.37844070e-04 5.85513438e-12]]
[[ 4.03853841e-11 -2.77282006e-04 5.39576024e-14]]
Tensorflow uses glorot_uniform() function to initialize the lstm kernel, which samples weights from a random uniform distribution. We need to fix a value for the kernel to get reproducible results:
import tensorflow as tf
import numpy as np
np.random.seed(0)
timesteps = 7
num_input = 4
x_val = np.random.normal(size = (1, timesteps, num_input))
num_units = 3
def glorot_uniform(shape):
limit = np.sqrt(6.0 / (shape[0] + shape[1]))
return np.random.uniform(low=-limit, high=limit, size=shape)
kernel_init = glorot_uniform((num_input + num_units, 4 * num_units))
My implementation of the LSTMCell (well, actually it's just slightly rewritten tensorflow's code):
def sigmoid(x):
return 1. / (1 + np.exp(-x))
class LSTMCell():
"""Long short-term memory unit (LSTM) recurrent network cell.
"""
def __init__(self, num_units, initializer=glorot_uniform,
forget_bias=1.0, activation=np.tanh):
"""Initialize the parameters for an LSTM cell.
Args:
num_units: int, The number of units in the LSTM cell.
initializer: The initializer to use for the kernel matrix. Default: glorot_uniform
forget_bias: Biases of the forget gate are initialized by default to 1
in order to reduce the scale of forgetting at the beginning of
the training.
activation: Activation function of the inner states. Default: np.tanh.
"""
# Inputs must be 2-dimensional.
self._num_units = num_units
self._forget_bias = forget_bias
self._activation = activation
self._initializer = initializer
def build(self, inputs_shape):
input_depth = inputs_shape[-1]
h_depth = self._num_units
self._kernel = self._initializer(shape=(input_depth + h_depth, 4 * self._num_units))
self._bias = np.zeros(shape=(4 * self._num_units))
def call(self, inputs, state):
"""Run one step of LSTM.
Args:
inputs: input numpy array, must be 2-D, `[batch, input_size]`.
state: a tuple of numpy arrays, both `2-D`, with column sizes `c_state` and
`m_state`.
Returns:
A tuple containing:
- A `2-D, [batch, output_dim]`, numpy array representing the output of the
LSTM after reading `inputs` when previous state was `state`.
Here output_dim is equal to num_units.
- Numpy array(s) representing the new state of LSTM after reading `inputs` when
the previous state was `state`. Same type and shape(s) as `state`.
"""
num_proj = self._num_units
(c_prev, m_prev) = state
input_size = inputs.shape[-1]
# i = input_gate, j = new_input, f = forget_gate, o = output_gate
lstm_matrix = np.hstack([inputs, m_prev]).dot(self._kernel)
lstm_matrix += self._bias
i, j, f, o = np.split(lstm_matrix, indices_or_sections=4, axis=0)
# Diagonal connections
c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
self._activation(j))
m = sigmoid(o) * self._activation(c)
new_state = (c, m)
return m, new_state
X = x_val.reshape(x_val.shape[1:])
cell = LSTMCell(num_units, initializer=lambda shape: kernel_init)
cell.build(X.shape)
state = (np.zeros(num_units), np.zeros(num_units))
for i in range(timesteps):
x = X[i,:]
output, state = cell.call(x, state)
print(output)
Produces output:
[-0.21386017 -0.08401277 -0.25431477]
[-0.22243588 -0.25817422 -0.1612211 ]
[-0.2282134 -0.14207162 -0.35017249]
[-0.23286737 -0.17129192 -0.2706512 ]
[-0.11768674 -0.20717363 -0.13339118]
[-0.0599215 -0.17756104 -0.2028935 ]
[ 0.11437953 -0.19484555 0.05371994]
While your Tensorflow code, if you replace the second line with
lstm = tf.nn.rnn_cell.LSTMCell(num_units = num_units, initializer = tf.constant_initializer(kernel_init))
returns:
[[-0.2138602 -0.08401276 -0.25431478]]
[[-0.22243595 -0.25817424 -0.16122109]]
[[-0.22821338 -0.1420716 -0.35017252]]
[[-0.23286738 -0.1712919 -0.27065122]]
[[-0.1176867 -0.2071736 -0.13339119]]
[[-0.05992149 -0.177561 -0.2028935 ]]
[[ 0.11437953 -0.19484554 0.05371996]]
Here is a blog which will answer any conceptual questions related to LSTM's. Seems that there is a lot which goes into building an LSTM from scratch!
Of course, this answer doesn't solve your question but just giving a direction.
Considering Linear Algebra, it's possible to exist a dimension mismatch in the matrix multiplication between I*N (red circle), affecting the output, given that n x m dot m x p will give you a n x p dimensional output.
I got an error when trying to create a simple binary classification for XOR case using Theano. It said dimension mismatch, but I can't find out what variable cause that.
and the strange part, my program is works when I change the number of neuron in the last layer. When I change to use 2 neuron in the last layer, and change that layer to softmax layer, and also use the negative log likelihood (multiclass classification style), this program is works fine.
This is my full code:
import numpy as np
import theano
import theano.tensor as T
class HiddenLayer(object):
def __init__(self, input, nIn, nOut, is_last, W=None):
self.input = input
W_val = np.random.randn(nIn,nOut)*0.001
b_val = np.zeros((nOut,))
self.W = theano.shared(np.asarray(W_val,dtype=theano.config.floatX),
name='W',borrow=True)
self.b = theano.shared(np.asarray(b_val,dtype=theano.config.floatX),
name='b',borrow=True)
self.z = T.dot(input,self.W) + self.b
if(is_last==0):
self.output = T.switch(self.z < 0 , 0 ,self.z)
else:
self.output = T.nnet.sigmoid(self.z)
self.y_pred = self.output > 0.5
self.params = [self.W, self.b]
def cost_function(self,y):
return -T.mean(y*T.log(self.output)+(1-y)*T.log(1-self.output))
def errors(self,y):
return T.mean(T.neq(self.y_pred,y))
alfa = 1
epoch = 1000
neu = 5
inpx = np.array([[1,0],[1,1],[0,0],[0,1]])
inpy = np.array([1,0,0,1])
x = T.fmatrix('x')
y = T.ivector('y')
layer0 = HiddenLayer(
input = x,
nIn = 2,
nOut = neu,
is_last=0
)
layer1 = HiddenLayer(
input = layer0.output,
nIn = neu,
nOut = 1,
is_last=1
)
params = layer0.params + layer1.params
cost = layer1.cost_function(y)
grads = T.grad(cost, params)
updates = [(param_i, param_i - alfa * grad_i) for param_i, grad_i in zip(params, grads)]
eror = layer1.errors(y)
train_model = theano.function([x,y], [eror,cost],updates=updates,allow_input_downcast=True)
test_model = theano.function([x,y],[eror,layer1.y_pred],allow_input_downcast=True)
for i in xrange(epoch):
etr,ctr = train_model(inpx, inpy)
if i%(epoch/10)==0:
print etr,ctr
et,pt = test_model(inpx,inpy)
print pt
and the error:
ValueError: Input dimension mis-match. (input[0].shape[1] = 1, input[1].shape[1] = 4)
Apply node that caused the error: Elemwise{neq,no_inplace}(sigmoid.0, DimShuffle{x,0}.0)
Toposort index: 41
Inputs types: [TensorType(float32, matrix), TensorType(int32, row)]
Inputs shapes: [(4L, 1L), (1L, 4L)]
Inputs strides: [(4L, 4L), (16L, 4L)]
Inputs values: [array([[ 0.94264328],
[ 0.99725735],
[ 0.5 ],
[ 0.95675617]], dtype=float32), array([[1, 0, 0, 1]])]
Outputs clients: [[Shape(Elemwise{neq,no_inplace}.0), Sum{acc_dtype=int64}(Elemwise{neq,no_inplace}.0)]]
Thank you in advance for any help.
Your problem is with your y and inpy variables: what you are trying to do is to have y be the expected output of the network. Your network is given a dataset with 4 elements, each having 2 features, you thus have 4 rows in your input matrix, and 2 columns. You are thus expected to have 4 elements in your predicted output, that is 4 rows in your y or inpy matrix, but you are using a vector, which in theano is a row vector and thus has only one row. You need either to transpose your y vector when computing the cost, or to define your y variable as a matrix, and thus to have inpy as a (4,1) matrix instead of a (4,) vector (once again, vectors are row vectors in theano).
Hope this helps,
Best