I want to use a CNN for 1D data, so I decided to use conv1d layers. The first layers build fine, but when I create the second conv layer I get this error:
ValueError: Dimensions must be equal, but are 1 and 586 for 'conv1_43/conv1d/Conv2D' (op: 'Conv2D') with input shapes: [586,1,1040,1], [1,5,586,6].
This is my data shape:
trainX = dataX[0:616]
trainY = dataY[0:616]
testX = dataX[616:646]
testY = dataY[616:646]
trainX = np.expand_dims(trainX, axis=2)
testX = np.expand_dims(testX, axis=2)
#final shapes: train:(586,1040,1) test:(30,1040,1)
Here's the code:
def new_conv_layer(input, num_input_channels, filter_size, num_filters, name):
    with tf.variable_scope(name) as scope:
        # Shape of the filter-weights for the convolution
        shape = [filter_size, num_input_channels, num_filters]
        # Create new weights (filters) with the given shape
        weights = tf.Variable(tf.truncated_normal(shape, stddev=0.05))
        # Create new biases, one for each filter
        #biases = tf.Variable(tf.constant(0.05, shape=[num_filters]))
        # TensorFlow operation for convolution
        layer = tf.nn.conv1d(input, weights, 1, 'SAME')
        # Add the biases to the results of the convolution.
        #layer += biases
    return layer, weights
# Function for creating a new ReLU Layer
def new_relu_layer(input, name):
    with tf.variable_scope(name) as scope:
        # TensorFlow operation for ReLU
        layer = tf.nn.relu(input)
    return layer
# Convolutional Layer 1
layer_conv1, weights_conv1 = new_conv_layer(trainX, num_input_channels=586, filter_size=5, num_filters=6, name ="conv1")
# Pooling Layer 1
layer_pool1 = max_pooling1d(layer_conv1, 3, 1, name="pool1")
# RelU layer 1
layer_relu1 = new_relu_layer(layer_pool1, name="relu1")
# Convolutional Layer 2
layer_conv2, weights_conv2 = new_conv_layer(input=layer_relu1, num_input_channels=1, filter_size=5, num_filters=16, name= "conv2")
# Pooling Layer 2
layer_pool2 = max_pooling1d(layer_conv2, 2, 1, name="pool2")
# RelU layer 2
layer_relu2 = new_relu_layer(layer_pool2, name="relu2")
What is the problem?
The input and kernel shapes passed to tf.nn.conv1d are not right.
From the API doc,
Input tensor should be of shape: [batch, in_width, num_input_channels]
Kernel/weights should be of shape: [filter_width, num_input_channels, out_channels]
The inputs are of shape [586,1,1040,1] and should be [586, 1040, 1], and num_input_channels is defined incorrectly in the calls to new_conv_layer: it should be 1 in the first call and 6 in the second, as in the sketch below.
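As a rough sketch of the corrected calls (my own illustration, assuming TF 1.x graph mode; the placeholder name x and the use of tf.layers.max_pooling1d are my assumptions, not from the original code):

x = tf.placeholder(tf.float32, shape=(None, 1040, 1))  # [batch, in_width, num_input_channels]

# First conv layer: the input has 1 channel, so num_input_channels=1
layer_conv1, weights_conv1 = new_conv_layer(x, num_input_channels=1, filter_size=5,
                                            num_filters=6, name="conv1")
layer_pool1 = tf.layers.max_pooling1d(layer_conv1, 3, 1, name="pool1")
layer_relu1 = new_relu_layer(layer_pool1, name="relu1")

# Second conv layer: the previous layer produced 6 channels, so num_input_channels=6
layer_conv2, weights_conv2 = new_conv_layer(layer_relu1, num_input_channels=6, filter_size=5,
                                            num_filters=16, name="conv2")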
How to solve this error?
Preprocessing of image:
def PreprocessData(img, mask, target_shape_img, target_shape_mask, path1, path2):
    """
    Processes the images and mask present in the shared list and path
    Returns a NumPy dataset with images as 3-D arrays of desired size
    """
    # Pull the relevant dimensions for image and mask
    m = len(img)                        # number of images
    i_h, i_w, i_c = target_shape_img    # pull height, width, and channels of image
    m_h, m_w, m_c = target_shape_mask   # pull height, width, and channels of mask
    # Define X and Y as number of images along with shape of one image
    X = np.zeros((m, i_h, i_w, 1), dtype=np.float32)
    y = np.zeros((m, m_h, m_w, 1), dtype=np.int32)
    # RGBA image has 4 channels.
    # 255 will make the pixel completely opaque,
    # value 0 fully transparent,
    # values in between will make the pixels partly transparent
    # Resize images and masks
    for file in img:
        # convert image into an array of desired shape (3 channels)
        index = img.index(file)
        path = os.path.join(path1, file)
        single_img = np.asarray(Image.open(path).resize((i_h, i_w)))  # (0.21, 0.75, 0.04)
        #single_img = np.reshape(single_img, (i_h, i_w, i_c))
        single_img = single_img / 255.
        X[index] = single_img[..., None]  # X dims: # images, img height, img width, img channels
        # convert mask into an array of desired shape (4 channels)
        single_mask_ind = mask[index]
        path = os.path.join(path2, single_mask_ind)
        single_mask = np.asarray(Image.open(path).resize((i_h, i_w)))
        single_mask = single_mask > 0  # binarizing of targets
        # single_mask = single_mask - 1 ### single_mask = single_mask/256???
        y[index] = single_mask[..., None]  # y dims: # masks, mask height, mask width, mask channels
    return X, y
Encoder:
def EncoderMiniBlock(inputs, n_filters=32, dropout_prob=0.3, max_pooling=True):
    """
    This block uses multiple convolution layers, max pool, relu activation to create an architecture for learning.
    Dropout can be added for regularization to prevent overfitting.
    The block returns the activation values for next layer along with a skip connection which will be used in the decoder
    """
    # Add 2 Conv Layers with relu activation and HeNormal initialization using TensorFlow
    # Proper initialization prevents the problem of exploding and vanishing gradients
    # 'Same' padding will pad the input to conv layer such that the output has the same height and width (hence, is not reduced in size)
    conv = Conv2D(n_filters,
                  3,  # Kernel size
                  activation='relu',
                  padding='same',
                  kernel_initializer='HeNormal')(inputs)
    conv = Conv2D(n_filters,
                  3,  # Kernel size
                  activation='relu',
                  padding='same',
                  kernel_initializer='HeNormal')(conv)
    # Batch Normalization will normalize the output of the last layer based on the batch's mean and standard deviation
    conv = BatchNormalization()(conv, training=False)
    # In case of overfitting, dropout will regularize the loss and gradient computation to shrink the influence of weights on output
    if dropout_prob > 0:
        conv = tf.keras.layers.Dropout(dropout_prob)(conv)
    # Pooling reduces the size of the image while keeping the number of channels the same
    # Pooling has been kept optional as the last encoder layer does not use pooling (hence, makes the encoder block flexible to use)
    # Below, max pooling considers the maximum of the input slice for output computation and uses a stride of 2 to traverse across the input image
    if max_pooling:
        next_layer = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(conv)
    else:
        next_layer = conv
    # The skip connection (without max pooling) will be input to the decoder layer to prevent information loss during transpose convolutions
    skip_connection = conv
    return next_layer, skip_connection
Decoder:
def DecoderMiniBlock(prev_layer_input, skip_layer_input, n_filters=32):
    """
    Decoder Block first uses transpose convolution to upscale the image to a bigger size and then
    merges the result with the skip layer results from the encoder block
    Adding 2 convolutions with 'same' padding helps further increase the depth of the network for better predictions
    The function returns the decoded layer output
    """
    # Start with a transpose convolution layer to first increase the size of the image
    up = Conv2DTranspose(
        n_filters,
        (3, 3),  # Kernel size
        strides=(2, 2),
        padding='same')(prev_layer_input)
    # Merge the skip connection from the previous block to prevent information loss
    merge = concatenate([up, skip_layer_input], axis=3)
    # Add 2 Conv Layers with relu activation and HeNormal initialization for further processing
    # The parameters for the function are similar to the encoder
    conv = Conv2D(n_filters,
                  3,  # Kernel size
                  activation='relu',
                  padding='same',
                  kernel_initializer='HeNormal')(merge)
    conv = Conv2D(n_filters,
                  3,  # Kernel size
                  activation='relu',
                  padding='same',
                  kernel_initializer='HeNormal')(conv)
    return conv
U-Net compilation
def UNetCompiled(input_size=(128, 128, 3), n_filters=32, n_classes=3):
    """
    Combine both encoder and decoder blocks according to the U-Net research paper
    Return the model as output
    """
    # Input size represents the size of 1 image (the size used for pre-processing)
    inputs = Input(input_size)
    # Encoder includes multiple convolutional mini blocks with different maxpooling, dropout and filter parameters
    # Observe that the filters are increasing as we go deeper into the network, which will increase the # channels of the image
    cblock1 = EncoderMiniBlock(inputs, n_filters, dropout_prob=0, max_pooling=True)
    cblock2 = EncoderMiniBlock(cblock1[0], n_filters*2, dropout_prob=0, max_pooling=True)
    cblock3 = EncoderMiniBlock(cblock2[0], n_filters*4, dropout_prob=0, max_pooling=True)
    cblock4 = EncoderMiniBlock(cblock3[0], n_filters*8, dropout_prob=0.3, max_pooling=True)
    cblock5 = EncoderMiniBlock(cblock4[0], n_filters*16, dropout_prob=0.3, max_pooling=False)
    # Decoder includes multiple mini blocks with decreasing number of filters
    # Observe the skip connections from the encoder are given as input to the decoder
    # Recall the 2nd output of the encoder block was the skip connection, hence cblockn[1] is used
    ublock6 = DecoderMiniBlock(cblock5[0], cblock4[1], n_filters * 8)
    ublock7 = DecoderMiniBlock(ublock6, cblock3[1], n_filters * 4)
    ublock8 = DecoderMiniBlock(ublock7, cblock2[1], n_filters * 2)
    ublock9 = DecoderMiniBlock(ublock8, cblock1[1], n_filters)
    # Complete the model with one 3x3 convolution layer (same as the previous conv layers)
    # followed by a 1x1 conv layer to get the image to the desired size.
    # Observe the number of channels will be equal to the number of output classes
    conv9 = Conv2D(n_filters,
                   3,
                   activation='relu',
                   padding='same',
                   kernel_initializer='he_normal')(ublock9)
    conv10 = Conv2D(n_classes, 1, padding='same')(conv9)
    # Define the model
    model = tf.keras.Model(inputs=inputs, outputs=conv10)
    return model
Define the desired shape
target_shape_img = [128, 128, 3]
target_shape_mask = [128, 128,1]
Process data using apt helper function
X, y = PreprocessData(img, mask, target_shape_img, target_shape_mask, path1, path2)
I am not able to understand what is wrong. I am getting this error:
ValueError: in user code:

    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1021, in train_function *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1010, in step_function **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1000, in run_step **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 859, in train_step
        y_pred = self(x, training=True)
    File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/usr/local/lib/python3.7/dist-packages/keras/engine/input_spec.py", line 249, in assert_input_compatibility
        f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Exception encountered when calling layer "model" (type Functional).

    Input 0 of layer "conv2d" is incompatible with the layer: expected axis -1 of input shape to have value 3, but received input with shape (None, 128, 128, 1)

    Call arguments received:
      • inputs=tf.Tensor(shape=(None, 128, 128, 1), dtype=float32)
      • training=True
      • mask=None
You seem to have defined a model that takes inputs of shape (128, 128, 3) but are feeding it inputs of shape (128, 128, 1). If you change the input shape when you define the UNetCompiled function, it should solve the issue:
def UNetCompiled(input_size=(128, 128, 1), n_filters=32, n_classes=3):
Or you could change the input shape in the PreprocessData function if the images are colour and not greyscale images.
You have defined the images as having 1 channel:
# Define X and Y as number of images along with shape of one image
X = np.zeros((m,i_h,i_w,1), dtype=np.float32)
y = np.zeros((m,m_h,m_w,1), dtype=np.int32)
but in the comments just below that you have written # RGBA image has 4 channels.
If your input images have 4 channels, both the images and the model input_shape need to reflect this. A sketch of both options follows.
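As a rough sketch of the two fixes (my own illustration, reusing the functions defined in the question):

# Option 1: the inputs really are single-channel, so make the model expect 1 channel
model = UNetCompiled(input_size=(128, 128, 1), n_filters=32, n_classes=3)

# Option 2: the images are colour, so keep 3 channels in PreprocessData instead of
# adding a singleton channel axis:
#   X = np.zeros((m, i_h, i_w, 3), dtype=np.float32)
#   single_img = np.asarray(Image.open(path).convert('RGB').resize((i_h, i_w))) / 255.
#   X[index] = single_img
# and leave the model input_size at (128, 128, 3)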
I've been trying to get a CNN working on the Omniglot dataset (105 x 105 x 1 images) via two tutorials I found: CNN tutorial 1 and CNN tutorial 2, both working on the usual MNIST dataset (28 x 28 x 1 images).
I'm still struggling with a shaping conflict in the implementation (after a week of on-and-off debugging). No one has been able to help so far, but in the meantime I have debugged enough that I think I can give a better description of the shaping error.
Most of my code is as follows (I'm just skipping a few irrelevant bits here and there). My placeholders are defined like this:
x = tf.placeholder(tf.float32, shape=(None, 105, 105, 1) ) # placeholder for train data
y = tf.placeholder(tf.float32, shape=(None, 20) ) # placeholder for labels
lr = tf.placeholder(tf.float32,shape=(), name="learnRate") # for varying learning rates during training
The label dimension is 20 because there are 20 different characters per alphabet, so each label is a one-hot encoded vector of length 20.
From here my model is defined as follows (where I've commented the resulting dimensions of each output):
# weight and bias dimension definitions
self.ConFltSize = 3
self.ConOutSize = 7
self.weights = {
    'wc1': tf.Variable(tf.random_normal([self.ConFltSize, self.ConFltSize, 1, 32], stddev=0.01, name='W0')),
    'wc2': tf.Variable(tf.random_normal([self.ConFltSize, self.ConFltSize, 32, 64], stddev=0.01, name='W1')),
    'wc3': tf.Variable(tf.random_normal([self.ConFltSize, self.ConFltSize, 64, 128], stddev=0.01, name='W2')),
    'wd1': tf.Variable(tf.random_normal([self.ConOutSize * self.ConOutSize * 128, 128], stddev=0.01, name='W3')),
    'out': tf.Variable(tf.random_normal([128, self.InLabels.shape[1]], stddev=0.01, name='W4')),
}
self.biases = {
    'bc1': tf.Variable(tf.random_normal([32], stddev=0.01, name='B0')),
    'bc2': tf.Variable(tf.random_normal([64], stddev=0.01, name='B1')),
    'bc3': tf.Variable(tf.random_normal([128], stddev=0.01, name='B2')),
    'bd1': tf.Variable(tf.random_normal([128], stddev=0.01, name='B3')),
    'out': tf.Variable(tf.random_normal([self.InLabels.shape[1]], stddev=0.01, name='B4')),
}
# Model definition + shaping results
# x = provide the input data
# weights = dictionary variables for weights
# biases = dictionary variables for biases
def Architecture(self, x, weights, biases):
    conv1 = self.conv(x, weights['wc1'], biases['bc1'])      # convolution layer 1
    conv1 = self.maxPool(conv1)                              # max pool layer 1
    # out shape -> [None, 53, 53, 32]
    conv2 = self.conv(conv1, weights['wc2'], biases['bc2'])  # convolution layer 2
    conv2 = self.maxPool(conv2)                              # max pool layer 2
    # out shape -> [None, 27, 27, 64]
    conv3 = self.conv(conv2, weights['wc3'], biases['bc3'])  # convolution layer 3
    conv3 = self.maxPool(conv3)                              # max pool layer 3
    # out shape -> [None, 14, 14, 128]
    flayer = tf.reshape(conv3, [-1, weights['wd1'].shape[0]])  # flatten the output from the conv layers
    # for 7 x 7 x 128 this is -> [None, 6272]
    flayer = tf.add(tf.matmul(flayer, weights['wd1']), biases['bd1'])  # fully connected layer 1
    flayer = tf.nn.relu(flayer)
    # out shape -> [None, 128]
    out = tf.add(tf.matmul(flayer, weights['out']), biases['out'])  # do last set of output weight * vals + bias
    # out shape -> [None, 20]
    return out  # net input to output layer
Now, in my main program, I feed input data to the model in batches, basically with:
out = self.Architecture(x, self.weights, self.biases)  # Implement network architecture, and get output tensor (net input to output layer)

# normalize, softmax and entropy the net input, in comparison with provided labels
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=out, labels=y))  # cost function
optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr).minimize(cost)           # gradient descent optimizer
pred = tf.equal(tf.argmax(out, 1), tf.argmax(y, 1))   # output true / false if predicted value matches label
accuracy = tf.reduce_mean(tf.cast(pred, tf.float32))  # percentage value of correct predictions

for i in range(iters):
    [BX, _, BY, _] = batch.split(trainX, trainY, Bsize)  # random split in batch size
    # data shapes: BX -> [160, 105, 105, 1], BY -> [160, 20]
    # Code bombs out after feeding with input data
    opt = sess.run(optimizer, feed_dict={lr: learnr, x: BX, y: BY})
The exception I then get from the sess.run command is:
'logits and labels must be broadcastable: logits_size=[640,20] labels_size=[160,20]\n\t [[Node: softmax_cross_entropy_with_logits = SoftmaxCrossEntropyWithLogits[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](Add_1, softmax_cross_entropy_with_logits/Reshape_1)]]'
From this I gather that the softmax is getting [640, 20] as input while it expects [160, 20]. I do not understand how or where the data could end up shaped as [640, 20].
Am I missing something, or am I misinterpreting the error?
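For what it's worth, a small diagnostic sketch (my own suggestion, not part of the original code) is to print the static shape after each layer inside Architecture; tf.reshape with -1 silently folds any spatial-size mismatch into the leading (batch) dimension, which would explain a multiple of the batch size showing up at the loss:

def Architecture(self, x, weights, biases):
    conv1 = self.maxPool(self.conv(x, weights['wc1'], biases['bc1']))
    print('conv1:', conv1.get_shape().as_list())   # e.g. [None, 53, 53, 32]
    conv2 = self.maxPool(self.conv(conv1, weights['wc2'], biases['bc2']))
    print('conv2:', conv2.get_shape().as_list())   # e.g. [None, 27, 27, 64]
    conv3 = self.maxPool(self.conv(conv2, weights['wc3'], biases['bc3']))
    print('conv3:', conv3.get_shape().as_list())   # wd1 assumes this is [None, 7, 7, 128]
    flayer = tf.reshape(conv3, [-1, weights['wd1'].shape[0]])
    print('flat :', flayer.get_shape().as_list())  # if conv3 is larger, the -1 absorbs the difference
    # ... rest of the method unchanged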
In a classification problem with many classes, the tensorflow docs suggest using sampled_softmax_loss over a simple softmax to reduce training runtime.
According to the docs and source (line 1180), the call pattern for sampled_softmax_loss is:
tf.nn.sampled_softmax_loss(weights,                     # Shape (num_classes, dim)     - floatXX
                           biases,                      # Shape (num_classes)          - floatXX
                           labels,                      # Shape (batch_size, num_true) - int64
                           inputs,                      # Shape (batch_size, dim)      - floatXX
                           num_sampled,                 # - int
                           num_classes,                 # - int
                           num_true=1,
                           sampled_values=None,
                           remove_accidental_hits=True,
                           partition_strategy="mod",
                           name="sampled_softmax_loss")
It's unclear (at least to me) how to convert a real world problem into the shapes that this loss function requires. I think the 'inputs' field is the problem.
Here is a copy-paste-ready minimum working example that throws a matrix multiplication shape error when calling the loss function.
import tensorflow as tf
# Network Parameters
n_hidden_1 = 256 # 1st layer number of features
n_input = 784 # MNIST data input (img shape: 28*28)
n_classes = 10 # MNIST total classes (0-9 digits)
# Dependent & Independent Variable Placeholders
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes]) #
# Weights and Biases
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Super simple model builder
def tiny_perceptron(x, weights, biases):
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    out_layer = tf.matmul(layer_1, weights['out']) + biases['out']
    return out_layer
# Create the model
pred = tiny_perceptron(x, weights, biases)
# Set up loss function inputs and inspect their shapes
w = tf.transpose(weights['out'])
b = biases['out']
labels = tf.reshape(tf.argmax(y, 1), [-1,1])
inputs = pred
num_sampled = 3
num_true = 1
num_classes = n_classes
print('Shapes\n------\nw:\t%s\nb:\t%s\nlabels:\t%s\ninputs:\t%s' % (w.shape, b.shape, labels.shape, inputs.shape))
# Shapes
# ------
# w: (10, 256) # Requires (num_classes, dim) - CORRECT
# b: (10,) # Requires (num_classes) - CORRECT
# labels: (?, 1) # Requires (batch_size, num_true) - CORRECT
# inputs: (?, 10) # Requires (batch_size, dim) - Not sure
loss_function = tf.reduce_mean(tf.nn.sampled_softmax_loss(
    weights=w,
    biases=b,
    labels=labels,
    inputs=inputs,
    num_sampled=num_sampled,
    num_true=num_true,
    num_classes=num_classes))
The final line triggers a ValueError, stating that you can't multiply tensors with shape (?,10) and (?,256). As a general rule, I'd agree with that statement. Full error shown below:
ValueError: Dimensions must be equal, but are 10 and 256 for 'sampled_softmax_loss_2/MatMul_1' (op: 'MatMul') with input shapes: [?,10], [?,256].
If the 'dim' value from tensorflow docs is intended to be constant, either the 'weights' or 'inputs' variables going into the loss function are incorrect.
Any thoughts would be awesome, I'm totally stumped on how to use this loss function correctly & it would have a huge impact on training time for the model we're using it for (500k classes). Thanks!
---EDIT---
It is possible to get the sample shown above to run without errors by playing with parameters and ignoring the sampled_softmax_loss call pattern's expected inputs. If you do that, it results in a trainable model that has 0 impact on prediction accuracy (as you would expect).
In your softmax layer you are multiplying your network predictions, which have dimension (num_classes,), by your w matrix, which has dimension (num_classes, num_hidden_1), so you end up trying to compare your target labels of size (num_classes,) to something that is now of size (num_hidden_1,). Change your tiny perceptron to output layer_1 instead, then change the definition of your cost. The code below might do the trick.
def tiny_perceptron(x, weights, biases):
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    return layer_1

layer_1 = tiny_perceptron(x, weights, biases)
loss_function = tf.reduce_mean(tf.nn.sampled_softmax_loss(
    weights=tf.transpose(weights['out']),  # (num_classes, dim) = (10, 256), as the loss requires
    biases=biases['out'],                  # (num_classes,)
    labels=labels,
    inputs=layer_1,                        # (batch_size, dim) hidden activations
    num_sampled=num_sampled,
    num_true=num_true,
    num_classes=num_classes))
When you train your network with some optimizer, you will tell it to minimize loss_function, which should mean that it will adjust both sets of weights and biases.
The key point is to pass the right shapes of weights, biases, inputs and labels. The shape of the weights passed to sampled_softmax_loss is not the same as in the general situation.
For example, with logits = xw + b, call sampled_softmax like this:
sampled_softmax(weights=tf.transpose(w), biases=b, inputs=x), NOT sampled_softmax(weights=w, biases=b, inputs=logits)!
Besides, the labels are not a one-hot representation. If your labels are one-hot encoded, pass labels=tf.reshape(tf.argmax(labels_one_hot, 1), [-1,1])
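Putting the two answers together, here is a minimal end-to-end sketch (my own illustration, assuming the MNIST-sized network from the question): the loss takes the transposed output-layer weights of shape (num_classes, dim), the hidden activations as inputs, and plain class indices rather than one-hot labels; the full softmax is kept for evaluation.

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])          # one-hot labels

w_h = tf.Variable(tf.random_normal([784, 256]))
b_h = tf.Variable(tf.random_normal([256]))
w_out = tf.Variable(tf.random_normal([256, 10]))
b_out = tf.Variable(tf.random_normal([10]))

hidden = tf.nn.relu(tf.matmul(x, w_h) + b_h)        # (batch_size, dim) with dim = 256

train_loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
    weights=tf.transpose(w_out),                    # (num_classes, dim) = (10, 256)
    biases=b_out,                                   # (num_classes,)
    labels=tf.reshape(tf.argmax(y, 1), [-1, 1]),    # (batch_size, 1) class indices, not one-hot
    inputs=hidden,                                  # (batch_size, dim)
    num_sampled=3,
    num_classes=10))

# At evaluation time, use the full softmax over the same output layer
logits = tf.matmul(hidden, w_out) + b_out
eval_loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y))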
I am implementing an RNN and, contrary to the examples I have found, which minimize the cost only for the output at the last time step, I want to minimize it for the outputs at all time steps.
x = tf.placeholder ("float", [features_dimension, None, n_timesteps])
y = tf.placeholder ("float", [labels_dimension, None, n_timesteps])
# Define weights
weights = {'out': tf.Variable (tf.random_normal ([N_HIDDEN, labels_dimension]))}
biases = {'out': tf.Variable (tf.random_normal ([labels_dimension]))}
def RNN(x, weights, biases):
    # Prepare data shape to match `rnn` function requirements
    # Current data input shape: (features_dimension, BATCH_SIZE, n_timesteps)
    # Required shape: `n_timesteps` tensors list of shape (BATCH_SIZE, features_dimension)
    # We make a division of the data to split it in individual vectors that
    # will be fed for each timestep
    # Permuting features_dimension and n_timesteps
    # Shape will be (n_timesteps, BATCH_SIZE, features_dimension)
    x = tf.transpose(x, [2, 1, 0])
    # Reshaping to (BATCH_SIZE*n_timesteps, features_dimension) (we are removing the depth dimension with this)
    x = tf.reshape(x, [BATCH_SIZE*n_timesteps, features_dimension])
    # Split the previous 2D tensor to get a list of `n_timesteps` tensors of
    # shape (batch_size, features_dimension).
    x = tf.split(x, n_timesteps, 0)
    # Define a lstm cell with tensorflow
    lstm_cell = rnn.BasicLSTMCell(N_HIDDEN, forget_bias=1.0)
    # Get lstm cell output
    outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
    # Linear activation; outputs contains the array of outputs for all the
    # timesteps
    pred = tf.matmul(outputs, weights['out']) + biases['out']
However, the object outputs is a list of Tensors with n_timesteps elements, so pred = tf.matmul(outputs, weights['out']) + biases['out'] throws the error
ValueError: Shape must be rank 2 but is rank 3 for 'MatMul' (op: 'MatMul') with input shapes: [100,128,16], [16,1].
How can I do this multiplication?
The solution is to tf.stack the list of tensors into a 3D tensor and then use tf.map_fn to apply the multiplication to each 2D tensor along dimension 0:
# Transform the list into a 3D tensor with dimensions (n_timesteps, batch_size, N_HIDDEN)
outputs = tf.stack(outputs)

def pred_fn(current_output):
    return tf.matmul(current_output, weights['out']) + biases['out']

# Use tf.map_fn to apply pred_fn to each tensor in outputs, along dimension 0 (the timestep dimension)
pred = tf.map_fn(pred_fn, outputs)
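An equivalent alternative (my own sketch, not from the original answer) is to stack the outputs, collapse the time and batch dimensions into one, do a single matmul, and reshape back, which avoids the per-step map_fn:

outputs = tf.stack(outputs)                                   # (n_timesteps, BATCH_SIZE, N_HIDDEN)
flat = tf.reshape(outputs, [-1, N_HIDDEN])                    # (n_timesteps*BATCH_SIZE, N_HIDDEN)
pred = tf.matmul(flat, weights['out']) + biases['out']        # (n_timesteps*BATCH_SIZE, labels_dimension)
pred = tf.reshape(pred, [n_timesteps, -1, labels_dimension])  # (n_timesteps, BATCH_SIZE, labels_dimension)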
I find two kinds of implementations of RNN in tensorflow.
The first implementation is this (from line 124 to 129). It uses a loop to define each step of input in the RNN.
with tf.variable_scope("RNN"):
    for time_step in range(num_steps):
        if time_step > 0: tf.get_variable_scope().reuse_variables()
        (cell_output, state) = cell(inputs[:, time_step, :], state)
        outputs.append(cell_output)
        states.append(state)
The second implementation is this (from line 51 to 70). It doesn't use any loop to define each step of input in RNN.
def RNN(_X, _istate, _weights, _biases):
    # input shape: (batch_size, n_steps, n_input)
    _X = tf.transpose(_X, [1, 0, 2])  # permute n_steps and batch_size
    # Reshape to prepare input to hidden activation
    _X = tf.reshape(_X, [-1, n_input])  # (n_steps*batch_size, n_input)
    # Linear activation
    _X = tf.matmul(_X, _weights['hidden']) + _biases['hidden']
    # Define a lstm cell with tensorflow
    lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0)
    # Split data because rnn cell needs a list of inputs for the RNN inner loop
    _X = tf.split(0, n_steps, _X)  # n_steps * (batch_size, n_hidden)
    # Get lstm cell output
    outputs, states = rnn.rnn(lstm_cell, _X, initial_state=_istate)
    # Linear activation
    # Get inner loop last output
    return tf.matmul(outputs[-1], _weights['out']) + _biases['out']
In the first implementation, I find there is no weight matrix between the input units and the hidden units; only the weight matrix between the hidden units and the output units is defined (from line 132 to 133):
output = tf.reshape(tf.concat(1, outputs), [-1, size])
softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
softmax_b = tf.get_variable("softmax_b", [vocab_size])
logits = tf.matmul(output, softmax_w) + softmax_b
But in the second implementation, both of the weight matrices are defined (from line 42 to 47).
weights = {
    'hidden': tf.Variable(tf.random_normal([n_input, n_hidden])),  # Hidden layer weights
    'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))
}
biases = {
    'hidden': tf.Variable(tf.random_normal([n_hidden])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}
I wonder why?
The difference I noticed is that the code in the second implementation uses tf.nn.rnn, which takes a list of inputs (one per time step) and generates a list of outputs (one per time step).
(Inputs: A length T list of inputs, each a tensor of shape [batch_size, input_size].)
So, if you check the code in the second implementation, on line 62 the input data is shaped into n_steps * (batch_size, n_hidden):
# Split data because rnn cell needs a list of inputs for the RNN inner loop
_X = tf.split(0, n_steps, _X) # n_steps * (batch_size, n_hidden)
In the 1st implementation they loop through the n_time_steps, provide the input, get the corresponding output, and store it in the outputs list.
Code snippet from line 113 to 117
outputs = []
state = self._initial_state
with tf.variable_scope("RNN"):
    for time_step in range(num_steps):
        if time_step > 0: tf.get_variable_scope().reuse_variables()
        (cell_output, state) = cell(inputs[:, time_step, :], state)
        outputs.append(cell_output)
Coming to your second question:
Notice carefully the way the inputs are fed to the RNN in the two implementations.
In the first implementation the inputs are already of shape batch_size x num_steps (here num_steps is hidden size):
self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
Whereas in the second implementation the initial inputs are of shape (batch_size x n_steps x n_input), so a weight matrix is required to transform them to the shape (n_steps x batch_size x hidden_size); a concrete shape trace follows the snippet:
# Input shape: (batch_size, n_steps, n_input)
_X = tf.transpose(_X, [1, 0, 2]) # Permute n_steps and batch_size
# Reshape to prepare input to hidden activation
_X = tf.reshape(_X, [-1, n_input]) # (n_steps*batch_size, n_input)
# Linear activation
_X = tf.matmul(_X, _weights['hidden']) + _biases['hidden']
# Split data because rnn cell needs a list of inputs for the RNN inner loop
_X = tf.split(0, n_steps, _X) # n_steps * (batch_size, n_hidden)
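To make the shapes concrete, here is a small trace of that projection (my own example numbers, not from the linked code), with batch_size=32, n_steps=10, n_input=28, n_hidden=128:

# _X: (32, 10, 28)   input
#  -> tf.transpose   -> (10, 32, 28)
#  -> tf.reshape     -> (320, 28)
#  -> matmul with the (28, 128) 'hidden' weights -> (320, 128)
#  -> tf.split       -> a list of 10 tensors, each of shape (32, 128)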
I hope this is helpful...