TensorFlow r0.12's documentation for tf.nn.rnn_cell.LSTMCell describes its call signature as:
tf.nn.rnn_cell.LSTMCell.__call__(inputs, state, scope=None)
where state is as follows:
state: if state_is_tuple is False, this must be a state Tensor, 2-D, batch x state_size. If state_is_tuple is True, this must be a tuple of state Tensors, both 2-D, with column sizes c_state and m_state.
What are c_state and m_state, and how do they fit into LSTMs? I cannot find a reference to them anywhere in the documentation.
Here is a link to that page in the documentation.
I agree that the documentation is unclear. Looking at the source of tf.nn.rnn_cell.LSTMCell.__call__ clarifies things (I took the code from TensorFlow 1.0.0):
def __call__(self, inputs, state, scope=None):
"""Run one step of LSTM.
Args:
inputs: input Tensor, 2D, batch x num_units.
state: if `state_is_tuple` is False, this must be a state Tensor,
`2-D, batch x state_size`. If `state_is_tuple` is True, this must be a
tuple of state Tensors, both `2-D`, with column sizes `c_state` and
`m_state`.
scope: VariableScope for the created subgraph; defaults to "lstm_cell".
Returns:
A tuple containing:
- A `2-D, [batch x output_dim]`, Tensor representing the output of the
LSTM after reading `inputs` when previous state was `state`.
Here output_dim is:
num_proj if num_proj was set,
num_units otherwise.
- Tensor(s) representing the new state of LSTM after reading `inputs` when
the previous state was `state`. Same type and shape(s) as `state`.
Raises:
ValueError: If input size cannot be inferred from inputs via
static shape inference.
"""
num_proj = self._num_units if self._num_proj is None else self._num_proj
if self._state_is_tuple:
(c_prev, m_prev) = state
else:
c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])
dtype = inputs.dtype
input_size = inputs.get_shape().with_rank(2)[1]
if input_size.value is None:
raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
with vs.variable_scope(scope or "lstm_cell",
initializer=self._initializer) as unit_scope:
if self._num_unit_shards is not None:
unit_scope.set_partitioner(
partitioned_variables.fixed_size_partitioner(
self._num_unit_shards))
# i = input_gate, j = new_input, f = forget_gate, o = output_gate
lstm_matrix = _linear([inputs, m_prev], 4 * self._num_units, bias=True,
scope=scope)
i, j, f, o = array_ops.split(
value=lstm_matrix, num_or_size_splits=4, axis=1)
# Diagonal connections
if self._use_peepholes:
with vs.variable_scope(unit_scope) as projection_scope:
if self._num_unit_shards is not None:
projection_scope.set_partitioner(None)
w_f_diag = vs.get_variable(
"w_f_diag", shape=[self._num_units], dtype=dtype)
w_i_diag = vs.get_variable(
"w_i_diag", shape=[self._num_units], dtype=dtype)
w_o_diag = vs.get_variable(
"w_o_diag", shape=[self._num_units], dtype=dtype)
if self._use_peepholes:
c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
sigmoid(i + w_i_diag * c_prev) * self._activation(j))
else:
c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
self._activation(j))
if self._cell_clip is not None:
# pylint: disable=invalid-unary-operand-type
c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
# pylint: enable=invalid-unary-operand-type
if self._use_peepholes:
m = sigmoid(o + w_o_diag * c) * self._activation(c)
else:
m = sigmoid(o) * self._activation(c)
if self._num_proj is not None:
with vs.variable_scope("projection") as proj_scope:
if self._num_proj_shards is not None:
proj_scope.set_partitioner(
partitioned_variables.fixed_size_partitioner(
self._num_proj_shards))
m = _linear(m, self._num_proj, bias=False, scope=scope)
if self._proj_clip is not None:
# pylint: disable=invalid-unary-operand-type
m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
# pylint: enable=invalid-unary-operand-type
new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else
array_ops.concat([c, m], 1))
return m, new_state
The key lines are:
c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
self._activation(j))
and
m = sigmoid(o) * self._activation(c)
and
new_state = (LSTMStateTuple(c, m)
If you compare the code that computes c and m with the LSTM equations (see below), you can see that they correspond to the cell state (typically denoted c) and the hidden state (typically denoted h), respectively:
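For reference, here is the standard textbook form of the LSTM equations, written in the same notation as the code above (this is a reconstruction of the usual formulation, not the exact figure; note that TensorFlow additionally adds forget_bias inside the forget-gate sigmoid, and all products are element-wise):
i_t = sigmoid(W_i [x_t, h_{t-1}] + b_i)    # input gate        -> code's i
j_t = tanh(W_j [x_t, h_{t-1}] + b_j)       # candidate input   -> code's j
f_t = sigmoid(W_f [x_t, h_{t-1}] + b_f)    # forget gate       -> code's f
o_t = sigmoid(W_o [x_t, h_{t-1}] + b_o)    # output gate       -> code's o
c_t = f_t * c_{t-1} + i_t * j_t            # cell state        -> code's c
h_t = o_t * tanh(c_t)                      # hidden state      -> code's m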
new_state = (LSTMStateTuple(c, m) indicates that the first element of the returned state tuple is c (the cell state, a.k.a. c_state), and the second element is m (the hidden state, a.k.a. m_state).
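A small usage sketch (TF 1.x; the placeholder shape and unit count here are made up for illustration) showing where the two state tensors end up when you run a whole sequence:
import tensorflow as tf

cell = tf.nn.rnn_cell.LSTMCell(num_units=8, state_is_tuple=True)
inputs = tf.placeholder(tf.float32, [None, 10, 4])   # batch x time x features
outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
c_state = final_state.c   # cell state ("c_state" in the docs)
m_state = final_state.h   # hidden state / output ("m_state" in the docs)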
I've stumbled upon the same question; here's how I understand it. A minimalistic LSTM example:
import tensorflow as tf
sample_input = tf.constant([[1,2,3]],dtype=tf.float32)
LSTM_CELL_SIZE = 2
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(LSTM_CELL_SIZE, state_is_tuple=True)
state = (tf.zeros([1,LSTM_CELL_SIZE]),)*2
output, state_new = lstm_cell(sample_input, state)
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)
print(sess.run(output))
Notice that state_is_tuple=True, so when passing state to this cell, it needs to be in tuple form. c_state and m_state are probably "Memory State" and "Cell State", though I honestly am NOT sure, as these terms are mentioned only in the docs. In code and papers about LSTMs, the letters h and c are commonly used to denote the "output value" and the "cell state".
http://colah.github.io/posts/2015-08-Understanding-LSTMs/
Those tensors represent the combined internal state of the cell and should be passed together. The old way to do this was to simply concatenate them; the new way is to use tuples.
OLD WAY:
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(LSTM_CELL_SIZE, state_is_tuple=False)
state = tf.zeros([1,LSTM_CELL_SIZE*2])
output, state_new = lstm_cell(sample_input, state)
NEW WAY:
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(LSTM_CELL_SIZE, state_is_tuple=True)
state = (tf.zeros([1,LSTM_CELL_SIZE]),)*2
output, state_new = lstm_cell(sample_input, state)
So basically all we did is change the state from being one tensor of length 4 into two tensors of length 2. The content remains the same: [0,0,0,0] becomes ([0,0],[0,0]). (This is supposed to make it faster.)
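A minimal sketch of the two formats side by side (TF 1.x API, batch size 1 as above):
import tensorflow as tf

LSTM_CELL_SIZE = 2
# tuple form: (c_state, m_state), each of shape [batch, LSTM_CELL_SIZE]
state_tuple = (tf.zeros([1, LSTM_CELL_SIZE]), tf.zeros([1, LSTM_CELL_SIZE]))
# concatenated form: a single [batch, 2 * LSTM_CELL_SIZE] tensor, i.e. [c | m]
state_concat = tf.concat(state_tuple, axis=1)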
Maybe this excerpt from the code will help
def __call__(self, inputs, state, scope=None):
"""Long short-term memory cell (LSTM)."""
with vs.variable_scope(scope or type(self).__name__): # "BasicLSTMCell"
# Parameters of gates are concatenated into one multiply for efficiency.
if self._state_is_tuple:
c, h = state
else:
c, h = array_ops.split(1, 2, state)
concat = _linear([inputs, h], 4 * self._num_units, True)
# i = input_gate, j = new_input, f = forget_gate, o = output_gate
i, j, f, o = array_ops.split(1, 4, concat)
new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
self._activation(j))
new_h = self._activation(new_c) * sigmoid(o)
if self._state_is_tuple:
new_state = LSTMStateTuple(new_c, new_h)
else:
new_state = array_ops.concat(1, [new_c, new_h])
return new_h, new_state
https://github.com/tensorflow/tensorflow/blob/r1.2/tensorflow/python/ops/rnn_cell_impl.py
Line #308 - 314
class LSTMStateTuple(_LSTMStateTuple):
"""Tuple used by LSTM Cells for state_size, zero_state, and output state.
Stores two elements: (c, h), in that order.
Only used when state_is_tuple=True.
"""
I want to implement the backpropagation concept in Python with the following code:
class MLP(object):
def __init__(self, num_inputs=3, hidden_layers=[3, 3], num_outputs=2):
self.num_inputs = num_inputs
self.hidden_layers = hidden_layers
self.num_outputs = num_outputs
layers = [num_inputs] + hidden_layers + [num_outputs]
weights = []
bias = []
for i in range(len(layers) - 1):
w = np.random.rand(layers[i], layers[i + 1])
b=np.random.randn(layers[i+1]).reshape(1, layers[i+1])
weights.append(w)
bias.append(b)
self.weights = weights
self.bias = bias
activations = []
for i in range(len(layers)):
a = np.zeros(layers[i])
activations.append(a)
self.activations = activations
def forward_propagate(self, inputs):
activations = inputs
self.activations[0] = activations
for i, w in enumerate(self.weights):
for j, b in enumerate(self.bias):
net_inputs = self._sigmoid((np.dot(activations, w)+b))
self.activations[i + 1] = net_inputs
return activations
def train(self, inputs, targets, epochs, learning_rate):
for i in range(epochs):
sum_errors = 0
for j, input in enumerate(inputs):
target = targets[j]
output = self.forward_propagate(input)
def _sigmoid(self, x):
y = 1.0 / (1 + np.exp(-x))
return y
So I created the following dummy data in order to verify everything is correct:
items = np.array([[random()/2 for _ in range(2)] for _ in range(1000)])
targets = np.array([[i[0] + i[1]] for i in items])
mlp = MLP(2, [5], 1)
mlp.train(items, targets, 2, 0.1)
but when I run the code I get the following error:
ValueError: shapes (2,) and (5,1) not aligned: 2 (dim 0) != 5 (dim 0)
I understand the error, but how do I solve it?
There are a couple of major problems with forward_propagate:
change net_inputs to activations - otherwise you always compute and return the activations of the first layer
remove the inner loop for j, b in enumerate(self.bias): - the biases of the other layers have no business here
use matmul instead of dot
so, something like
for i, w in enumerate(self.weights):
activations = self._sigmoid((np.matmul(activations, w)+self.bias[i]))
self.activations[i + 1] = activations
return activations
Also, be careful to note that this method receives a 1-D array, which turns into a matrix (a 2-D array) after the first matmul. These matrices are stored in self.activations, and a matrix is returned from the method.
This might or might not be what you want.
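For illustration, here is a possible forward_propagate with those fixes applied (a sketch against the MLP class from the question, assuming numpy is imported as np; the np.atleast_2d call is my addition to keep every stored activation 2-D):
def forward_propagate(self, inputs):
    # promote the 1-D sample to shape (1, num_inputs) so each layer sees a 2-D batch
    activations = np.atleast_2d(inputs)
    self.activations[0] = activations
    for i, w in enumerate(self.weights):
        # one weight matrix and its matching bias per layer
        activations = self._sigmoid(np.matmul(activations, w) + self.bias[i])
        self.activations[i + 1] = activations
    return activations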
TLDR
The input shape of each sample for my DoubleDuelingDQN is (169, 31). The output of that DDDQN shall be of shape (3) for the 3 corresponding actions. Currently, when I call
next_qs_list = self.target_network(next_states).numpy()
..the output shape is (64, 169, 3) for batch_size=64. My assumption is, that the output shape is wrong and should be (64, 3).
My NN is currently configured as below (where its call() function returns the wrong shape). How would I need to build my network to return the correct shape (3) instead of (169, 3)?
class DuelingDeepQNetwork(keras.Model):
def __init__(self, n_actions, neurons_1, neurons_2, neurons_3=None):
super(DuelingDeepQNetwork, self).__init__()
self.dens_1 = keras.layers.Dense(neurons_1, activation='relu', input_dim=(169,31,)) # Here I added input_dim which is not present in my LunarLander Agent
self.dens_2 = keras.layers.Dense(neurons_2, activation='relu')
if neurons_3:
self.dens_3 = keras.layers.Dense(neurons_3, activation='relu')
self.V = keras.layers.Dense(1, activation=None) # Value layer
self.A = keras.layers.Dense(n_actions, activation=None) # Advantage layer
def call(self, state):
x = self.dens_1(state)
x = self.dens_2(x)
if self.dens_3:
x = self.dens_3(x)
V = self.V(x)
A = self.A(x)
Q = V + (A - tf.math.reduce_mean(A, axis=1, keepdims=True))
return Q
def advantage(self, state):
x = self.dens_1(state)
x = self.dens_2(x)
if self.dens_3:
x = self.dens_3(x)
A = self.A(x)
return A
Updated error message:
ValueError: non-broadcastable output operand with shape (3,) doesn't match the broadcast shape (3,3)
is raised on the last line of:
for idx, done in enumerate(dones):
target_qs_list[idx, actions[idx]] = rewards[idx]
tmp1 = self.gamma * next_qs_list[idx, max_actions[idx]]
target_qs_list[idx, actions[idx]] += tmp1 * (1-int(dones[idx]))
Initial Post:
I have (kind of) finished my custom RL environment respecting the OpenAI Gym concept. Basically, the environment is a time series of OHLCV crypto prices, and env.reset() returns a window of shape (169, 31): 169 time steps and 31 features. With env.step() the agent's observation window advances by one time step. I want to start with 3 possible actions (do nothing / buy / sell):
self.action_space = spaces.Discrete(3)
self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.HISTORY_LENGTH+1, self.df_sample.shape[1]), dtype=np.float32)
# (169, 31)
Now I am failing at migrating my existing DQN agent from LunarLander-v2 (created by following multiple tutorials on YouTube and Medium). I assume that my DQNetwork and/or MemoryBuffer are not formatted correctly. I begin by filling up my memory with 1,000 samples from random actions. Then training begins, and on the agent.learn() call the following error is raised, which I am unable to interpret and which is the reason for this request for help.
TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got <tf.Tensor: shape=(3,), dtype=int64, numpy=array([144, 165, 2])>
(deleted obsolete text)
To be exact, this update loop raises the error:
for idx, done in enumerate(dones):
target_qs_list[idx, actions[idx]] = rewards[idx] + self.gamma * next_qs_list[idx, max_actions[idx]] * (1-int(dones[idx]))
Since my debugging skills and my knowledge of Python, Keras and TF have reached their limits here, I appreciate any help.
Here is the code of my Agent. If more code or information is needed, I will happily provide more info.
class ReplayBuffer():
def __init__(self, max_mem_size, dims):
# self.memory = max_mem_size
self.state_memory = np.zeros((max_mem_size, *dims), dtype=np.float32) # Here I added "*" to unpack the now 2-D observation shape (169, 31)
self.action_memory = np.zeros(max_mem_size, dtype=np.int32)
self.reward_memory = np.zeros(max_mem_size, dtype=np.float32)
self.new_state_memory = np.zeros((max_mem_size, *dims), dtype=np.float32) # Here I added "*" to unpack the now 2-D observation shape (169, 31)
self.done_memory = np.zeros(max_mem_size, dtype=np.int32)
self.max_mem_size = max_mem_size
self.mem_counter = 0
self.index = 0
def store_transition(self, transition):
'''
:param transition: Tuple of transition data (state, action, reward, new_state, done)
:return: Nothing
'''
self.state_memory[self.index] = transition[0]
self.action_memory[self.index] = transition[1]
self.reward_memory[self.index] = transition[2]
self.new_state_memory[self.index] = transition[3]
self.done_memory[self.index] = transition[4]
self.mem_counter += 1
if self.index < self.max_mem_size - 1:
self.index += 1
else:
self.index = 0
def get_sample_batch(self, batch_size, replace=False):
'''
:param batch_size: Number of samples for batch
:param replace: Whether or not duplicate entries are allowed in the returned batch
:return: Tuples of transition data (state, action, reward, new_state, done)
'''
max_size = min(self.mem_counter, self.max_mem_size)
batch_ids = np.random.default_rng().choice(max_size, batch_size, replace)
states = self.state_memory[batch_ids]
actions = self.action_memory[batch_ids]
rewards = self.reward_memory[batch_ids]
new_states = self.new_state_memory[batch_ids]
dones = self.done_memory[batch_ids]
return states, actions, rewards, new_states, dones
class DuelingDeepQNAgent():
def __init__(self, lr, gamma, env, batch_size=64, mem_size=1_000_000, update_target_every=50):
self.n_actions = env.action_space.n
self.input_dims = env.observation_space.shape #env.observation_space.shape[0]
self.action_space = [i for i in range(self.n_actions)]
self.gamma = gamma
self.epsilon = 1.0
self.batch_size = batch_size
self.memory = ReplayBuffer(max_mem_size=mem_size, dims=self.input_dims)
self.update_target_every = update_target_every
self.update_target_counter = 0
self.learn_step_counter = 0
# Main model - gets trained every single step()
self.q_network = DuelingDeepQNetwork(n_actions=self.n_actions, neurons_1=256, neurons_2=256, neurons_3=128)
self.target_network = DuelingDeepQNetwork(n_actions=self.n_actions, neurons_1=256, neurons_2=256, neurons_3=128)
self.q_network.compile(optimizer=Adam(learning_rate=lr), loss='mse')
self.target_network.compile(optimizer=Adam(learning_rate=lr), loss='mse')
def store_transition(self, transition):
self.memory.store_transition(transition)
def choose_action(self, observation):
if np.random.random() < self.epsilon:
action = np.random.choice(self.action_space)
else:
state = np.array([observation]) # Add in an extra dimension -> effectively adding a "batch dimension"
q_values = self.q_network.advantage(state)
action = tf.math.argmax(q_values, axis=1).numpy()[0]
return action
def learn(self):
if self.memory.mem_counter < self.batch_size:
return
if self.update_target_counter % self.update_target_every == 0:
self.target_network.set_weights(self.q_network.get_weights())
current_states, actions, rewards, next_states, dones = self.memory.get_sample_batch(self.batch_size)
current_qs_list = self.q_network(current_states)
next_qs_list = self.target_network(next_states)
target_qs_list = current_qs_list.numpy() # ??? From Tensor to Numpy?!
max_actions = tf.math.argmax(self.q_network(next_states), axis=1)
# According to Phil: improve on my solution here....
for idx, done in enumerate(dones):
target_qs_list[idx, actions[idx]] = rewards[idx] + self.gamma * next_qs_list[idx, max_actions[idx]] * (1-int(dones[idx]))
self.q_network.fit(current_states, target_qs_list, batch_size=self.batch_size, verbose=0)
self.learn_step_counter += 1
You can't use Dense layers directly on 2-D samples (i.e. inputs that are 3-D once the batch dimension is included): a Dense layer only operates on the last dimension, so the other dimensions pass through unchanged. Dense layers expect 1-D sample vectors, provided as a 2-D batch.
Either add a Flatten() layer to flatten your input (e.g. to a (batch_size, 169*31) tensor), or use a convolutional network.
Also, your self.A layer appears to provide the Q-value for each action given a state. The advantage would be the action values minus the value estimate; I think that's what you're calculating in call().
If the flatten doesn't fix your output shape, I'd double check the shapes of the arguments for Q, and the shape of the returned Q.
print(Q.shape) will be your friend for debugging!
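To make that concrete, here is a minimal, hypothetical sketch of a dueling head with a Flatten layer in front (tf.keras; the class name and sizes are illustrative, not the asker's exact code), which should produce Q of shape (batch, n_actions):
import tensorflow as tf
from tensorflow import keras

class DuelingDeepQNetworkFlat(keras.Model):
    def __init__(self, n_actions, neurons_1, neurons_2):
        super().__init__()
        self.flat = keras.layers.Flatten()                        # (batch, 169, 31) -> (batch, 169 * 31)
        self.dens_1 = keras.layers.Dense(neurons_1, activation='relu')
        self.dens_2 = keras.layers.Dense(neurons_2, activation='relu')
        self.V = keras.layers.Dense(1, activation=None)           # state value
        self.A = keras.layers.Dense(n_actions, activation=None)   # advantages

    def call(self, state):
        x = self.dens_2(self.dens_1(self.flat(state)))
        V = self.V(x)
        A = self.A(x)
        return V + (A - tf.reduce_mean(A, axis=1, keepdims=True))

# quick shape check:
# net = DuelingDeepQNetworkFlat(n_actions=3, neurons_1=256, neurons_2=256)
# print(net(tf.zeros([64, 169, 31])).shape)   # expected: (64, 3)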
I am trying to reproduce the results generated by the LSTMCell from TensorFlow to be sure I know what it does.
Here is my TensorFlow code:
num_units = 3
lstm = tf.nn.rnn_cell.LSTMCell(num_units = num_units)
timesteps = 7
num_input = 4
X = tf.placeholder("float", [None, timesteps, num_input])
x = tf.unstack(X, timesteps, 1)
outputs, states = tf.contrib.rnn.static_rnn(lstm, x, dtype=tf.float32)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
x_val = np.random.normal(size = (1, 7, num_input))
res = sess.run(outputs, feed_dict = {X:x_val})
for e in res:
print(e)
Here is its output:
[[-0.13285545 -0.13569424 -0.23993783]]
[[-0.04818152 0.05927373 0.2558436 ]]
[[-0.13818116 -0.13837864 -0.15348436]]
[[-0.232219 0.08512601 0.05254192]]
[[-0.20371495 -0.14795329 -0.2261929 ]]
[[-0.10371902 -0.0263292 -0.0914975 ]]
[[0.00286371 0.16377522 0.059478 ]]
And here is my own implementation:
n_steps, _ = X.shape
h = np.zeros(shape = self.hid_dim)
c = np.zeros(shape = self.hid_dim)
for i in range(n_steps):
x = X[i,:]
vec = np.concatenate([x, h])
#vec = np.concatenate([h, x])
gs = np.dot(vec, self.kernel) + self.bias
g1 = gs[0*self.hid_dim : 1*self.hid_dim]
g2 = gs[1*self.hid_dim : 2*self.hid_dim]
g3 = gs[2*self.hid_dim : 3*self.hid_dim]
g4 = gs[3*self.hid_dim : 4*self.hid_dim]
I = vsigmoid(g1)
N = np.tanh(g2)
F = vsigmoid(g3)
O = vsigmoid(g4)
c = c*F + I*N
h = O * np.tanh(c)
print(h)
And here is its output:
[-0.13285543 -0.13569425 -0.23993781]
[-0.01461723 0.08060743 0.30876374]
[-0.13142865 -0.14921292 -0.16898363]
[-0.09892188 0.11739943 0.08772941]
[-0.15569218 -0.15165766 -0.21918869]
[-0.0480604 -0.00918626 -0.06084118]
[0.0963612 0.1876516 0.11888081]
As you might notice I was able to reproduce the first hidden vector, but the second one and all the following ones are different. What am I missing?
I examined this link, and your code is almost perfect, but you forgot to add the forget_bias value (default 1.0) in this line: F = vsigmoid(g3). It should actually be F = vsigmoid(g3 + self.forget_bias), or in your case, since it's 1: F = vsigmoid(g3 + 1).
Here is my implementation with numpy:
import numpy as np
import tensorflow as tf
num_units = 3
lstm = tf.nn.rnn_cell.LSTMCell(num_units = num_units)
batch=1
timesteps = 7
num_input = 4
X = tf.placeholder("float", [batch, timesteps, num_input])
x = tf.unstack(X, timesteps, 1)
outputs, states = tf.contrib.rnn.static_rnn(lstm, x, dtype=tf.float32)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
x_val = np.reshape(range(28),[batch, timesteps, num_input])
res = sess.run(outputs, feed_dict = {X:x_val})
for e in res:
print(e)
print("\nmy imp\n")
#my impl
def sigmoid(x):
return 1/(1+np.exp(-x))
kernel,bias=sess.run([lstm._kernel,lstm._bias])
f_b_=lstm._forget_bias
c,h=np.zeros([batch,num_input-1]),np.zeros([batch,num_input-1])
for step in range(timesteps):
inpt=np.split(x_val,7,1)[step][0]
lstm_mtrx=np.matmul(np.concatenate([inpt,h],1),kernel)+bias
i,j,f,o=np.split(lstm_mtrx,4,1)
c=sigmoid(f+f_b_)*c+sigmoid(i)*np.tanh(j)
h=sigmoid(o)*np.tanh(c)
print(h)
output:
[[ 0.06964055 -0.06541953 -0.00682676]]
[[ 0.005264 -0.03234607 0.00014838]]
[[ 1.617855e-04 -1.316892e-02 8.596722e-06]]
[[ 3.9425286e-06 -5.1347450e-03 7.5078127e-08]]
[[ 8.7508155e-08 -1.9560163e-03 6.3853928e-10]]
[[ 1.8867894e-09 -7.3784427e-04 5.8551406e-12]]
[[ 4.0385355e-11 -2.7728223e-04 5.3957669e-14]]
my imp
[[ 0.06964057 -0.06541953 -0.00682676]]
[[ 0.005264 -0.03234607 0.00014838]]
[[ 1.61785520e-04 -1.31689185e-02 8.59672610e-06]]
[[ 3.94252745e-06 -5.13474567e-03 7.50781122e-08]]
[[ 8.75080644e-08 -1.95601574e-03 6.38539112e-10]]
[[ 1.88678843e-09 -7.37844070e-04 5.85513438e-12]]
[[ 4.03853841e-11 -2.77282006e-04 5.39576024e-14]]
TensorFlow uses the glorot_uniform() function to initialize the LSTM kernel, which samples weights from a random uniform distribution. We need to fix the kernel to a known value to get reproducible results:
import tensorflow as tf
import numpy as np
np.random.seed(0)
timesteps = 7
num_input = 4
x_val = np.random.normal(size = (1, timesteps, num_input))
num_units = 3
def glorot_uniform(shape):
limit = np.sqrt(6.0 / (shape[0] + shape[1]))
return np.random.uniform(low=-limit, high=limit, size=shape)
kernel_init = glorot_uniform((num_input + num_units, 4 * num_units))
My implementation of the LSTMCell (well, actually it's just TensorFlow's code, slightly rewritten):
def sigmoid(x):
return 1. / (1 + np.exp(-x))
class LSTMCell():
"""Long short-term memory unit (LSTM) recurrent network cell.
"""
def __init__(self, num_units, initializer=glorot_uniform,
forget_bias=1.0, activation=np.tanh):
"""Initialize the parameters for an LSTM cell.
Args:
num_units: int, The number of units in the LSTM cell.
initializer: The initializer to use for the kernel matrix. Default: glorot_uniform
forget_bias: Biases of the forget gate are initialized by default to 1
in order to reduce the scale of forgetting at the beginning of
the training.
activation: Activation function of the inner states. Default: np.tanh.
"""
# Inputs must be 2-dimensional.
self._num_units = num_units
self._forget_bias = forget_bias
self._activation = activation
self._initializer = initializer
def build(self, inputs_shape):
input_depth = inputs_shape[-1]
h_depth = self._num_units
self._kernel = self._initializer(shape=(input_depth + h_depth, 4 * self._num_units))
self._bias = np.zeros(shape=(4 * self._num_units))
def call(self, inputs, state):
"""Run one step of LSTM.
Args:
inputs: input numpy array, must be 2-D, `[batch, input_size]`.
state: a tuple of numpy arrays, both `2-D`, with column sizes `c_state` and
`m_state`.
Returns:
A tuple containing:
- A `2-D, [batch, output_dim]`, numpy array representing the output of the
LSTM after reading `inputs` when previous state was `state`.
Here output_dim is equal to num_units.
- Numpy array(s) representing the new state of LSTM after reading `inputs` when
the previous state was `state`. Same type and shape(s) as `state`.
"""
num_proj = self._num_units
(c_prev, m_prev) = state
input_size = inputs.shape[-1]
# i = input_gate, j = new_input, f = forget_gate, o = output_gate
lstm_matrix = np.hstack([inputs, m_prev]).dot(self._kernel)
lstm_matrix += self._bias
i, j, f, o = np.split(lstm_matrix, indices_or_sections=4, axis=0)
# Diagonal connections
c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
self._activation(j))
m = sigmoid(o) * self._activation(c)
new_state = (c, m)
return m, new_state
X = x_val.reshape(x_val.shape[1:])
cell = LSTMCell(num_units, initializer=lambda shape: kernel_init)
cell.build(X.shape)
state = (np.zeros(num_units), np.zeros(num_units))
for i in range(timesteps):
x = X[i,:]
output, state = cell.call(x, state)
print(output)
Produces output:
[-0.21386017 -0.08401277 -0.25431477]
[-0.22243588 -0.25817422 -0.1612211 ]
[-0.2282134 -0.14207162 -0.35017249]
[-0.23286737 -0.17129192 -0.2706512 ]
[-0.11768674 -0.20717363 -0.13339118]
[-0.0599215 -0.17756104 -0.2028935 ]
[ 0.11437953 -0.19484555 0.05371994]
Your TensorFlow code, if you replace the second line with
lstm = tf.nn.rnn_cell.LSTMCell(num_units = num_units, initializer = tf.constant_initializer(kernel_init))
returns:
[[-0.2138602 -0.08401276 -0.25431478]]
[[-0.22243595 -0.25817424 -0.16122109]]
[[-0.22821338 -0.1420716 -0.35017252]]
[[-0.23286738 -0.1712919 -0.27065122]]
[[-0.1176867 -0.2071736 -0.13339119]]
[[-0.05992149 -0.177561 -0.2028935 ]]
[[ 0.11437953 -0.19484554 0.05371996]]
Here is a blog that will answer any conceptual questions related to LSTMs. It seems that a lot goes into building an LSTM from scratch!
Of course, this answer doesn't solve your question, but it gives you a direction.
In terms of linear algebra, it is possible that there is a dimension mismatch in the matrix multiplication between I*N (red circle), affecting the output, given that an n x m matrix multiplied by an m x p matrix gives an n x p output.
So I am building an RNN from scratch using numpy just to get the hang of how they work internally. My backpropagation through time is here:
def backprop_through_time(self, X, Y):
assert(len(X.shape) == 3)
seq_length = Y.shape[1] if self.return_sequences else 1
_, (Z_states, States, Z_outs, Outs) = self.feed_forward(X, cache=True)
if not self.return_sequences:
Outs = Outs[:,-1,:]
# setup gradients
dLdU = np.zeros(self.U.shape)
dLdV = np.zeros(self.V.shape)
dLdW = np.zeros(self.W.shape)
dLdB_state = np.zeros(self.B_state.shape)
dLdB_out = np.zeros(self.B_out.shape)
dLdOuts = self.loss_function_prime(Outs, Y)
if not self.return_sequences:
# we need dLdOuts to have a seq_length dim at axis 1
dLdOuts = np.expand_dims(dLdOuts, axis=1)
for t in range(seq_length):
adjusted_t = seq_length-1 if not self.return_sequences else t
# print("adjusted_t {}".format(adjusted_t))
dOuts_tdZ_out = self.output_activation_function_prime(Z_outs[:,adjusted_t,:])
dLdZ_out = np.multiply(dLdOuts[:, adjusted_t, :], dOuts_tdZ_out)
# Z_state = dot(X_t, self.U) + dot(State_{t-1}, self.W) + self.B_state
# State_t = f(Z_state)
# Z_out = dot(State_t, self.V) + self.B_out
# Out_t = g(Z_out)
dLdV += np.dot(States[:,adjusted_t,:].T, dLdZ_out)
dLdB_out += np.sum(dLdZ_out, axis=0, keepdims=True)
dLdZ_state = np.multiply(np.dot(dLdZ_out, self.V.T),
self.hidden_activation_function_prime(Z_states[:,adjusted_t,:]))
for t_prev in range(max(0, adjusted_t-self.backprop_through_time_limit), adjusted_t+1)[::-1]:
dLdB_state += np.sum(dLdZ_state, axis=0, keepdims=True)
dLdW += np.dot(States[:,t_prev-1,:].T, dLdZ_state)
dLdU += np.dot(X[:,t_prev,:].T, dLdZ_state)
dLdZ_state = np.multiply(np.dot(dLdZ_state, self.W.T),
self.hidden_activation_function_prime(States[:,t_prev-1,:]))
return (dLdU, dLdV, dLdW), (dLdB_state, dLdB_out)
However I am still failing a gradient check for parameters `dLdU, dLdW, dLdB_state`. I have gone through the math about a dozen times now, and I cannot find what is wrong with my implementation.
I assume X and Y are both 3-D arrays, with X having shape X.shape := (batch_size, seq_length, input_dim)
and Y having shape Y.shape := (batch_size, seq_length, output_dim).
Caching the feed_forward operation, I am returning Z_states with shape Z_states.shape := (batch_size, seq_length, hidden_dim), Z_outs and Outs with shape Z_outs.shape, Outs.shape := (batch_size, seq_length, output_dim), and States as States.shape := (batch_size, seq_length+1, hidden_dim). States[:,-1,:] is the original zeros of shape States[:,-1,:].shape := (batch_size, hidden_dim) that the RNN state is initialized with. Could anyone help me?
EDIT
I found my answer. My math is right, but I was using the wrong variable. When I update dLdZ_state in the second inner loop (the backprop-through-time part), I am multiplying by self.hidden_activation_function_prime(States[:,t_prev-1,:]). This should instead be self.hidden_activation_function_prime(Z_states[:,t_prev-1,:]).
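For clarity, the corrected update in the inner loop would read (the same line as in the question, with only States replaced by Z_states):
dLdZ_state = np.multiply(np.dot(dLdZ_state, self.W.T),
                         self.hidden_activation_function_prime(Z_states[:,t_prev-1,:]))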
Based on the LSTM code provided in the official Theano tutorial (http://deeplearning.net/tutorial/code/lstm.py), I changed the LSTM layer code (i.e. the functions lstm_layer() and param_init_lstm()) to perform a GRU instead.
The provided LSTM code trains well, but not the GRU I coded: the accuracy on the training set with the LSTM goes up to 1 (train cost = 0), while with the GRU it stagnates at 0.7 (train cost = 0.3).
Below is the code I use for the GRU. I kept the same function names as in the tutorial, so that one can copy-paste the code directly into it. What could explain the poor performance of the GRU?
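For reference, one common formulation of the GRU step that the code below is meant to implement (gate conventions differ between papers; here u is the update gate, r is the reset gate, and all products are element-wise):
r_t = sigmoid(W_r x_t + U_r h_{t-1} + b_r)        # reset gate
u_t = sigmoid(W_u x_t + U_u h_{t-1} + b_u)        # update gate
h~_t = tanh(W_h x_t + U_h (r_t * h_{t-1}) + b_h)  # hidden state proposal
h_t = (1 - u_t) * h_{t-1} + u_t * h~_t            # new hidden state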
import numpy as np
def param_init_lstm(options, params, prefix='lstm'):
"""
GRU
"""
W = np.concatenate([ortho_weight(options['dim_proj']), # Weight matrix for the input in the reset gate
ortho_weight(options['dim_proj']),
ortho_weight(options['dim_proj'])], # Weight matrix for the input in the update gate
axis=1)
params[_p(prefix, 'W')] = W
U = np.concatenate([ortho_weight(options['dim_proj']), # Weight matrix for the previous hidden state in the reset gate
ortho_weight(options['dim_proj']),
ortho_weight(options['dim_proj'])], # Weight matrix for the previous hidden state in the update gate
axis=1)
params[_p(prefix, 'U')] = U
b = np.zeros((3 * options['dim_proj'],)) # Biases for the reset gate and the update gate
params[_p(prefix, 'b')] = b.astype(config.floatX)
return params
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
nsteps = state_below.shape[0]
if state_below.ndim == 3:
n_samples = state_below.shape[1]
else:
n_samples = 1
def _slice(_x, n, dim):
if _x.ndim == 3:
return _x[:, :, n * dim:(n + 1) * dim]
return _x[:, n * dim:(n + 1) * dim]
def _step(m_, x_, h_):
preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
preact += x_
r = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) # reset gate
u = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) # update gate
U_h_t = _slice( tparams[_p(prefix, 'U')], 2, options['dim_proj'])
x_h_t = _slice( x_, 2, options['dim_proj'])
h_t_temp = tensor.tanh(tensor.dot(r*h_, U_h_t) + x_h_t)
h = (1. - u) * h_ + u * h_t_temp
h = m_[:,None] * h + (1. - m_)[:,None] * h_
return h
state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
tparams[_p(prefix, 'b')])
dim_proj = options['dim_proj']
rval, updates = theano.scan(_step,
sequences=[mask, state_below],
outputs_info=[tensor.alloc(numpy_floatX(0.),
n_samples,
dim_proj)],
name=_p(prefix, '_layers'),
n_steps=nsteps)
return rval[0]
The issue comes from the last line, return rval[0]: it should instead be return rval.
The LSTM code provided in the official Theano tutorial (http://deeplearning.net/tutorial/code/lstm.py) uses return rval[0] because outputs_info contains 2 elements:
rval, updates = theano.scan(_step,
sequences=[mask, state_below],
outputs_info=[tensor.alloc(numpy_floatX(0.),
n_samples,
dim_proj),
tensor.alloc(numpy_floatX(0.),
n_samples,
dim_proj)],
name=_p(prefix, '_layers'),
n_steps=nsteps)
return rval[0]
In the GRU, outputs_info contains just one element:
outputs_info=[tensor.alloc(numpy_floatX(0.),
n_samples,
dim_proj)],
and despite the brackets, it won't return a list of Theano variables representing the outputs of scan, but directly a Theano variable.
The rval is then fed to a pooling layer (in this case, a mean pooling layer).
By taking only rval[0] in the GRU, where rval is a Theano variable and not a list of Theano variables, you removed the part in the red rectangle,
which means you tried to perform the sentence classification using just the first word.
Another GRU implementation that can be plugged into the LSTM tutorial:
# weight initializer, normal by default
def norm_weight(nin, nout=None, scale=0.01, ortho=True):
if nout is None:
nout = nin
if nout == nin and ortho:
W = ortho_weight(nin)
else:
W = scale * numpy.random.randn(nin, nout)
return W.astype('float32')
def param_init_lstm(options, params, prefix='lstm'):
"""
GRU. Source: https://github.com/kyunghyuncho/dl4mt-material/blob/master/session0/lm.py
"""
nin = options['dim_proj']
dim = options['dim_proj']
# embedding to gates transformation weights, biases
W = numpy.concatenate([norm_weight(nin, dim),
norm_weight(nin, dim)], axis=1)
params[_p(prefix, 'W')] = W
params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')
# recurrent transformation weights for gates
U = numpy.concatenate([ortho_weight(dim),
ortho_weight(dim)], axis=1)
params[_p(prefix, 'U')] = U
# embedding to hidden state proposal weights, biases
Wx = norm_weight(nin, dim)
params[_p(prefix, 'Wx')] = Wx
params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')
# recurrent transformation weights for hidden state proposal
Ux = ortho_weight(dim)
params[_p(prefix, 'Ux')] = Ux
return params
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
nsteps = state_below.shape[0]
if state_below.ndim == 3:
n_samples = state_below.shape[1]
else:
n_samples = state_below.shape[0]
dim = tparams[_p(prefix, 'Ux')].shape[1]
if mask is None:
mask = tensor.alloc(1., state_below.shape[0], 1)
# utility function to slice a tensor
def _slice(_x, n, dim):
if _x.ndim == 3:
return _x[:, :, n*dim:(n+1)*dim]
return _x[:, n*dim:(n+1)*dim]
# state_below is the input word embeddings
# input to the gates, concatenated
state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
tparams[_p(prefix, 'b')]
# input to compute the hidden state proposal
state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
tparams[_p(prefix, 'bx')]
# step function to be used by scan
# arguments | sequences |outputs-info| non-seqs
def _step_slice(m_, x_, xx_, h_, U, Ux):
preact = tensor.dot(h_, U)
preact += x_
# reset and update gates
r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
u = tensor.nnet.sigmoid(_slice(preact, 1, dim))
# compute the hidden state proposal
preactx = tensor.dot(h_, Ux)
preactx = preactx * r
preactx = preactx + xx_
# hidden state proposal
h = tensor.tanh(preactx)
# leaky integrate and obtain next hidden state
h = u * h_ + (1. - u) * h
h = m_[:, None] * h + (1. - m_)[:, None] * h_
return h
# prepare scan arguments
seqs = [mask, state_below_, state_belowx]
_step = _step_slice
shared_vars = [tparams[_p(prefix, 'U')],
tparams[_p(prefix, 'Ux')]]
init_state = tensor.unbroadcast(tensor.alloc(0., n_samples, dim), 0)
rval, updates = theano.scan(_step,
sequences=seqs,
outputs_info=[init_state],
non_sequences=shared_vars,
name=_p(prefix, '_layers'),
n_steps=nsteps,
strict=True)
return rval
As a side note, Keras fixed this issue as follows:
results, _ = theano.scan(
_step,
sequences=inputs,
outputs_info=[None] + initial_states,
go_backwards=go_backwards)
# deal with Theano API inconsistency
if type(results) is list:
outputs = results[0]
states = results[1:]
else:
outputs = results
states = []