Tensorflow operation on non-zero vectors - python

I have spent about two hours on this but could not find a solution. The closest thing to what I need is probably this boolean mask, but I am still missing the next step.
My neural network wasn't learning, so I started looking at every step it performs, and sure enough I found a problem: because of the sparsity of my input layer, too many bias terms get propagated through. What is unusual about my setup is that the last time matrices are zero matrices. Let me show you; I will first show a screenshot of my notebook and then present the code.
(screenshot of the notebook output omitted)
I do not want bias terms added where the whole time step is a zero matrix. I thought I could perhaps perform the op on the boolean-mask-filtered matrix?
Here is the code:
import tensorflow as tf
import numpy as np
dim = 4
# batch x time x events x dim
tensor = np.random.rand(1, 3, 4, dim)
zeros_last_time = np.zeros((4, dim))
tensor[0][2] = zeros_last_time
dtype = tf.float64
input_layer = tf.placeholder(tf.float64, shape=(None, None, 4, dim))
# These are supposed to perform operations on the non-zero times
Wn = tf.Variable(
    tf.truncated_normal(dtype=dtype, shape=(dim,), mean=0, stddev=0.01),
    name="Wn")
bn = tf.Variable(
    tf.truncated_normal(dtype=dtype, shape=(1,), mean=0, stddev=0.01),
    name="bn")
# this is the op I want performed only on non-zero times
op = tf.einsum('bted,d->bte', input_layer, Wn) + bn
s = tf.Session()
glob_vars = tf.global_variables_initializer()
s.run(glob_vars)
# first let's see what the bias term is
s.run(bn, feed_dict={input_layer: tensor})
s.run(op, feed_dict={input_layer: tensor})
EDIT: So I believe tf.where is what I need.

A good solution may be to use tf.where to create a mask that is zero where the input is zero (along the last dimension) and one otherwise.
Once we have this mask, we can simply multiply it by the bias to get the result.
Here's my solution:
import tensorflow as tf
import numpy as np
dim = 4
# batch x time x events x dim
tensor = np.random.rand(1, 3, 4, dim)
zeros_last_time = np.zeros((4, dim))
tensor[0][2] = zeros_last_time
dtype = tf.float64
input_layer = tf.placeholder(tf.float64, shape=(None, None, 4, dim))
# These are supposed to perform operations on the non-zero times
Wn = tf.Variable(
    tf.truncated_normal(dtype=dtype, shape=(dim,), mean=0, stddev=0.01),
    name="Wn")
bn = tf.Variable(
    tf.truncated_normal(dtype=dtype, shape=(1,), mean=0, stddev=0.01),
    name="bn")
bias = bn * tf.cast(
    tf.where(input_layer == tf.zeros(tf.shape(input_layer)[-1]),
             tf.zeros(tf.shape(input_layer)[-1]),
             tf.ones(tf.shape(input_layer)[-1])), dtype)
# this is the op I want to be performed only on non-zero times
op = tf.einsum('bted,d->bte', input_layer, Wn) + bias
s = tf.Session()
glob_vars = tf.global_variables_initializer()
s.run(glob_vars)
# first let's see what the bias term is
print(s.run(bn, feed_dict={input_layer: tensor}))
print(s.run(op, feed_dict={input_layer: tensor}))

I managed to get the right bias, but then noticed that the dimensions are messed up. So this is only a partial answer:
import tensorflow as tf
import numpy as np
dim = 4
# batch x time x events x dim
tensor = np.random.rand(1, 3, 4, dim)
zeros_last_time = np.zeros((4, dim))
tensor[0][2] = zeros_last_time
dtype = tf.float64
input_layer = tf.placeholder(dtype, shape=(None, None, 4, dim))
# These are supposed to perform operations on the non-zero times
Wn = tf.Variable(
    tf.truncated_normal(dtype=dtype, shape=(dim,), mean=0, stddev=0.01),
    name="Wn")
bn = tf.Variable(
    tf.truncated_normal(dtype=dtype, shape=(1,), mean=0, stddev=0.01),
    name="bn")
zeros = tf.equal(input_layer,
                 tf.cast(tf.zeros(tf.shape(input_layer)[2:]), tf.float64))
# bias
where_ = tf.where(zeros, tf.zeros(tf.shape(input_layer)),
                  tf.ones(tf.shape(input_layer)))
bias = bn * tf.cast(where_, tf.float64)
op = tf.einsum('bted,d->bte', input_layer, Wn) + bias  # will fail
print(bias)
s = tf.Session()
glob_vars = tf.global_variables_initializer()
s.run(glob_vars)
feed_dict = {input_layer: tensor}
s.run(bias, feed_dict)
and these two lines fix the bias shape:
bias = tf.slice(bias, [0, 0, 0, 0], [1, 3, 1, 4])
squeezed_bias = tf.squeeze(bias)
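A simpler alternative is sketched below, under the assumption that a time step should count as "zero" only when its whole events x dim block is zero: reduce a boolean mask over the last two axes to get one flag per (batch, time) position and use it to gate the bias, so no slicing or squeezing is needed. The name nonzero_mask is made up; input_layer, Wn, bn and dtype are reused from above.
# Sketch: per-time-step mask of shape (batch, time), gating only the bias term.
nonzero_mask = tf.cast(
    tf.reduce_any(tf.not_equal(input_layer, 0.0), axis=[2, 3]),  # True where a time step has any non-zero entry
    dtype)
op = tf.einsum('bted,d->bte', input_layer, Wn) + bn * nonzero_mask[:, :, tf.newaxis]
Because the einsum term is already zero on all-zero time steps, gating the bias alone keeps those positions exactly zero.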

Related

Reassign non-variable tensor in tensorflow

I have a requirement where I want to use the updated value of x as the input to an RNN. The code snippet below illustrates it in detail.
x = tf.placeholder("float", shape=[None, 1])
RNNcell = tf.nn.rnn_cell.BasicRNNCell(....)
outputs, _ = tf.nn.dynamic_rnn(RNNcell, tf.reshape(x, [1, -1, 1]))
x = outputs[-1] * (tf.Variable(...) * tf.constant(...))
Vlad's answer is correct, but since I am a new member I cannot vote. The snippet below is an updated version of Vlad's with an RNN cell.
x = tf.placeholder("float", shape=[None,1])
model = tf.nn.rnn_cell.BasicRNNCell(num_units=1, activation=None)
outputs, state = tf.nn.dynamic_rnn(model, tf.reshape(x, [-1,1, 1]), dtype=tf.float32)
# output1 = model.output
# output1 = outputs[-1]
output1 = outputs[:,-1,:]
# output1 = outputs
some_value = tf.constant([9.0], # <-- Some tensor the output will be multiplied by
dtype=tf.float32)
output1 *= some_value # <-- The output had been multiplied by `some_value`
# (with broadcasting in case of
# more than one input samples)
with tf.control_dependencies([output1]): # <-- Not necessary, but explicit control
output2, state2 = model(output1,state)
The example is more or less self-explanatory. We take the output of the model, multiply it by some tensor (a scalar, or a tensor of rank > 0 that can be broadcast), feed it to the model again, and get the result:
import tensorflow as tf
import numpy as np

x = tf.placeholder(tf.float32, shape=(None, 2))
w = tf.Variable(tf.random_normal([2, 2]))
bias = tf.Variable(tf.zeros((2, )))
output1 = tf.matmul(x, w) + bias
some_value = tf.constant([3, 3],  # <-- Some tensor the output will be multiplied by
                         dtype=tf.float32)
output1 *= some_value * x  # <-- The output is multiplied by `some_value`
                           #     (in this case with broadcasting in case of
                           #      more than one input sample)
with tf.control_dependencies([output1]):  # <-- Not necessary, but explicit control
    output2 = tf.matmul(output1, w) + bias  # dependencies are always good practice.
data = np.ones((3, 2))  # 3 two-dimensional samples
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(output2, feed_dict={x: data}))
# [[3.0432963 3.6584744]
#  [3.0432963 3.6584744]
#  [3.0432963 3.6584744]]

InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape

I have written the following code in PyCharm; it implements a fully connected layer (FCL) in TensorFlow. The placeholder raises an invalid argument error, so I set the dtype, shape, and name in the placeholder, but I still get the error.
I want to produce a new Signal(1, 222) through the FCL model.
input Signal(1, 222) => output Signal(1, 222)
maxPredict: find the index with the highest value in the output signal.
calculateY: get the frequency array value corresponding to maxPredict.
loss: use the difference between the true Y and the calculated Y as the loss.
loss = tf.abs(trueY - calculateY)
Code (causes the error)
x = tf.placeholder(dtype=tf.float32, shape=[1, 222], name='inputX')
ERROR
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'inputX' with dtype float and shape [1,222]
tensorflow.python.framework.errors_impl.InvalidArgumentError: You must feed a value for placeholder tensor 'inputX' with dtype float and shape [1,222]
[[{{node inputX}} = Placeholder[dtype=DT_FLOAT, shape=[1,222], _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
During handling of the above exception, another exception occurred:
New Error Case
I changed my code.
x = tf.placeholder(tf.float32, [None, 222], name='inputX')
Error Case 1
tensorFreq = tf.convert_to_tensor(basicFreq, tf.float32)
newY = tf.gather(tensorFreq, maxPredict) * 60
loss = tf.abs(y - tf.Variable(newY))
ValueError: initial_value must have a shape specified: Tensor("mul:0", shape=(?,), dtype=float32)
Error Case 2
tensorFreq = tf.convert_to_tensor(basicFreq, tf.float32)
newY = tf.gather(tensorFreq, maxPredict) * 60
loss = tf.abs(y - newY)
Traceback (most recent call last):
File "D:/PycharmProject/DetectionSignal/TEST_FCL_StackOverflow.py", line 127, in
trainStep = opt.minimize(loss)
File "C:\Users\Heewony\Anaconda3\envs\TSFW_pycharm\lib\site-packages\tensorflow\python\training\optimizer.py", line 407, in minimize
([str(v) for _, v in grads_and_vars], loss))
ValueError: No gradients provided for any variable, check your graph for ops that do not support gradients, between variables [<tf.Variable 'Variable:0' shape=(222, 1024) dtype=float32_ref>, <tf.Variable 'Variable_1:0' shape=(1024,) dtype=float32_ref>, ......... <tf.Variable 'Variable_5:0' shape=(222,) dtype=float32_ref>] and loss Tensor("Abs:0", dtype=float32).
Development environment
OS Platform and Distribution: Windows 10 x64
TensorFlow installed from: Anaconda
TensorFlow version: 1.12.0
Python version: 3.6.7
Mobile device: N/A
Exact command to reproduce: N/A
GPU model and memory: NVIDIA GeForce GTX 1080 Ti
CUDA/cuDNN: 9.0/7.4
Model and Function
def Model_FCL(inputX):
    data = inputX  # input signals
    # Fully Connected Layer 1
    flatConvh1 = tf.reshape(data, [-1, 222])
    fcW1 = tf.Variable(tf.truncated_normal(shape=[222, 1024], stddev=0.05))
    fcb1 = tf.Variable(tf.constant(0.1, shape=[1024]))
    fch1 = tf.nn.relu(tf.matmul(flatConvh1, fcW1) + fcb1)
    # Fully Connected Layer 2
    flatConvh2 = tf.reshape(fch1, [-1, 1024])
    fcW2 = tf.Variable(tf.truncated_normal(shape=[1024, 1024], stddev=0.05))
    fcb2 = tf.Variable(tf.constant(0.1, shape=[1024]))
    fch2 = tf.nn.relu(tf.matmul(flatConvh2, fcW2) + fcb2)
    # Output Layer
    fcW3 = tf.Variable(tf.truncated_normal(shape=[1024, 222], stddev=0.05))
    fcb3 = tf.Variable(tf.constant(0.1, shape=[222]))
    logits = tf.add(tf.matmul(fch2, fcW3), fcb3)
    predictY = tf.nn.softmax(logits)
    return predictY, logits

def loadMatlabData(fileName):
    contentsMat = sio.loadmat(fileName)
    dataInput = contentsMat['dataInput']
    dataLabel = contentsMat['dataLabel']
    dataSize = dataInput.shape
    dataSize = dataSize[0]
    return dataInput, dataLabel, dataSize

def getNextSignal(num, data, labels, WINDOW_SIZE, OUTPUT_SIZE):
    shuffleSignal = data[num]
    shuffleLabels = labels[num]
    # shuffleSignal = shuffleSignal.reshape(1, WINDOW_SIZE)
    # shuffleSignal = np.asarray(shuffleSignal, np.float32)
    return shuffleSignal, shuffleLabels

def getBasicFrequency():
    # basicFreq => shape(222)
    basicFreq = np.array([0.598436736688, 0.610649731314, ... 3.297508549096])
    return basicFreq
Graph
basicFreq = getBasicFrequency()
myGraph = tf.Graph()

with myGraph.as_default():
    # define placeholders for the input data & output data
    x = tf.placeholder(dtype=tf.float32, shape=[1, 222], name='inputX')  # Signal size = [1, 222]
    y = tf.placeholder(tf.float32, name='trueY')  # Float value, size = [1]
    print('inputzz ', x, y)
    print('Graph ', myGraph.get_operations())
    print('TrainVariable ', tf.trainable_variables())

    predictY, logits = Model_FCL(x)  # Predicted signal, size = [1, 222]
    maxPredict = tf.argmax(predictY, 1, name='maxPredict')  # Find the max index of the predicted signal
    tensorFreq = tf.convert_to_tensor(basicFreq, tf.float32)
    newY = tf.gather(tensorFreq, maxPredict) * 60  # Find the value that corresponds to the freq array index
    loss = tf.abs(y - tf.Variable(newY))  # Calculate the absolute difference (true Y - predicted Y)
    opt = tf.train.AdamOptimizer(learning_rate=0.0001)
    trainStep = opt.minimize(loss)

    print('Graph ', myGraph.get_operations())
    print('TrainVariable ', tf.trainable_variables())
Session
with tf.Session(graph=myGraph) as sess:
    sess.run(tf.global_variables_initializer())
    dataFolder = './'
    writer = tf.summary.FileWriter('./logMyGraph', sess.graph)
    startTime = datetime.datetime.now()

    numberSummary = 0
    accuracyTotalTrain = []
    for trainEpoch in range(1, 25 + 1):
        arrayTrain = []
        dataPPG, dataLabel, dataSize = loadMatlabData(dataFolder + "TestValues.mat")
        for i in range(dataSize):
            batchSignal, valueTrue = getNextSignal(i, dataPPG, dataLabel, 222, 222)
            _, lossPrint, valuePredict = sess.run([trainStep, loss, newY],
                                                  feed_dict={x: batchSignal, y: valueTrue})
            print('Train ', i, ' ', valueTrue, ' - ', valuePredict, ' Loss ', lossPrint)

            arrayTrain.append(lossPrint)
            writer.add_summary(
                tf.Summary(value=[tf.Summary.Value(tag='Loss', simple_value=float(lossPrint))]),
                numberSummary)
            numberSummary += 1
        accuracyTotalTrain.append(np.mean(arrayTrain))
    print('Final Train : ', accuracyTotalTrain)
    sess.close()
It seems that the variable batchSignal has the wrong type or shape. It must be a NumPy array of shape exactly [1, 222]. If you want to use a batch of examples of size n × 222, the placeholder x should have a shape of [None, 222] and the placeholder y a shape of [None].
By the way, consider using tf.layers.dense instead of explicitly initializing variables and implementing the layers yourself.
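For reference, a minimal sketch of the same three fully connected layers written with tf.layers.dense (layer sizes copied from the original Model_FCL; the function and layer names are illustrative, not a tested drop-in replacement):
def Model_FCL(inputX):
    # two hidden layers of 1024 units and a 222-dimensional output, as in the original code
    fch1 = tf.layers.dense(inputX, 1024, activation=tf.nn.relu, name='fc1')
    fch2 = tf.layers.dense(fch1, 1024, activation=tf.nn.relu, name='fc2')
    logits = tf.layers.dense(fch2, 222, name='fc3')
    predictY = tf.nn.softmax(logits)
    return predictY, logits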
There are a few things to change.
Error Case 0: you don't need to reshape the flow between layers; you can use None at the first dimension to pass a dynamic batch size.
Error Case 1: you can use newY directly as the output of the network; tf.Variable is only for defining weights or biases.
Error Case 2: it seems that TensorFlow doesn't provide a gradient implementation for either tf.abs() or tf.gather() here. For a regression problem, the mean squared error is often sufficient.
Here is how I would rewrite your code. I don't have the MATLAB part, so I can't debug the Python/MATLAB interface:
Model:
def Model_FCL(inputX):
    # Fully Connected Layer 1
    fcW1 = tf.get_variable('w1', shape=[222, 1024], initializer=tf.truncated_normal_initializer())
    fcb1 = tf.get_variable('b1', shape=[1024], initializer=tf.truncated_normal_initializer())
    # fcb1 = tf.get_variable('b1', shape=[1024], trainable=False, initializer=tf.constant_initializer(valueThatYouWant))  # if you want to keep the bias constant
    fch1 = tf.nn.relu(tf.matmul(inputX, fcW1) + fcb1, name='relu1')
    # Fully Connected Layer 2
    fcW2 = tf.get_variable('w2', shape=[1024, 1024], initializer=tf.truncated_normal_initializer())
    fcb2 = tf.get_variable('b2', shape=[1024], initializer=tf.truncated_normal_initializer())
    # fcb2 = tf.get_variable('b2', shape=[1024], trainable=False, initializer=tf.constant_initializer(valueThatYouWant))  # if you want to keep the bias constant
    fch2 = tf.nn.relu(tf.matmul(fch1, fcW2) + fcb2, name='relu2')
    # Output Layer
    fcW3 = tf.get_variable('w3', shape=[1024, 222], initializer=tf.truncated_normal_initializer())
    fcb3 = tf.get_variable('b3', shape=[222], initializer=tf.truncated_normal_initializer())
    # fcb3 = tf.get_variable('b3', shape=[222], trainable=False, initializer=tf.constant_initializer(valueThatYouWant))  # if you want to keep the bias constant
    logits = tf.add(tf.matmul(fch2, fcW3), fcb3)
    predictY = tf.nn.softmax(logits)  # I'm not sure it will learn if you do softmax and then abs/MSE
    return predictY, logits
Graph:
with myGraph.as_default():
    # define placeholders for the input data & output data
    # use None (dynamic batch size), not -1, at the first dimension so that you can change your batch size
    x = tf.placeholder(tf.float32, shape=[None, 222], name='inputX')  # Signal size = [None, 222]
    y = tf.placeholder(tf.float32, shape=[None], name='trueY')  # Float values, size = [None]
    ...
    predictY, logits = Model_FCL(x)  # Predicted signal, size = [None, 222]
    maxPredict = tf.argmax(predictY, 1, name='maxPredict')  # Find the max index of the predicted signal
    tensorFreq = tf.convert_to_tensor(basicFreq, tf.float32)
    newY = tf.gather(tensorFreq, maxPredict) * 60  # Find the value that corresponds to the freq array index
    loss = tf.losses.mean_squared_error(labels=y, predictions=newY)  # maybe use MSE for a regression problem
    # loss = tf.abs(y - newY)  # absolute difference (true Y - predicted Y); TensorFlow doesn't have a gradient implementation for tf.abs here
    opt = tf.train.AdamOptimizer(learning_rate=0.0001)
    trainStep = opt.minimize(loss)
If you are still getting the same error even after feeding the right numpy shape and also maintaining the correct dtypes (np.int32 or np.float32) as suggested by the error message, then the following code should solve your problem:
#this code will print the list of placeholders and other variables declared in the memory which is causing your error
[n.name for n in tf.get_default_graph().as_graph_def().node]
#it will reset your declared placeholders so you can start over
tf.reset_default_graph()
This problem could also be avoided by restarting the kernel for every debugging run, but that is not practical.

tf.reshape is not working in the cases where you are adding an extra dimension

According to the tensorflow website, tf.reshape takes a tensor of a certain shape and maps it to a tensor of another shape. I want to map a tensor of size [600, 64] to a tensor of size [-1, 8, 8, 1] (in which the dimension at the -1 position is 600). This doesn't seem to be working though.
I am running this with TensorFlow on Python 3.6, and although it reshapes to something like [-1, 8, 8], it doesn't reshape to [-1, 8, 8, 1].
import tensorflow as tf
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import LabelBinarizer
# preprocessing method needed
def flatten(array):
    temp = []
    for j in array:
        temp.extend(j)
    return temp
# preprocess the data
digits = datasets.load_digits()
images = digits.images
images = [flatten(i) for i in images]
labels = digits.target
labels = LabelBinarizer().fit_transform(labels)
# the stats needed
width = 8
height = 8
alpha = 0.1
num_labels = 10
kernel_length = 3
batch_size = 10
channels = 1
# the tensorflow placeholders and reshaping
X = tf.placeholder(tf.float32, shape = [None, width * height * channels])
# AND NOW HERE IS WHERE THE ERROR STARTS
y_true = tf.placeholder(tf.float32, shape = [None, num_labels])
X = tf.reshape(X, [-1, 8, 8, 1])
# the convolutional model
conv1 = tf.layers.conv2d(X, filters = 32, kernel_size = [kernel_length, kernel_length])
conv2 = tf.layers.conv2d(conv1, filters = 64, kernel_size = [2, 2])
flatten = tf.reshape(X, [-1, 1])
dense1 = tf.layers.dense(flatten, units=50, activation = tf.nn.relu)
y_pred = tf.layers.dense(dense1, units=num_labels, activation = tf.nn.softmax)
# the loss and training functions
loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred)
train = tf.train.GradientDescentOptimizer(alpha).minimize(loss)
# initializing the variables and the tf.session
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
# running the session
for i in range(batch_size):
    _, lossVal = sess.run((train, loss), feed_dict={X: images[:600], y_true: labels[:600]})
    print(lossVal)
I keep on getting this error:
ValueError: Cannot feed value of shape (600, 64) for Tensor 'Reshape:0', which has shape '(?, 8, 8, 1)'
And I feel like that should not be the case since 8 * 8 * 1 does equal 64.
images[:600]'s shape is (600, 64), which does not correspond to the placeholder expected shape, (None, 8, 8, 1).
Either reshape your data or change the shape of the placeholder.
Note that the fact that you originally defined the placeholder with shape (None, 64) is inconsequential, because a few lines later you overwrite the name X with the reshaped tensor, so the key you feed now refers to the reshape's output.
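A minimal sketch of that second option follows, under the assumption that the placeholder keeps its own name and the reshaped tensor gets a new one (X_img is a made-up name for illustration), so the feed key stays the (None, 64) placeholder:
X = tf.placeholder(tf.float32, shape=[None, width * height * channels])  # still fed with (600, 64) data
y_true = tf.placeholder(tf.float32, shape=[None, num_labels])
X_img = tf.reshape(X, [-1, 8, 8, 1])  # reshaped view used by the conv layers
conv1 = tf.layers.conv2d(X_img, filters=32, kernel_size=[kernel_length, kernel_length])
# ... rest of the model unchanged; feed_dict={X: images[:600], y_true: labels[:600]} now matches the placeholder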

mean squared error greater than zero even though input equals output and weights are initialized to one

I am trying to train a network where the input equals the output, i.e. f(x) = x.
Therefore I have an input of size [1, 500, 500, 3] and my network looks like this:
logits = tf.layers.conv2d(inputs=x, filters=3, kernel_size=1, padding='SAME', name='logits', kernel_initializer=tf.ones_initializer(), trainable=True)
Then I calculate the loss:
loss = tf.losses.mean_squared_error(x, logits)
but the network does not show a loss of 0.
I thought that if I set y = x and initialize the weights with ones, the loss should be 0.
Is there something wrong in my reasoning?
The conv2d operation sums over the channels of your input. Thus, initialising the kernel with ones will give you identical output channels, each of which is the sum over your input channels.
Note that unless you set the bias_initializer to zero or set the use_bias parameter to False, the randomly-initialised bias will also affect your output.
import tensorflow as tf
import numpy as np

with tf.Graph().as_default():
    x = tf.placeholder(tf.float32, [None, 500, 500, 3])
    y = tf.layers.conv2d(x, 3, 1, padding='same', use_bias=False,
                         kernel_initializer=tf.ones_initializer())
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Evaluate
    np.random.seed(1)
    _x = np.random.uniform(0, 1, (1, 500, 500, 3))
    _y = sess.run(y, {x: _x})

    # Check that the channels are identical
    np.testing.assert_allclose(_y[..., 0], _y[..., 1])
    # Check that each channel is the sum over channels
    np.testing.assert_allclose(_y[..., 0], _x.sum(axis=-1), rtol=1e-5)
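If the goal is a conv layer that starts out as the identity map, so that the loss really is 0 at initialization, one option (a sketch illustrating the point above, not part of the original answer) is to initialize the 1×1 kernel to the identity matrix over channels and disable the bias:
import tensorflow as tf
import numpy as np

with tf.Graph().as_default():
    x = tf.placeholder(tf.float32, [None, 500, 500, 3])
    # 1x1 kernel of shape (1, 1, in_channels, out_channels): identity over channels
    identity_kernel = np.eye(3).reshape(1, 1, 3, 3)
    logits = tf.layers.conv2d(
        x, filters=3, kernel_size=1, padding='same', use_bias=False,
        kernel_initializer=tf.constant_initializer(identity_kernel))
    loss = tf.losses.mean_squared_error(x, logits)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _x = np.random.uniform(0, 1, (1, 500, 500, 3))
        print(sess.run(loss, {x: _x}))  # ~0.0 (up to float32 rounding)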

scheduled sampling in Tensorflow

The newest Tensorflow api about seq2seq model has included scheduled sampling:
https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledEmbeddingTrainingHelper
https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledOutputTrainingHelper
The original paper of scheduled sampling can be found here:
https://arxiv.org/abs/1506.03099
I read the paper but I cannot understand the difference between ScheduledEmbeddingTrainingHelper and ScheduledOutputTrainingHelper. The documentation only says ScheduledEmbeddingTrainingHelper is a training helper that adds scheduled sampling while ScheduledOutputTrainingHelper is a training helper that adds scheduled sampling directly to outputs.
I wonder what's the difference between these two helpers?
I contacted the engineer behind this, and he responded:
The output sampler either emits the raw rnn output or the raw ground truth at that time step. The embedding sampler treats the rnn output as logits of a distribution and either emits the embedding lookup of a sampled id from that categorical distribution or the raw ground truth at that time step.
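To make that distinction concrete, here is a toy NumPy sketch (per example, unbatched; the function names are made up for illustration) of what each helper feeds to the next decoder step when it decides to sample instead of reading the ground truth:
import numpy as np

def scheduled_output_next_input(rnn_output, ground_truth, sample):
    # ScheduledOutputTrainingHelper: feed back the raw RNN output itself
    return rnn_output if sample else ground_truth

def scheduled_embedding_next_input(rnn_output, ground_truth_emb, embedding, sample, rng):
    # ScheduledEmbeddingTrainingHelper: treat the RNN output as logits,
    # sample a token id from that categorical distribution, feed back its embedding
    if not sample:
        return ground_truth_emb
    probs = np.exp(rnn_output - rnn_output.max())
    probs /= probs.sum()
    token_id = rng.choice(len(probs), p=probs)
    return embedding[token_id]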
Here's a basic example of using ScheduledEmbeddingTrainingHelper, using TensorFlow 1.3 and some higher level tf.contrib APIs. It's a sequence-to-sequence model where the decoder's initial hidden state is the final hidden state of the encoder. It shows only how to train on a single batch (and apparently the task is "reverse this sequence"). For actual training tasks, I suggest looking at tf.contrib.learn APIs such as learn_runner, Experiment and tf.estimator.Estimator.
import tensorflow as tf
import numpy as np
from tensorflow.python.layers.core import Dense

vocab_size = 7
embedding_size = 5
lstm_units = 10

src_batch = np.array([[1, 2, 3], [4, 5, 6]])
trg_batch = np.array([[3, 2, 1], [6, 5, 4]])

# *_seq will have shape (2, 3), *_seq_len will have shape (2)
source_seq = tf.placeholder(shape=(None, None), dtype=tf.int32)
target_seq = tf.placeholder(shape=(None, None), dtype=tf.int32)
source_seq_len = tf.placeholder(shape=(None,), dtype=tf.int32)
target_seq_len = tf.placeholder(shape=(None,), dtype=tf.int32)

# add Start of Sequence (SOS) tokens to each sequence
batch_size, sequence_size = tf.unstack(tf.shape(target_seq))
sos_slice = tf.zeros([batch_size, 1], dtype=tf.int32)  # 0 = start of sentence token
decoder_input = tf.concat([sos_slice, target_seq], axis=1)

embedding_matrix = tf.get_variable(
    name="embedding_matrix",
    shape=[vocab_size, embedding_size],
    dtype=tf.float32)
source_seq_embedded = tf.nn.embedding_lookup(embedding_matrix, source_seq)  # shape=(2, 3, 5)
decoder_input_embedded = tf.nn.embedding_lookup(embedding_matrix, decoder_input)  # shape=(2, 4, 5)

unused_encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
    tf.contrib.rnn.LSTMCell(lstm_units),
    source_seq_embedded,
    sequence_length=source_seq_len,
    dtype=tf.float32)

# Decoder:
# At each time step t and for each sequence in the batch, we get x_t by either
# (1) sampling from the distribution output_layer(t-1), or
# (2) reading from decoder_input_embedded.
# We do (1) with probability sampling_probability and (2) with 1 - sampling_probability.
# Using sampling_probability=0.0 is equivalent to using TrainingHelper (no sampling).
# Using sampling_probability=1.0 is equivalent to doing inference,
# where we don't supervise the decoder at all: output at t-1 is the input at t.
sampling_prob = tf.Variable(0.0, dtype=tf.float32)
helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
    decoder_input_embedded,
    target_seq_len,
    embedding_matrix,
    sampling_probability=sampling_prob)

output_layer = Dense(vocab_size)
decoder = tf.contrib.seq2seq.BasicDecoder(
    tf.contrib.rnn.LSTMCell(lstm_units),
    helper,
    encoder_state,
    output_layer=output_layer)
outputs, state, seq_len = tf.contrib.seq2seq.dynamic_decode(decoder)

loss = tf.contrib.seq2seq.sequence_loss(
    logits=outputs.rnn_output,
    targets=target_seq,
    weights=tf.ones(trg_batch.shape))

train_op = tf.contrib.layers.optimize_loss(
    loss=loss,
    global_step=tf.contrib.framework.get_global_step(),
    optimizer=tf.train.AdamOptimizer,
    learning_rate=0.001)

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    _, _loss = session.run([train_op, loss], {
        source_seq: src_batch,
        target_seq: trg_batch,
        source_seq_len: [3, 3],
        target_seq_len: [3, 3],
        sampling_prob: 0.5
    })
    print("Loss: " + str(_loss))
For ScheduledOutputTrainingHelper, I would expect to just swap out the helper and use:
helper = tf.contrib.seq2seq.ScheduledOutputTrainingHelper(
    target_seq,
    target_seq_len,
    sampling_probability=sampling_prob)
However, this gives an error, since the LSTM cell expects a multidimensional input per timestep (of shape (batch_size, input_dims)). I will raise an issue on GitHub to find out whether this is a bug or there is some other way to use ScheduledOutputTrainingHelper.
This might also help you. This is for the case where you want to do scheduled sampling at each decoding step separately.
import tensorflow as tf
import numpy as np
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops.distributions import categorical
from tensorflow.python.ops.distributions import bernoulli

batch_size = 64
vocab_size = 50000
emb_dim = 128

output = tf.get_variable('output',
                         initializer=tf.constant(np.random.rand(batch_size, vocab_size)))
base_next_inputs = tf.get_variable('input',
                                   initializer=tf.constant(np.random.rand(batch_size, emb_dim)))
embedding = tf.get_variable('embedding',
                            initializer=tf.constant(np.random.rand(vocab_size, emb_dim)))

select_sampler = bernoulli.Bernoulli(probs=0.99, dtype=tf.bool)
select_sample = select_sampler.sample(sample_shape=batch_size, seed=123)
sample_id_sampler = categorical.Categorical(logits=output)
sample_ids = array_ops.where(
    select_sample,
    sample_id_sampler.sample(seed=123),
    gen_array_ops.fill([batch_size], -1))

where_sampling = math_ops.cast(
    array_ops.where(sample_ids > -1), tf.int32)
where_not_sampling = math_ops.cast(
    array_ops.where(sample_ids <= -1), tf.int32)

sample_ids_sampling = array_ops.gather_nd(sample_ids, where_sampling)
inputs_not_sampling = array_ops.gather_nd(base_next_inputs, where_not_sampling)
sampled_next_inputs = tf.nn.embedding_lookup(embedding, sample_ids_sampling)

base_shape = array_ops.shape(base_next_inputs)
result1 = array_ops.scatter_nd(indices=where_sampling,
                               updates=sampled_next_inputs, shape=base_shape)
result2 = array_ops.scatter_nd(indices=where_not_sampling,
                               updates=inputs_not_sampling, shape=base_shape)
result = result1 + result2
I used the tensorflow documentation code to make this example.
https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/contrib/seq2seq/python/ops/helper.py
