I am trying to create an embedding for 1,000,000 words in TensorFlow, where each word is represented by a 256-dimensional float32 vector. The issue is that I keep running out of memory. This does not make sense to me, since I have 8 GB of memory on my GTX 1080. The embedding should only take up 1e6 * 256 * 4 bytes = 1 GB of memory, and I have another matrix of the same size on the output. Other than that, there are a few other tensors, which should be small in comparison. So I only see about 2 to 3 GB needed to store the model, yet it fails when I call sess.run(tf.initialize_all_variables()). Where is all my memory going, and do you have any advice for how I can get around this?
import tensorflow as tf
import nltk
import numpy as np
import os
import multiprocessing
import itertools
import pickle
from unidecode import unidecode
BATCH_SIZE = 32
TIME_STEPS = 64
WORD_VEC_SIZE = 256
words, training_data = pickle.load(open('vocab.pickle', 'rb'))
word2index = {w:i for i, w in enumerate(words)}
index2word = {i:w for i, w in enumerate(words)}
input_tensor = tf.placeholder(tf.int32, (BATCH_SIZE, TIME_STEPS + 1), 'input_tensor')
embedding = tf.Variable(tf.random_uniform((len(words), WORD_VEC_SIZE), -1, 1), name = 'embedding')
rnn = tf.nn.rnn_cell.BasicRNNCell(WORD_VEC_SIZE)
state = tf.zeros((BATCH_SIZE, rnn.state_size))
input_vectors = tf.nn.embedding_lookup([embedding], input_tensor[:, :TIME_STEPS])
cost = 0
with tf.variable_scope('rnn') as scope:
    W_out = tf.get_variable('W_out', (WORD_VEC_SIZE, len(words)), initializer = tf.truncated_normal_initializer(0.0, 1 / np.sqrt(WORD_VEC_SIZE)))
    b_out = tf.get_variable('b_out', (len(words), ), initializer = tf.truncated_normal_initializer(0.0, 0.01))
    for t in range(TIME_STEPS):
        y, state = rnn(tf.reshape(input_vectors[:, t, :], (-1, WORD_VEC_SIZE)), state)
        cost += tf.reduce_mean(tf.nn.sampled_softmax_loss(W_out, b_out, y, tf.reshape(input_tensor[:, t + 1], (-1, 1)), 1000, len(words)))
        scope.reuse_variables()
train_step = tf.train.AdamOptimizer(1e-4).minimize(cost)
sess = tf.Session()
sess.run(tf.initialize_all_variables())
saver = tf.train.Saver()
What I was not accounting for was the AdamOptimizer. I forgot that it needs to store additional parameters (the first and second moment estimates) for every weight in my model. When I switched to a GradientDescentOptimizer, the model fit on my GPU.
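Here is a rough back-of-the-envelope sketch of the arithmetic (assuming Adam keeps two extra float32 slots, m and v, per trainable parameter, which is what roughly tripled the footprint in my case):
vocab_size = int(1e6)
word_vec_size = 256
bytes_per_float32 = 4
# embedding matrix plus the same-sized output matrix
params_bytes = 2 * vocab_size * word_vec_size * bytes_per_float32
# Adam adds two extra slots (m and v) per parameter, roughly tripling the footprint
adam_bytes = 3 * params_bytes
print(params_bytes / 1e9, adam_bytes / 1e9)  # ~2.0 GB of weights vs ~6.1 GB with Adam slots, before activations and gradients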
I have the following code that uses TensorFlow to calculate a custom average loss over randomly rotated copies of an image:
import tensorflow as tf
import cv2
#initialize x_hat
img = cv2.imread("4.jpg")
x_hat = tf.Variable(img,name = 'x_hat') #img we want to attack
@tf.function
def cost2():
    image = x_hat
    # Now it will generate 100 samples rotated
    num_samples = 100
    average_loss = 0
    for j in range(num_samples):
        # ADD ROTATION (there may be a problem here)
        rotated = tf.keras.preprocessing.image.random_rotation(image,
            tf.random.uniform(shape=(), minval=40, maxval=90), channel_axis=2)
        # get logits
        rotated_logits, _ = resnet(rotated)
        # get average CUSTOM loss
        average_loss += -1 * tf.nn.softmax_cross_entropy_with_logits(logits=rotated_logits, labels=labels) / num_samples
    return average_loss
and here is how I call it
learning_rate = 1e-1
optim = tf.optimizers.SGD(learning_rate=learning_rate)
epsilon = 2.0/255.0 # a really small perturbation
below = x - epsilon
above = x + epsilon
demo_steps = 200
# projected gradient descent
for i in range(demo_steps):
    loss = optim.minimize(cost2, var_list=[x_hat])
    if (i+1) % 10 == 0:
        print('step %d, loss=%g' % (i+1, loss.numpy()))
    projected = tf.clip_by_value(tf.clip_by_value(x_hat, below, above), 0, 1)
    with tf.control_dependencies([projected]):
        x_hat.assign(projected)
adv_robust = x_hat.numpy()
However, I get the following error when I run the code:
TypeError: in user code:

    <ipython-input-183-abde02909da7>:14 cost2 *
        rotated = tf.keras.preprocessing.image.random_rotation(image,
            tf.random.uniform(shape=(),minval=40, maxval=90),channel_axis=2)
    /home/me/.local/lib/python3.8/site-packages/keras_preprocessing/image/affine_transformations.py:55 random_rotation *
        theta = np.random.uniform(-rg, rg)
    mtrand.pyx:1111 numpy.random.mtrand.RandomState.uniform **

    TypeError: __array__() takes 1 positional argument but 2 were given
I am on TensorFlow 2.4.0, and my usage of the random_rotation and random.uniform functions matches the TF 2.4.0 documentation HERE and HERE. So, what am I missing here?
The error might be coming from using TF tensors. As stated in the docs you linked regarding random_rotation:
Performs a random rotation of a Numpy image tensor.
This means you cannot use TF tensors with this operation. If you are in eager execution mode, you can convert them with tensor.numpy():
import tensorflow as tf
image = tf.random.normal((180, 180, 3))
rotated = tf.keras.preprocessing.image.random_rotation(image.numpy(),
    tf.random.uniform(shape=(), minval=40, maxval=90).numpy(), channel_axis=2)
Otherwise, it is recommended to use the preprocessing layer tf.keras.layers.RandomRotation, since using numpy in graph mode (for example, in a function decorated with @tf.function) is not recommended.
Here is an example using the tf.keras.layers.RandomRotation:
import tensorflow as tf
import os
import matplotlib.pyplot as plt
_URL = 'https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip'
path_to_zip = tf.keras.utils.get_file('cats_and_dogs.zip', origin=_URL, extract=True)
PATH = os.path.join(os.path.dirname(path_to_zip), 'cats_and_dogs_filtered')
train_dir = os.path.join(PATH, 'train')
validation_dir = os.path.join(PATH, 'validation')
BATCH_SIZE = 1
IMG_SIZE = (160, 160)
train_ds = tf.keras.utils.image_dataset_from_directory(train_dir,
                                                       shuffle=True,
                                                       batch_size=BATCH_SIZE,
                                                       image_size=IMG_SIZE)
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomRotation(tf.random.uniform(shape=(), minval=40, maxval=90)),
])
for image, _ in train_ds.take(1):
    plt.figure(figsize=(10, 10))
    first_image = image[0]
    for i in range(9):
        ax = plt.subplot(3, 3, i + 1)
        augmented_image = data_augmentation(tf.expand_dims(first_image, 0), training=True)
        plt.imshow(augmented_image[0] / 255)
        plt.axis('off')
I just started using the GPU version of TensorFlow, hoping it would speed up the training of my feed-forward neural networks. I am able to train on my GPU (GTX 1080 Ti), but unfortunately, with my current implementation, it is not notably faster than the same training on my CPU (i7-8700K). During training the GPU is barely utilized, which makes me suspect that the bottleneck in my implementation is how the data is copied from the host to the device using feed_dict.
I’ve heard that TensorFlow has something called the “tf.data” pipeline which is supposed to make it easier and faster to feed data to GPUs etc. However I have not been able to find any simple examples where this concept is implemented into multilayer perceptron training as a replacement for feed_dict.
Is anyone aware of such an example and can point me to it? Preferably as simple as possible since I’m new to TensorFlow in general. Or is there something else I should change in my current implementation to make it more efficient? I’m pasting the code I have here:
import tensorflow as tf
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
tf.reset_default_graph()
import time
# Function for iris dataset.
def get_iris_data():
    iris = datasets.load_iris()
    data = iris["data"]
    target = iris["target"]
    # Convert to one-hot vectors
    num_labels = len(np.unique(target))
    all_Y = np.eye(num_labels)[target]
    return train_test_split(data, all_Y, test_size=0.33, random_state=89)

# Function which initializes tensorflow weights & biases for feed-forward NN.
def InitWeights(LayerSizes):
    with tf.device('/gpu:0'):
        # Make tf placeholders for network inputs and outputs.
        X = tf.placeholder(shape=(None, LayerSizes[0]),
                           dtype=tf.float32,
                           name='InputData')
        y = tf.placeholder(shape=(None, LayerSizes[-1]),
                           dtype=tf.float32,
                           name='OutputData')
        # Initialize weights and biases.
        W = {}; b = {};
        for ii in range(len(LayerSizes)-1):
            layername = 'layer%s' % ii
            with tf.variable_scope(layername):
                ny = LayerSizes[ii]
                nx = LayerSizes[ii+1]
                # Weights (initialized with xavier initialization).
                W['Weights_'+layername] = tf.get_variable(
                    name='Weights_'+layername,
                    shape=(ny, nx),
                    initializer=tf.contrib.layers.xavier_initializer(),
                    dtype=tf.float32
                )
                # Bias (initialized with xavier initialization).
                b['Bias_'+layername] = tf.get_variable(
                    name='Bias_'+layername,
                    shape=(nx),
                    initializer=tf.contrib.layers.xavier_initializer(),
                    dtype=tf.float32
                )
    return W, b, X, y
# Function for forward propagation of NN.
def FeedForward(X, W, b):
    with tf.device('/gpu:0'):
        # Initialize 'a' of first layer to the placeholder of the network input.
        a = X
        # Loop over all layers of the network.
        for ii in range(len(W)):
            # Use name of each layer as index.
            layername = 'layer%s' % ii
            ## Weighted sum: z = input*W + b
            z = tf.add(tf.matmul(a, W['Weights_'+layername], name='WeightedSum_z_'+layername), b['Bias_'+layername])
            ## Pass through activation fcn: a = h(z)
            if ii == len(W)-1:
                a = z
            else:
                a = tf.nn.relu(z, name='activation_a_'+layername)
    return a
if __name__ == "__main__":
    # Import data
    train_X, test_X, train_y, test_y = get_iris_data()
    # Define network size [ninputs-by-256-by-outputs]
    LayerSizes = [4, 256, 3]
    # Initialize weights and biases.
    W, b, X, y = InitWeights(LayerSizes)
    # Define loss function to optimize.
    yhat = FeedForward(X, W, b)
    loss = tf.reduce_sum(tf.square(y - yhat), reduction_indices=[0])
    # Define optimizer to use when minimizing loss function.
    all_variables = tf.trainable_variables()
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.0001)
    train_op = optimizer.minimize(loss, var_list=all_variables)
    # Start tf session and initialize variables.
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    # Train 10000 minibatches and time how long it takes.
    t0 = time.time()
    for i in range(10000):
        ObservationsToUse = np.random.choice(len(train_X), 32)
        X_minibatch = train_X[ObservationsToUse, :]
        y_minibatch = train_y[ObservationsToUse, :]
        sess.run(train_op, feed_dict={X: X_minibatch, y: y_minibatch})
    t1 = time.time()
    print('Training took %0.2f seconds' % (t1-t0))
    sess.close()
The speed might be low because you are using placeholders: at every step, NumPy data is fed into the placeholders and converted to tensors of the graph, which incurs host-to-device copies.
By using tf.data.Dataset, you can build a pipeline that feeds data directly into the graph without placeholders. It is fast, scalable, and has a number of transformation functions to play around with.
with np.load("/var/data/training_data.npy") as data:
    features = data["features"]
    labels = data["labels"]

# Assume that each row of `features` corresponds to the same row as `labels`.
assert features.shape[0] == labels.shape[0]
dataset = tf.data.Dataset.from_tensor_slices((features, labels))
Some useful functions :
dataset = dataset.shuffle(buffer_size=10000)
dataset = dataset.batch(32) # Creating batches
dataset = dataset.repeat(num_epochs) # repeat the dataset 'N' times
iterator = dataset.make_one_shot_iterator() # Create an iterator to retrieve batches of data
X, Y = iterator.get_next()
Here, 32 is the batch size.
In your case:
dataset = tf.data.Dataset.from_tensor_slices((data, targets))
Hence, there is no need for placeholders. Simply run:
session.run(train_op)  # no feed_dict!
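For completeness, here is a minimal sketch of how your iris training loop could look with tf.data instead of feed_dict. It is meant to run inside your existing script (after its imports and function definitions) and reuses your get_iris_data, InitWeights and FeedForward functions as-is; the placeholders returned by InitWeights are simply ignored:
train_X, test_X, train_y, test_y = get_iris_data()
LayerSizes = [4, 256, 3]
dataset = tf.data.Dataset.from_tensor_slices(
    (train_X.astype(np.float32), train_y.astype(np.float32)))
dataset = dataset.shuffle(buffer_size=1000).repeat().batch(32)
iterator = dataset.make_one_shot_iterator()
X, y = iterator.get_next()              # these tensors replace the two placeholders
W, b, _, _ = InitWeights(LayerSizes)    # ignore the returned placeholders
yhat = FeedForward(X, W, b)
loss = tf.reduce_sum(tf.square(y - yhat), reduction_indices=[0])
train_op = tf.train.GradientDescentOptimizer(0.0001).minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(10000):
        sess.run(train_op)              # no feed_dict needed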
Recently I have been studying GANs, and I am using one to generate MNIST images. My environment is Ubuntu 16.04, TensorFlow, Python 3.
The code runs without any error, but the network seems to learn nothing: after training, the output image is still noise.
First I designed a generator network: the input is 784-dimensional noise, which passes through a hidden layer with a leaky ReLU and generates a 784-dimensional image.
Then I designed a discriminator network: the input is a real or fake image, which passes through a hidden layer with a leaky ReLU, and the output is a 1-dimensional logit.
Then I defined the generator loss and the discriminator loss and trained the generator and the discriminator. It runs, but the result shows the network learns nothing and the loss does not converge.
import tensorflow as tf
import numpy as np
import tensorflow.contrib.slim as slim
import matplotlib.pyplot as plt
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/home/zyw/data/tensor_mnist-master/MNIST_data/",one_hot=True)
batch_size = 100
G_in = tf.placeholder(tf.float32,[None,784])
G_h1 = tf.layers.dense(G_in, 128)
G_h1 = tf.maximum(0.01 * G_h1, G_h1)
G_out = tf.tanh(tf.layers.dense(G_h1, 784))
real = tf.placeholder(tf.float32,[None,784])
Dl0 = tf.layers.dense(G_out, 128)
Dl0 = tf.maximum(0.01 * Dl0, Dl0)
p0 = tf.layers.dense(Dl0, 1)
Dl1 = tf.layers.dense(real, 128)
Dl1 = tf.maximum(0.01 * Dl1, Dl1)
p1 = tf.layers.dense(Dl1, 1)
G_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits =p0,labels=tf.ones_like(p0)*0.9))
D_real_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits =p1,labels=tf.ones_like(p1)*0.9))
D_fake_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits =p0,labels=tf.zeros_like(p0)))
D_total_loss = tf.add(D_fake_loss,D_real_loss)
G_train = tf.train.AdamOptimizer(0.01).minimize(G_loss)
D_train = tf.train.AdamOptimizer(0.01).minimize(D_total_loss)
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for i in range(1000):
        mnist_data, _ = mnist.train.next_batch(batch_size)
        # noise_org = tf.random_normal([batch_size,784],stddev = 0.1,dtype = tf.float32)
        noise_org = np.random.randn(batch_size, 784)
        a, b, dloss = sess.run([D_real_loss, D_fake_loss, D_total_loss, G_train, D_train], feed_dict={G_in: noise_org, real: mnist_data})[:3]
        if i % 100 == 0:
            print(a, b, dloss)
    # test_generative_image
    noise_org = np.random.randn(1, 784)
    image = sess.run(G_out, feed_dict={G_in: noise_org})
    outimage = tf.reshape(image, [28, 28])
    plt.imshow(outimage.eval(), cmap='gray')
    plt.show()
print('ok')
the result is:
0.80509 0.63548 1.44057
0.33512 0.20223 0.53735
0.332536 0.97737 1.30991
0.328048 0.814452 1.1425
0.326688 0.411907 0.738596
0.325864 0.570807 0.896671
0.325575 0.970406 1.29598
0.325421 1.02487 1.35029
0.325222 1.34089 1.66612
0.325217 0.747129 1.07235
I have added the modified code with comments where I made the changes, and I have described the changes below.
import tensorflow as tf
import numpy as np
import tensorflow.contrib.slim as slim
import matplotlib.pyplot as plt
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/home/zyw/data/tensor_mnist-master/MNIST_data/",one_hot=True)
batch_size = 100
# define the generator function
def generator(input):
    G_h1 = tf.layers.dense(input, 128)
    # G_h1 = tf.maximum(0.01 * G_h1, G_h1)
    G_out = tf.sigmoid(tf.layers.dense(G_h1, 784))  # sigmoid function added
    return G_out

# define the discriminator function
def discriminator(input):
    Dl0 = tf.layers.dense(input, 128)
    # Dl0 = tf.maximum(0.01 * Dl0, Dl0)
    p0 = tf.layers.dense(Dl0, 1)
    return p0

# Generator
with tf.variable_scope('G'):
    G_in = tf.placeholder(tf.float32, [None, 784])
    G_out = generator(G_in)

real = tf.placeholder(tf.float32, [None, 784])

# Discriminator that takes the real data
with tf.variable_scope('D'):
    D1 = discriminator(real)

# Discriminator that takes fake data
with tf.variable_scope('D', reuse=True):  # need to use the same copy of the discriminator
    D2 = discriminator(G_out)

G_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D2, labels=tf.ones_like(D2)))
D_real_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D1, labels=tf.ones_like(D1)))
D_fake_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D2, labels=tf.zeros_like(D2)))
D_total_loss = tf.add(D_fake_loss, D_real_loss)

vars = tf.trainable_variables()  # all trainable variables
d_training_vars = [v for v in vars if v.name.startswith('D/')]  # variables associated with the discriminator
g_training_vars = [v for v in vars if v.name.startswith('G/')]  # variables associated with the generator

G_train = tf.train.AdamOptimizer(0.001).minimize(G_loss, var_list=g_training_vars)  # only train the variables associated with the generator
D_train = tf.train.AdamOptimizer(0.001).minimize(D_total_loss, var_list=d_training_vars)  # only train the variables associated with the discriminator
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for i in range(1000):
        mnist_data, _ = mnist.train.next_batch(batch_size)
        # noise_org = tf.random_normal([batch_size,784],stddev = 0.1,dtype = tf.float32)
        noise_org = np.random.randn(batch_size, 784)
        a, b, dloss = sess.run([D_real_loss, D_fake_loss, D_total_loss, G_train, D_train], feed_dict={G_in: noise_org, real: mnist_data})[:3]
        if i % 100 == 0:
            print(a, b, dloss)
    # test_generative_image
    noise_org = np.random.randn(1, 784)
    image = sess.run(G_out, feed_dict={G_in: noise_org})
    outimage = tf.reshape(image, [28, 28])
    plt.imshow(outimage.eval(), cmap='gray')
    plt.show()
print('ok')
A few points you should note when implementing a GAN:
You need to use the same copy of the discriminator (i.e. share the same weights) when computing the discriminator loss (in your case, Dl0 and Dl1 should share the same parameters).
The generator's output activation should be sigmoid, not tanh, since the generator's output should vary only between 0 and 1 (it is an image).
When training the discriminator, you should only train the variables associated with the discriminator. Likewise, when training the generator, you should only train the variables associated with the generator.
Sometimes it is important to make sure that the discriminator is more powerful than the generator, as otherwise it would not have sufficient capacity to learn to distinguish accurately between generated and real samples.
These are only the basics you should note; there are many other aspects to consider when developing a GAN. You can get a good introduction to GANs by reading the following two articles.
http://blog.aylien.com/introduction-generative-adversarial-networks-code-tensorflow/
http://blog.evjang.com/2016/06/generative-adversarial-nets-in.html
Hope this helps.
I am generally struggling with indexing tensors in tensorflow.
I have image data and additional scalar data. I can only use a single placeholder to input all the data to a Neural Network.
The images (img) are numpy arrays with shape (84,84,3) and I have data a with shape (2) and b with shape (1).
Now I create a single sample
sample = np.reshape(np.array([img,a,b]),(3,1)) #shape (3,1)
The placeholder is
input = tf.placeholder(dtype=tf.float32,shape=[None] + list(sample.shape))
Now when TF reads a batch of samples I would like to retrieve the batch of images, the batch of a, and the batch of b, because they need to be input in different locations in the Neural Network.
Here is a minimal example:
import tensorflow as tf
from tensorflow.contrib import layers
import numpy as np
#Numpy
img = np.random.rand(84,84,3)
a = np.random.rand(2)
b = np.random.rand(1)
sample = np.reshape(np.array([img,a,b]),(3,1)) #shape (3,1)
batch = np.repeat(np.expand_dims(sample,axis=0),32,axis=0) #shape (32,3,1)
#TF
input = tf.placeholder(dtype=tf.float32,shape=[None] + list(sample.shape))
#TODO:
tf_img = tf.#get image batch from input
tf_a = tf.#get a batch from input
tf_b = tf.#get b batch from input
out = layers.convolution2d(tf_img,num_outputs=64,kernel_size=8,stride=2,activation_fn=tf.nn.relu)
out = layers.flatten(out)
out = tf.concat([out,tf_a,tf_b])
out = layers.fully_connected(out,10,activation_fn=tf.nn.relu)
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    _ = sess.run(out, feed_dict={input: batch})
How can I extract the individual parts of the input from a tensor with shape (?, 3, 1), use the image data to create an embedding, and concatenate the other two parts to that output embedding?
Is there a better way to input the data? My only constraint is that it has to be a single placeholder.
Here's a complete example for my comment above:
import numpy as np
import tensorflow as tf
im_height = 84
im_width = 84
im_channels = 3
a_len = 2
b_len = 1
np_img = np.random.rand(im_height, im_width, im_channels)
np_a = np.random.rand(a_len)
np_b = np.random.rand(b_len)
# flatten the input and concatenate to a single 1D numpy array
np_sample = np.concatenate((np_img.reshape(-1), np_a.reshape(-1), np_b.reshape(-1)), axis=0)
# construct a pseudo batch
np_batch = np.repeat(np_sample[np.newaxis, :], 32, axis=0)
tf_batch = tf.placeholder(shape=(None, im_height*im_width*im_channels + a_len + b_len), dtype=tf.float32)
img_stop = im_height*im_width*im_channels
a_stop = img_stop+a_len
# you could also use tf.slice(...) here
tf_img = tf.reshape(tf_batch[:, 0:img_stop], (-1, im_height, im_width, im_channels))
tf_a = tf.reshape(tf_batch[:, img_stop:a_stop], (-1, a_len))
tf_b = tf.reshape(tf_batch[:, a_stop:], (-1, b_len))
with tf.Session() as sess:
    fetch_dict = {'img': tf_img, 'a': tf_a, 'b': tf_b}
    feed_dict = {tf_batch: np_batch}
    res = sess.run(fetch_dict, feed_dict=feed_dict)
    assert(np.isclose(res['img'][0, ...], np_img).all())
    assert(np.isclose(res['a'][0, :], np_a).all())
    assert(np.isclose(res['b'][0, :], np_b).all())
However, this is at least as invasive as adding appropriate placeholders to the code. Additionally, it's much less readable, in my opinion.
The newest TensorFlow seq2seq API includes scheduled sampling:
https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledEmbeddingTrainingHelper
https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledOutputTrainingHelper
The original paper of scheduled sampling can be found here:
https://arxiv.org/abs/1506.03099
I read the paper but I cannot understand the difference between ScheduledEmbeddingTrainingHelper and ScheduledOutputTrainingHelper. The documentation only says ScheduledEmbeddingTrainingHelper is a training helper that adds scheduled sampling while ScheduledOutputTrainingHelper is a training helper that adds scheduled sampling directly to outputs.
I wonder what's the difference between these two helpers?
I contacted the engineer behind this, and he responded:
The output sampler either emits the raw rnn output or the raw ground truth at that time step. The embedding sampler treats the rnn output as logits of a distribution and either emits the embedding lookup of a sampled id from that categorical distribution or the raw ground truth at that time step.
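In (illustrative) code, assuming rnn_output is the decoder cell output at a given step and embedding_matrix is the decoder's embedding table (both names are just for this sketch, not the library internals), the difference boils down to what gets fed in at the next step when the helper decides to sample:
import tensorflow as tf

def next_input_scheduled_output(rnn_output):
    # ScheduledOutputTrainingHelper: the raw RNN output itself becomes the next input
    return rnn_output

def next_input_scheduled_embedding(rnn_output, embedding_matrix):
    # ScheduledEmbeddingTrainingHelper: treat the RNN output as logits, sample a token id
    # from that categorical distribution, and feed the embedding of the sampled id
    sample_ids = tf.squeeze(tf.multinomial(rnn_output, num_samples=1), axis=-1)
    return tf.nn.embedding_lookup(embedding_matrix, sample_ids)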
Here's a basic example of using ScheduledEmbeddingTrainingHelper, with TensorFlow 1.3 and some higher-level tf.contrib APIs. It's a sequence-to-sequence model, where the decoder's initial hidden state is the final hidden state of the encoder. It shows only how to train on a single batch (and apparently the task is "reverse this sequence"). For actual training tasks, I suggest looking at tf.contrib.learn APIs such as learn_runner, Experiment and tf.estimator.Estimator.
import tensorflow as tf
import numpy as np
from tensorflow.python.layers.core import Dense
vocab_size = 7
embedding_size = 5
lstm_units = 10
src_batch = np.array([[1, 2, 3], [4, 5, 6]])
trg_batch = np.array([[3, 2, 1], [6, 5, 4]])
# *_seq will have shape (2, 3), *_seq_len will have shape (2)
source_seq = tf.placeholder(shape=(None, None), dtype=tf.int32)
target_seq = tf.placeholder(shape=(None, None), dtype=tf.int32)
source_seq_len = tf.placeholder(shape=(None,), dtype=tf.int32)
target_seq_len = tf.placeholder(shape=(None,), dtype=tf.int32)
# add Start of Sequence (SOS) tokens to each sequence
batch_size, sequence_size = tf.unstack(tf.shape(target_seq))
sos_slice = tf.zeros([batch_size, 1], dtype=tf.int32) # 0 = start of sentence token
decoder_input = tf.concat([sos_slice, target_seq], axis=1)
embedding_matrix = tf.get_variable(
    name="embedding_matrix",
    shape=[vocab_size, embedding_size],
    dtype=tf.float32)
source_seq_embedded = tf.nn.embedding_lookup(embedding_matrix, source_seq) # shape=(2, 3, 5)
decoder_input_embedded = tf.nn.embedding_lookup(embedding_matrix, decoder_input) # shape=(2, 4, 5)
unused_encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
    tf.contrib.rnn.LSTMCell(lstm_units),
    source_seq_embedded,
    sequence_length=source_seq_len,
    dtype=tf.float32)
# Decoder:
# At each time step t and for each sequence in the batch, we get x_t by either
# (1) sampling from the distribution output_layer(t-1), or
# (2) reading from decoder_input_embedded.
# We do (1) with probability sampling_probability and (2) with 1 - sampling_probability.
# Using sampling_probability=0.0 is equivalent to using TrainingHelper (no sampling).
# Using sampling_probability=1.0 is equivalent to doing inference,
# where we don't supervise the decoder at all: output at t-1 is the input at t.
sampling_prob = tf.Variable(0.0, dtype=tf.float32)
helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
    decoder_input_embedded,
    target_seq_len,
    embedding_matrix,
    sampling_probability=sampling_prob)
output_layer = Dense(vocab_size)
decoder = tf.contrib.seq2seq.BasicDecoder(
    tf.contrib.rnn.LSTMCell(lstm_units),
    helper,
    encoder_state,
    output_layer=output_layer)
outputs, state, seq_len = tf.contrib.seq2seq.dynamic_decode(decoder)
loss = tf.contrib.seq2seq.sequence_loss(
    logits=outputs.rnn_output,
    targets=target_seq,
    weights=tf.ones(trg_batch.shape))
train_op = tf.contrib.layers.optimize_loss(
    loss=loss,
    global_step=tf.contrib.framework.get_global_step(),
    optimizer=tf.train.AdamOptimizer,
    learning_rate=0.001)
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    _, _loss = session.run([train_op, loss], {
        source_seq: src_batch,
        target_seq: trg_batch,
        source_seq_len: [3, 3],
        target_seq_len: [3, 3],
        sampling_prob: 0.5
    })
    print("Loss: " + str(_loss))
For ScheduledOutputTrainingHelper, I would expect to just swap out the helper and use:
helper = tf.contrib.seq2seq.ScheduledOutputTrainingHelper(
    target_seq,
    target_seq_len,
    sampling_probability=sampling_prob)
However this gives an error, since the LSTM cell expects a multidimensional input per timestep (of shape (batch_size, input_dims)). I will raise an issue in GitHub to find out if this is a bug, or there's some other way to use ScheduledOutputTrainingHelper.
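In the meantime, here is a sketch of the setting where I would expect ScheduledOutputTrainingHelper to fit naturally (my assumption, not something the docs state): decoder inputs and outputs living in the same continuous space, so the raw cell output can be fed straight back in as the next input. All names and sizes below are illustrative only.
import tensorflow as tf

feat_dim = 8        # dimensionality of each (continuous) decoder input/output vector
batch_size = 2
decoder_inputs = tf.placeholder(shape=(None, None, feat_dim), dtype=tf.float32)
decoder_seq_len = tf.placeholder(shape=(None,), dtype=tf.int32)
sampling_prob = tf.constant(0.5)

cell = tf.contrib.rnn.LSTMCell(feat_dim)  # cell output size must equal the input size
helper = tf.contrib.seq2seq.ScheduledOutputTrainingHelper(
    decoder_inputs, decoder_seq_len, sampling_probability=sampling_prob)
decoder = tf.contrib.seq2seq.BasicDecoder(
    cell, helper, initial_state=cell.zero_state(batch_size, tf.float32))
outputs, state, lengths = tf.contrib.seq2seq.dynamic_decode(decoder)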
This might also help you. This is for the case where you want to do scheduled sampling at each decoding step separately.
import tensorflow as tf
import numpy as np
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops.distributions import categorical
from tensorflow.python.ops.distributions import bernoulli
batch_size = 64
vocab_size = 50000
emb_dim = 128
output = tf.get_variable('output',
    initializer=tf.constant(np.random.rand(batch_size, vocab_size)))
base_next_inputs = tf.get_variable('input',
    initializer=tf.constant(np.random.rand(batch_size, emb_dim)))
embedding = tf.get_variable('embedding',
    initializer=tf.constant(np.random.rand(vocab_size, emb_dim)))

select_sampler = bernoulli.Bernoulli(probs=0.99, dtype=tf.bool)
select_sample = select_sampler.sample(sample_shape=batch_size, seed=123)
sample_id_sampler = categorical.Categorical(logits=output)
sample_ids = array_ops.where(
    select_sample,
    sample_id_sampler.sample(seed=123),
    gen_array_ops.fill([batch_size], -1))

where_sampling = math_ops.cast(
    array_ops.where(sample_ids > -1), tf.int32)
where_not_sampling = math_ops.cast(
    array_ops.where(sample_ids <= -1), tf.int32)
sample_ids_sampling = array_ops.gather_nd(sample_ids, where_sampling)
inputs_not_sampling = array_ops.gather_nd(base_next_inputs,
    where_not_sampling)
sampled_next_inputs = tf.nn.embedding_lookup(embedding,
    sample_ids_sampling)

base_shape = array_ops.shape(base_next_inputs)
result1 = array_ops.scatter_nd(indices=where_sampling,
    updates=sampled_next_inputs, shape=base_shape)
result2 = array_ops.scatter_nd(indices=where_not_sampling,
    updates=inputs_not_sampling, shape=base_shape)
result = result1 + result2
I used the tensorflow documentation code to make this example.
https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/contrib/seq2seq/python/ops/helper.py
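To sanity-check the snippet, you could run it like this (my addition, not from the documentation) and verify that result mixes sampled embeddings with the untouched base inputs:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    next_inputs, ids = sess.run([result, sample_ids])
    # next_inputs has shape (batch_size, emb_dim); rows where ids > -1 hold the embedding
    # of the sampled id, and the remaining rows are copied from base_next_inputs.
    print(next_inputs.shape, (ids > -1).sum())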