I'm new to python and tensorflow. I'm now testing Improved WGAN code from https://github.com/igul222/improved_wgan_training
After adjusting the code to python 3.6, it still gives "NameError: name 'train_gen' is not defined" when I ran it, although there wasn't warning from pylint.
Can anyone help me with it?
The version of python I'm using is 3.6. There were many syntax differences from 2.7. I've already changed a lot to make it work. And I am running Tensorflow in a virtual environment. Still couldn't figure out this one.
import os, sys
sys.path.append(os.getcwd())
import time
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import sklearn.datasets
import tensorflow as tf
import tflib as lib
import tflib.ops.linear
import tflib.ops.conv2d
import tflib.ops.batchnorm
import tflib.ops.deconv2d
import tflib.save_images
import tflib.mnist
import tflib.plot
MODE = 'wgan-gp' # dcgan, wgan, or wgan-gp
DIM = 64 # Model dimensionality
BATCH_SIZE = 50 # Batch size
CRITIC_ITERS = 5 # For WGAN and WGAN-GP, number of critic iters per gen iter
LAMBDA = 10 # Gradient penalty lambda hyperparameter
ITERS = 200000 # How many generator iterations to train for
OUTPUT_DIM = 784 # Number of pixels in MNIST (28*28)
lib.print_model_settings(locals().copy())
def LeakyReLU(x, alpha=0.2):
return tf.maximum(alpha*x, x)
def ReLULayer(name, n_in, n_out, inputs):
output = lib.ops.linear.Linear(
name+'.Linear',
n_in,
n_out,
inputs,
initialization='he'
)
return tf.nn.relu(output)
def LeakyReLULayer(name, n_in, n_out, inputs):
output = lib.ops.linear.Linear(
name+'.Linear',
n_in,
n_out,
inputs,
initialization='he'
)
return LeakyReLU(output)
def Generator(n_samples, noise=None):
if noise is None:
noise = tf.random_normal([n_samples, 128])
output = lib.ops.linear.Linear('Generator.Input', 128, 4*4*4*DIM, noise)
if MODE == 'wgan':
output = lib.ops.batchnorm.Batchnorm('Generator.BN1', [0], output)
output = tf.nn.relu(output)
output = tf.reshape(output, [-1, 4*DIM, 4, 4])
output = lib.ops.deconv2d.Deconv2D('Generator.2', 4*DIM, 2*DIM, 5, output)
if MODE == 'wgan':
output = lib.ops.batchnorm.Batchnorm('Generator.BN2', [0,2,3], output)
output = tf.nn.relu(output)
output = output[:,:,:7,:7]
output = lib.ops.deconv2d.Deconv2D('Generator.3', 2*DIM, DIM, 5, output)
if MODE == 'wgan':
output = lib.ops.batchnorm.Batchnorm('Generator.BN3', [0,2,3], output)
output = tf.nn.relu(output)
output = lib.ops.deconv2d.Deconv2D('Generator.5', DIM, 1, 5, output)
output = tf.nn.sigmoid(output)
return tf.reshape(output, [-1, OUTPUT_DIM])
def Discriminator(inputs):
output = tf.reshape(inputs, [-1, 1, 28, 28])
output = lib.ops.conv2d.Conv2D('Discriminator.1',1,DIM,5,output,stride=2)
output = LeakyReLU(output)
output = lib.ops.conv2d.Conv2D('Discriminator.2', DIM, 2*DIM, 5, output, stride=2)
if MODE == 'wgan':
output = lib.ops.batchnorm.Batchnorm('Discriminator.BN2', [0,2,3], output)
output = LeakyReLU(output)
output = lib.ops.conv2d.Conv2D('Discriminator.3', 2*DIM, 4*DIM, 5, output, stride=2)
if MODE == 'wgan':
output = lib.ops.batchnorm.Batchnorm('Discriminator.BN3', [0,2,3], output)
output = LeakyReLU(output)
output = tf.reshape(output, [-1, 4*4*4*DIM])
output = lib.ops.linear.Linear('Discriminator.Output', 4*4*4*DIM, 1, output)
return tf.reshape(output, [-1])
real_data = tf.placeholder(tf.float32, shape=[BATCH_SIZE, OUTPUT_DIM])
fake_data = Generator(BATCH_SIZE)
disc_real = Discriminator(real_data)
disc_fake = Discriminator(fake_data)
gen_params = lib.params_with_name('Generator')
disc_params = lib.params_with_name('Discriminator')
if MODE == 'wgan':
gen_cost = -tf.reduce_mean(disc_fake)
disc_cost = tf.reduce_mean(disc_fake) - tf.reduce_mean(disc_real)
gen_train_op = tf.train.RMSPropOptimizer(
learning_rate=5e-5
).minimize(gen_cost, var_list=gen_params)
disc_train_op = tf.train.RMSPropOptimizer(
learning_rate=5e-5
).minimize(disc_cost, var_list=disc_params)
clip_ops = []
for var in lib.params_with_name('Discriminator'):
clip_bounds = [-.01, .01]
clip_ops.append(
tf.assign(
var,
tf.clip_by_value(var, clip_bounds[0], clip_bounds[1])
)
)
clip_disc_weights = tf.group(*clip_ops)
elif MODE == 'wgan-gp':
gen_cost = -tf.reduce_mean(disc_fake)
disc_cost = tf.reduce_mean(disc_fake) - tf.reduce_mean(disc_real)
alpha = tf.random_uniform(
shape=[BATCH_SIZE,1],
minval=0.,
maxval=1.
)
differences = fake_data - real_data
interpolates = real_data + (alpha*differences)
gradients = tf.gradients(Discriminator(interpolates), [interpolates])[0]
slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients), reduction_indices=[1]))
gradient_penalty = tf.reduce_mean((slopes-1.)**2)
disc_cost += LAMBDA*gradient_penalty
gen_train_op = tf.train.AdamOptimizer(
learning_rate=1e-4,
beta1=0.5,
beta2=0.9
).minimize(gen_cost, var_list=gen_params)
disc_train_op = tf.train.AdamOptimizer(
learning_rate=1e-4,
beta1=0.5,
beta2=0.9
).minimize(disc_cost, var_list=disc_params)
clip_disc_weights = None
elif MODE == 'dcgan':
gen_cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
disc_fake,
tf.ones_like(disc_fake)
))
disc_cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
disc_fake,
tf.zeros_like(disc_fake)
))
disc_cost += tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
disc_real,
tf.ones_like(disc_real)
))
disc_cost /= 2.
gen_train_op = tf.train.AdamOptimizer(
learning_rate=2e-4,
beta1=0.5
).minimize(gen_cost, var_list=gen_params)
disc_train_op = tf.train.AdamOptimizer(
learning_rate=2e-4,
beta1=0.5
).minimize(disc_cost, var_list=disc_params)
clip_disc_weights = None
# For saving samples
fixed_noise = tf.constant(np.random.normal(size=(128, 128)).astype('float32'))
fixed_noise_samples = Generator(128, noise=fixed_noise)
def generate_image(frame, true_dist):
samples = session.run(fixed_noise_samples)
lib.save_images.save_images(
samples.reshape((128, 28, 28)),
'samples_{}.png'.format(frame)
)
# Dataset iterator
train_gen, dev_gen, test_gen = lib.mnist.load(BATCH_SIZE, BATCH_SIZE)
def inf_train_gen():
while True:
for images, targets in train_gen():
yield images
# Train loop
with tf.Session() as session:
session.run(tf.initialize_all_variables())
gen = inf_train_gen()
for iteration in range(ITERS):
start_time = time.time()
if iteration > 0:
_ = session.run(gen_train_op)
if MODE == 'dcgan':
disc_iters = 1
else:
disc_iters = CRITIC_ITERS
for i in range(disc_iters):
_data = gen.__next__()
_disc_cost, _ = session.run(
[disc_cost, disc_train_op],
feed_dict={real_data: _data}
)
if clip_disc_weights is not None:
_ = session.run(clip_disc_weights)
lib.plot.plot('train disc cost', _disc_cost)
lib.plot.plot('time', time.time() - start_time)
# Calculate dev loss and generate samples every 100 iters
if iteration % 100 == 99:
dev_disc_costs = []
for images,_ in dev_gen():
_dev_disc_cost = session.run(
disc_cost,
feed_dict={real_data: images}
)
dev_disc_costs.append(_dev_disc_cost)
lib.plot.plot('dev disc cost', np.mean(dev_disc_costs))
generate_image(iteration, _data)
# Write logs every 100 iters
if (iteration < 5) or (iteration % 100 == 99):
lib.plot.flush()
lib.plot.tick()
This is the section containing the error name.
# Dataset iterator
train_gen, dev_gen, test_gen = lib.mnist.load(BATCH_SIZE, BATCH_SIZE)
def inf_train_gen():
while True:
for images, targets in train_gen():
yield images
And here is the error.
Traceback (most recent call last):
File "<stdin>", line 13, in <module>
File "<stdin>", line 3, in inf_train_gen
NameError: name 'train_gen' is not defined
Attempt 1:
I believe it's just because you are saying for images, targets in train_gen(): when you should be saying for images, targets in train_gen:
In a nutshell, the brackets suggest you are calling a function, which leads Python to raise the exception NameError: name 'train_gen' is not defined because there is no function with the name train_gen defined.
In the future, your code should be minimal, because you have pasted an enormous amount of code which makes it very hard to debug/see what you're doing.
Attempt 2:
Upon second review of the code (this is a good reason why you need to make your examples as small as possible), I have realised that is is possible you are maybe importing this code from elsewhere?
When you are making the first assignment to train_gen this is outside the function scope. It is possible then that when you go to call the function train_gen is no longer defined, which is why you get your error. This can occur for a number of reasons. After having reviewed the code a little there are various issues I can see (bad practice mostly).
It is generally not a good idea to use global variables within a function as you have in inf_train_gen, if a function needs an argument to run properly, then it should be passed as an argument. This is because if we have a problem with a variable (as we do now) we can normally see where this variable comes from and what uses it, but if all function rely on the globally scoped variable, any number of functions could delete it, change it, etc.
Right now I have no idea what has happened to the variable train_gen, I would suggest printing out the variable at different intervals and seeing if you can see which function call is causing issues and in the future stay away from globally scoped variables unless absolutely necessary, it makes it near-impossible to debug.
Related
I am trying to get an actor critic variant of the pendulum running, but I seem to be running into a particular problem.
RuntimeError: Found dtype Double but expected Float
I saw this had come up multiple times before so I have been through those and have attempted to change the data types of my loss (kept in comments) but it is still not working. Could anyone point out how to resolve this so that I can learn from it?
Full code below
import gym, os
import numpy as np
from itertools import count
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
from collections import namedtuple
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
LOG_SIG_MAX = 2
LOG_SIG_MIN = -20
class ActorCritic(nn.Module):
"""
Implementing both heads of the actor critic model
"""
def __init__(self, state_space, action_space):
super(ActorCritic, self).__init__()
self.state_space = state_space
self.action_space = action_space
# HL 1
self.linear1 = nn.Linear(self.state_space, 128)
# HL 2
self.linear2 = nn.Linear(128, 256)
# Outputs
self.critic_head = nn.Linear(256, 1)
self.action_mean = nn.Linear(256, self.action_space)
self.action_std = nn.Linear(256, self.action_space)
# Saving
self.saved_actions = []
self.rewards = []
# Optimizer
self.optimizer = optim.Adam(self.parameters(), lr = 1e-3)
self.eps = np.finfo(np.float32).eps.item()
def forward(self, state):
"""
Forward pass for both actor and critic
"""
# State to Layer 1
l1_output = F.relu(self.linear1(state))
# Layer 1 to Layer 2
l2_output = F.relu(self.linear2(l1_output))
# Layer 2 to Action
mean = self.action_mean(l2_output)
std = self.action_std(l2_output)
std = torch.clamp(std, min=LOG_SIG_MIN, max = LOG_SIG_MAX)
std = std.exp()
# Layer 2 to Value
value_est = self.critic_head(l2_output)
return value_est, mean, std
def select_action(self,state):
state = torch.from_numpy(state).float().unsqueeze(0)
value_est, mean, std = self.forward(state)
value_est = value_est.reshape(-1)
# Make prob Normal dist
dist = Normal(mean, std)
action = dist.sample()
action = torch.tanh(action)
ln_prob = dist.log_prob(action)
ln_prob = ln_prob.sum()
self.saved_actions.append(SavedAction(ln_prob, value_est))
action = action.numpy()
return action[0]
def compute_returns(self, gamma): # This is the error causing code
"""
Calculate losses and do backprop
"""
R = 0
saved_actions = self.saved_actions
policy_losses = []
value_losses = []
returns = []
for r in self.rewards[::-1]:
# Discount value
R = r + gamma*R
returns.insert(0,R)
returns = torch.tensor(returns)
returns = (returns - returns.mean())/(returns.std()+self.eps)
for (log_prob, value), R in zip(saved_actions, returns):
advantage = R - value.item()
advantage = advantage.type(torch.FloatTensor)
policy_losses.append(-log_prob*advantage)
value_losses.append(F.mse_loss(value, torch.tensor([R])))
self.optimizer.zero_grad()
loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
loss = loss.type(torch.FloatTensor)
loss.backward()
self.optimizer.step()
del self.rewards[:]
del self.saved_actions[:]
env = gym.make("Pendulum-v0")
state_space = env.observation_space.shape[0]
action_space = env.action_space.shape[0]
# Train Expert AC
model = ActorCritic(state_space, action_space)
train = True
if train == True:
# Main loop
window = 50
reward_history = []
for ep in count():
state = env.reset()
ep_reward = 0
for t in range(1,1000):
if ep%50 == 0:
env.render()
action = model.select_action(state)
state, reward, done, _ = env.step(action)
model.rewards.append(reward)
ep_reward += reward
if done:
break
print(reward)
model.compute_returns(0.99) # Error begins here
reward_history.append(ep_reward)
# Result information
if ep % 50 == 0:
mean = np.mean(reward_history[-window:])
print(f"Episode: {ep} Last Reward: {ep_reward} Rolling Mean: {mean}")
if np.mean(reward_history[-100:])>199:
print(f"Environment solved at episode {ep}, average run length > 200")
break
Complete error log, some elements redacted for privacy. Originally the loop actor critic and main loop were in separate files. Comments added to appropriate error causing lines.
Traceback (most recent call last):
File "pendulum.py", line 59, in <module>
model.compute_returns(0.99)
File "/home/x/Software/git/x/x/solvers/actorcritic_cont.py", line 121, in compute_returns
loss.backward()
File "/home/x/.local/lib/python3.8/site-packages/torch/_tensor.py", line 255, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/x/.local/lib/python3.8/site-packages/torch/autograd/__init__.py", line 147, in backward
Variable._execution_engine.run_backward(
RuntimeError: Found dtype Double but expected Float
Answering here in case anyone has similar issues in the future.
The output of the reward in OpenAI Gym Pendulum-v0 is a double, so when you compute the return over the episode you need to change that to a float tensor.
I did this just by:
returns = torch.tensor(returns)
returns = (returns - returns.mean())/(returns.std()+self.eps)
returns = returns.type(torch.FloatTensor)
When i calculate inception score, i got NaN most of the time.
Trying to investigate why it happen i found that running the network twice on the same images can lead for some of the images to totally different results (difference greater than 0.9 while the maximum difference can be 1), the images which got high difference changed from run to run.
My GPU is 2080ti, i use Ubuntu with tensorflow=1.13.1.
i try to change drivers, tensorflow version, run form docker, the same problem happen all the time.
I have another server at the university which has the same GPU (2080ti), and when i try to run there the problem disappear.
Thanks for the help.
my script
# Code derived from tensorflow/tensorflow/models/image/imagenet/classify_image.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os.path
import tarfile
import numpy as np
from six.moves import urllib
import tensorflow as tf
import sys
import warnings
from scipy import linalg
MODEL_DIR = '/tmp/imagenet'
DATA_URL = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
softmax = None
pool3 = None
# Call this function with list of images. Each of elements should be a
# numpy array with values ranging from 0 to 255.
def get_features(images):
assert ((images.shape[3]) == 3)
assert (np.max(images) > 10)
assert (np.min(images) >= 0.0)
images = images.astype(np.float32)
bs = 100
sess = tf.get_default_session()
preds = []
for inp in np.array_split(images, round(images.shape[0] / bs)):
sys.stdout.write(".")
sys.stdout.flush()
pred = sess.run(softmax, {'InputTensor:0': inp})
preds.append(pred)
preds = np.concatenate(preds, 0)
return preds
# This function is called automatically.
def _init_inception():
global softmax
global pool3
if not os.path.exists(MODEL_DIR):
os.makedirs(MODEL_DIR)
filename = DATA_URL.split('/')[-1]
filepath = os.path.join(MODEL_DIR, filename)
if not os.path.exists(filepath):
def _progress(count, block_size, total_size):
sys.stdout.write('\r>> Downloading %s %.1f%%' % (
filename, float(count * block_size) / float(total_size) * 100.0))
sys.stdout.flush()
filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress)
print()
statinfo = os.stat(filepath)
print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.')
tarfile.open(filepath, 'r:gz').extractall(MODEL_DIR)
with tf.gfile.GFile(os.path.join(
MODEL_DIR, 'classify_image_graph_def.pb'), 'rb') as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
# Import model with a modification in the input tensor to accept arbitrary
# batch size.
input_tensor = tf.placeholder(tf.float32, shape=[None, None, None, 3],
name='InputTensor')
_ = tf.import_graph_def(graph_def, name='inception_v3',
input_map={'ExpandDims:0': input_tensor})
# Works with an arbitrary minibatch size.
pool3 = tf.get_default_graph().get_tensor_by_name('inception_v3/pool_3:0')
ops = pool3.graph.get_operations()
for op_idx, op in enumerate(ops):
if 'inception_v3' in op.name:
for o in op.outputs:
shape = o.get_shape()
shape = [s.value for s in shape]
new_shape = []
for j, s in enumerate(shape):
if s == 1 and j == 0:
new_shape.append(None)
else:
new_shape.append(s)
o.set_shape(tf.TensorShape(new_shape))
w = tf.get_default_graph().get_operation_by_name("inception_v3/softmax/logits/MatMul").inputs[1]
logits = tf.matmul(tf.squeeze(pool3, [1, 2]), w)
softmax = tf.nn.softmax(logits)
_init_inception()
if __name__ =='__main__':
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
with tf.Session() as sess:
preds1 = get_features(x_train)
preds2 = get_features(x_train)
print(abs(preds1-preds2).max())
I am currently playing around and learning about distributed tensorflow.
I recently created a cluster with One GPU server(two cards) - One CPU server
I was browsing through various articles and in the TensorFlow distributed guide I saw that distribution happened across cards by explicitly calling them with names.
https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py
but here no cluster are being created.
Can i create a TensorFlow cluster and then specify which card the code should run on?
If yes, does the below look correct?
In one github question who's link i dont have right now but the code below, the card is specified under with tf.device(replica_device_setter) but when i try to do that my code throws an error stating "Cannot assign a device for operation 'dummy_queue_Close_1': Could not satisfy explicit device specification '/job:ps/task:0/device:GPU:0' because no supported kernel for GPU devices is available."
Is this because i am assinging tasks which were supposed to happen on a CPU but instead as i gave with tf.device('/gpu:0/') it throws the error ?
Also I cant share my official code but it looks very similar to the below code which i took for reference.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy
import tensorflow as tf
tf.app.flags.DEFINE_string("ps_hosts", "localhost:2222", "...")
tf.app.flags.DEFINE_string("worker_hosts", "localhost:2223", "...")
tf.app.flags.DEFINE_string("job_name", "", "...")
tf.app.flags.DEFINE_integer("task_index", 0, "...")
tf.app.flags.DEFINE_integer('gpu_cards', 4, 'Number of GPU cards in a machine to use.')
FLAGS = tf.app.flags.FLAGS
def dense_to_one_hot(labels_dense, num_classes = 10) :
"""Convert class labels from scalars to one-hot vectors."""
num_labels = labels_dense.shape[0]
index_offset = numpy.arange(num_labels) * num_classes
labels_one_hot = numpy.zeros((num_labels, num_classes))
labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
return labels_one_hot
def run_training(server, cluster_spec, num_workers) :
is_chief = (FLAGS.task_index == 0)
with tf.Graph().as_default():
with tf.device(tf.train.replica_device_setter(cluster = cluster_spec)) :
with tf.device('/cpu:0') :
global_step = tf.get_variable('global_step', [],
initializer = tf.constant_initializer(0), trainable = False)
with tf.device('/gpu:%d' % (FLAGS.task_index % FLAGS.gpu_cards)) :
# Create the model
x = tf.placeholder("float", [None, 784])
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.nn.softmax(tf.matmul(x, W) + b)
# Define loss and optimizer
y_ = tf.placeholder("float", [None, 10])
cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
opt = tf.train.GradientDescentOptimizer(0.01)
opt = tf.train.SyncReplicasOptimizer(opt, replicas_to_aggregate = num_workers,
replica_id = FLAGS.task_index, total_num_replicas = num_workers)
train_step = opt.minimize(cross_entropy, global_step = global_step)
# Test trained model
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
init_token_op = opt.get_init_tokens_op()
chief_queue_runner = opt.get_chief_queue_runner()
init = tf.initialize_all_variables()
sv = tf.train.Supervisor(is_chief = is_chief,
init_op = init,
global_step = global_step)
# Create a session for running Ops on the Graph.
config = tf.ConfigProto(allow_soft_placement = True)
sess = sv.prepare_or_wait_for_session(server.target, config = config)
if is_chief:
sv.start_queue_runners(sess, [chief_queue_runner])
sess.run(init_token_op)
for i in range(100000):
source_data = numpy.random.normal(loc = 0.0, scale = 1.0, size = (100, 784))
labels_dense = numpy.clip(numpy.sum(source_data, axis = 1) / 5 + 5, 0, 9).astype(int)
labels_one_hot = dense_to_one_hot(labels_dense)
_, cost, acc, step = sess.run([train_step, cross_entropy, accuracy, global_step], feed_dict = { x: source_data, y_ : labels_one_hot })
print("[%d]: cost=%.2f, accuracy=%.2f" % (step, cost, acc))
def main(_) :
ps_hosts = FLAGS.ps_hosts.split(",")
worker_hosts = FLAGS.worker_hosts.split(",")
num_workers = len(worker_hosts)
print("gup_cards=%d; num_worders=%d" % (FLAGS.gpu_cards, num_workers))
cluster_spec = tf.train.ClusterSpec({ "ps":ps_hosts, "worker" : worker_hosts })
server = tf.train.Server(cluster_spec, job_name = FLAGS.job_name, task_index = FLAGS.task_index)
if FLAGS.job_name == "ps":
server.join()
elif FLAGS.job_name == "worker" :
run_training(server, cluster_spec, num_workers)
if __name__ == '__main__' :
tf.app.run()
I found a way to do this it sounds very simple, and it is very simple.
I created a TensorFlow cluster in the same way and passed the n_workers parameter to the cluster, and I called different instances of the code with an extra parameter for CUDA_VISIBLE_DEVICES.
CUDA_VISIBLE_DEVICES is an environment variable which can be used to restrict the vision of TensorFlow or any DL framework to a limited number of cards.
CUDA_VISIBLE_DEVICES value can range from -1 to n (where n is the number of GPUs).
-1 indicates no cards to use
n indicates nth card to use
I hope someone who is looking for a similar answer can find this useful.
I have been using TensorFlow for a reasonable length of time now. and believed I had a thorough understanding of how a TensorFlow graph works and executes within a session. However, I have written all of my TensorFlow models in a script-like fashion as such:
import tensorflow as tf
import DataWorker
import Constants
x = tf.placeholder(tf.float32, [None, Constants.sequenceLength, DataWorker.numFeatures])
y = tf.placeholder(tf.float32, [None, 1])
xTensors = tf.unstack(x, axis=1) # [seqLength tensors of shape (batchSize, numFeatures)]
W = tf.Variable(tf.random_normal([Constants.numHidden, 1])) # Weighted matrix
b = tf.Variable(tf.random_normal([1])) # Bias
cell = tf.contrib.rnn.BasicLSTMCell(Constants.numHidden, forget_bias=Constants.forgetBias)
outputs, finalState = tf.nn.static_rnn(cell, xTensors, dtype=tf.float32)
# predictions = [tf.add(tf.matmul(output, W), b) for output in outputs] # List of predictions after each time step
prediction = tf.add(tf.matmul(outputs[-1], W), b) # Prediction after final time step
prediction = tf.tanh(prediction) # Activation
mse = tf.losses.mean_squared_error(predictions=prediction, labels=y) # Mean loss over entire batch
accuracy = tf.reduce_mean(1 - (tf.abs(y - prediction) / DataWorker.labelRange)) # Accuracy over entire batch
optimiser = tf.train.AdamOptimizer(Constants.learningRate).minimize(mse) # Backpropagation
with tf.Session() as session:
session.run(tf.global_variables_initializer())
# #############################################
# TRAINING
# #############################################
for epoch in range(Constants.numEpochs):
print("***** EPOCH:", epoch + 1, "*****\n")
IDPointer, TSPointer = 0, 0 # Pointers to current ID and timestamp
epochComplete = False
batchNum = 0
while not epochComplete:
batchNum += 1
batchX, batchY, IDPointer, TSPointer, epochComplete = DataWorker.generateBatch(IDPointer, TSPointer, isTraining=True)
dict = {x: batchX, y: batchY}
session.run(optimiser, dict)
if batchNum % 1000 == 0 or epochComplete:
batchLoss = session.run(mse, dict)
batchAccuracy = session.run(accuracy, dict)
print("Iteration:", batchNum)
print(batchLoss)
print(str("%.2f" % (batchAccuracy * 100) + "%\n"))
# #############################################
# TESTING
# #############################################
testX, testY, _, _, _ = DataWorker.generateBatch(0, 0, isTraining=False)
testAccuracy = session.run(accuracy, {x: testX, y: testY})
print("Testing Accuracy:", str("%.2f" % (testAccuracy * 100) + "%"))
But now, for practicality and readability, I want to implement my model as a class, but have encountered many problems with initializing my variables, etc.
This is the closest I have got to implementing the above example using my own LSTM class
Model.py
import tensorflow as tf
import Constants
import DataWorker # Remove this dependency
class LSTM():
"""docstring."""
def __init__(self,
inputDimensionList,
outputDimensionList,
numLayers=Constants.numLayers,
numHidden=Constants.numHidden,
learningRate=Constants.learningRate,
forgetBias=Constants.forgetBias
):
"""docstring."""
self.batchInputs = tf.placeholder(tf.float32, [None] + inputDimensionList)
self.batchLabels = tf.placeholder(tf.float32, [None] + outputDimensionList)
self.weightedMatrix = tf.Variable(tf.random_normal([numHidden] + outputDimensionList))
self.biasMatrix = tf.Variable(tf.random_normal(outputDimensionList))
self.cell = tf.contrib.rnn.BasicLSTMCell(numHidden, forget_bias=forgetBias)
self.numLayers = numLayers
self.numHidden = numHidden
self.learningRate = learningRate
self.forgetBias = forgetBias
self.batchDict = {}
self.batchInputTensors = None
self.batchOutputs = None # All needed as instance variables?
self.batchFinalStates = None
self.batchPredictions = None
self.batchLoss = None
self.batchAccuracy = None
self.initialised = False
self.session = tf.Session()
# Take in activation, loss and optimiser FUNCTIONS as args
def execute(self, command):
"""docstring."""
return self.session.run(command, self.batchDict)
def setBatchDict(self, inputs, labels):
"""docstring."""
self.batchDict = {self.batchInputs: inputs, self.batchLabels: labels}
self.batchInputTensors = tf.unstack(self.batchInputs, axis=1)
def processBatch(self):
"""docstring."""
self.batchOutputs, self.batchFinalState = tf.nn.static_rnn(self.cell, self.batchInputTensors, dtype=tf.float32)
pred = tf.tanh(tf.add(tf.matmul(self.batchOutputs[-1], self.weightedMatrix), self.biasMatrix))
mse = tf.losses.mean_squared_error(predictions=pred, labels=self.batchLabels)
optimiser = tf.train.AdamOptimizer(self.learningRate).minimize(mse)
if not self.initialised:
self.session.run(tf.global_variables_initializer())
self.initialised = True
with tf.variable_scope("model") as scope:
if self.initialised:
scope.reuse_variables()
self.execute(optimiser)
self.batchPredictions = self.execute(pred)
self.batchLoss = self.execute(tf.losses.mean_squared_error(predictions=self.batchPredictions, labels=self.batchLabels))
self.batchAccuracy = self.execute(tf.reduce_mean(1 - (tf.abs(self.batchLabels - self.batchPredictions) / DataWorker.labelRange)))
return self.batchPredictions, self.batchLabels, self.batchLoss, self.batchAccuracy
def kill(self):
"""docstring."""
self.session.close()
This class is quite messy, especially processBatch() as I have just been trying to get it to work before refining it.
I then run my model here:
Main.py
import DataWorker
import Constants
from Model import LSTM
inputDim = [Constants.sequenceLength, DataWorker.numFeatures]
outputDim = [1]
lstm = LSTM(inputDimensionList=inputDim, outputDimensionList=outputDim)
# #############################################
# TRAINING
# #############################################
for epoch in range(Constants.numEpochs):
print("***** EPOCH:", epoch + 1, "*****\n")
IDPointer, TSPointer = 0, 0 # Pointers to current ID and timestamp
epochComplete = False
batchNum = 0
while not epochComplete:
batchNum += 1
batchX, batchY, IDPointer, TSPointer, epochComplete = DataWorker.generateBatch(IDPointer, TSPointer, isTraining=True)
lstm.setBatchDict(batchX, batchY)
batchPredictions, batchLabels, batchLoss, batchAccuracy = lstm.runBatch()
if batchNum % 1000 == 0 or epochComplete:
print("Iteration:", batchNum)
print("Pred:", batchPredictions[-1], "\tLabel:", batchLabels[-1])
print("Loss:", batchLoss)
print("Accuracy:", str("%.2f" % (batchAccuracy * 100) + "%\n"))
# #############################################
# TESTING
# #############################################
testX, testY, _, _, _ = DataWorker.generateBatch(0, 0, isTraining=False)
lstm.setBatchDict(testX, testY)
_, _, _, testAccuracy = lstm.runBatch()
print("Testing Accuracy:", str("%.2f" % (testAccuracy * 100) + "%"))
lstm.kill()
A single passthrough of the graph is executed fine, when all the variables are initialized, but it is on the second iteration where I get the error
ValueError: Variable rnn/basic_lstm_cell/kernel/Adam/ already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:
optimiser = tf.train.AdamOptimizer(self.learningRate).minimize(mse)
I Googled this problem and learned that using scope.reuse_variables() should stop it trying to initialize the AdamOptimizer a second time, but cleary this isn't working how I have implemented it. How can I fix this issue?
As a side note, is my method of creating the TensorFlow session as an instance variable within my LSTM class acceptable, or should I create the session in Main and then pass it into the LSTM instance?
In general I wrap anything that creates variables under the hood with tf.make_template when doing object oriented model building.
However, you should avoid adding ops to the graph in a training loop, which looks like it's happening here. They will build up and cause problems, and likely give you incorrect results. Instead, define the graph (with inputs from tf.data, placeholders, or queues) and only loop over a session.run call. Even better, structure your code as an Estimator and this will be enforced.
This is part of my current python code for NN training in python using CNTK module
batch_axis = C.Axis.default_batch_axis()
input_seq_axis = C.Axis.default_dynamic_axis()
input_dynamic_axes = [batch_axis, input_seq_axis]
input_dynamic_axes2 = [batch_axis, input_seq_axis]
input = C.input_variable(n_ins, dynamic_axes=input_dynamic_axes, dtype=numpy.float32)
output = C.input_variable(n_outs, dynamic_axes=input_dynamic_axes2, dtype=numpy.float32)
dnn_model = cntk_model.create_model(input, hidden_layer_type, hidden_layer_size, n_outs)
loss = C.squared_error(dnn_model, output)
error = C.squared_error(dnn_model, output)
lr_schedule = C.learning_rate_schedule(current_finetune_lr, C.UnitType.minibatch)
momentum_schedule = C.momentum_schedule(current_momentum)
learner = C.adam(dnn_model.parameters, lr_schedule, momentum_schedule, unit_gain = False, l1_regularization_weight=l1_reg, l2_regularization_weight= l2_reg)
trainer = C.Trainer(dnn_model, (loss, error), [learner])
And here is code for creating NN model
def create_model(features, hidden_layer_type, hidden_layer_size, n_out):
logger.debug('Creating cntk model')
assert len(hidden_layer_size) == len(hidden_layer_type)
n_layers = len(hidden_layer_size)
my_layers = list()
for i in xrange(n_layers):
if(hidden_layer_type[i] == 'TANH'):
my_layers.append(C.layers.Dense(hidden_layer_size[i], activation=C.tanh, init=C.layers.glorot_uniform()))
elif (hidden_layer_type[i] == 'LSTM'):
my_layers.append(C.layers.Recurrence(C.layers.LSTM(hidden_layer_size[i])))
else:
raise Exception('Unknown hidden layer type')
my_layers.append(C.layers.Dense(n_out, activation=None))
my_model = C.layers.Sequential([my_layers])
my_model = my_model(features)
return my_model
Now, I would like to change a backpropagation, so when the error is calculated not direct network output is used, but the output after some additional calculation. I tried to define something like this
def create_error_function(self, prediction, target):
prediction_denorm = C.element_times(prediction, self.std_vector)
prediction_denorm = C.plus(prediction_denorm, self.mean_vector)
prediction_denorm_rounded = C.round(C.element_times(prediction_denorm[0:5], C.round(prediction_denorm[5])))
prediction_denorm_rounded = C.element_divide(prediction_denorm_rounded, C.round(prediction_denorm[5]))
prediction_norm = C.minus(prediction_denorm_rounded, self.mean_vector[0:5])
prediction_norm = C.element_divide(prediction_norm, self.std_vector[0:5])
first = C.squared_error(prediction_norm, target[0:5])
second = C.minus(C.round(prediction_denorm[5]), self.mean_vector[5])
second = C.element_divide(second, self.std_vector[5])
return C.plus(first, C.squared_error(second, target[5]))
and use it instead standard squared_error.
And the part for NN training
dnn_model = cntk_model.create_model(input, hidden_layer_type, hidden_layer_size, n_outs)
error_function = cntk_model.ErrorFunction(cmp_mean_vector, cmp_std_vector)
loss = error_function.create_error_function(dnn_model, output)
error = error_function.create_error_function(dnn_model, output)
lr_schedule = C.learning_rate_schedule(current_finetune_lr, C.UnitType.minibatch)
momentum_schedule = C.momentum_schedule(current_momentum)
learner = C.adam(dnn_model.parameters, lr_schedule, momentum_schedule, unit_gain = False, l1_regularization_weight=l1_reg,
l2_regularization_weight= l2_reg)
trainer = C.Trainer(dnn_model, (loss, error), [learner])
trainer.train_minibatch({input: temp_train_x, output: temp_train_y})
But after two epochs I start gettting always the same average loss, as my network is not learning
Every time you want to change how backprop works, you need to use stop_gradient. This is the only function whose gradient is different from the gradient of the operation of the forward pass. In the forward pass stop_gradient acts as identity. In the backward pass it blocks the gradient from propagating.
To do an operation f(x) on some x in the forward pass and pretend as if it never happened in the backward pass you need to do something like:
C.stop_gradient(f(x) - x) + x. In your case that would be
norm_features = C.stop_gradient(features/normalization - features) + features