I made my windows 10 jupyter notebook as a server and running some trains on it.
I've installed CUDA 9.0 and cuDNN properly, and python detects the GPU. This is what I've got on the anaconda prompt.
>>> torch.cuda.get_device_name(0)
'GeForce GTX 1070'
And I also placed my model and tensors on cuda by .cuda()
model = LogPPredictor(1, 58, 64, 128, 1, 'gsc')
if torch.cuda.is_available():
torch.set_default_tensor_type(torch.cuda.DoubleTensor)
model.cuda()
else:
torch.set_default_tensor_type(torch.FloatTensor)
list_train_loss = list()
list_val_loss = list()
acc = 0
mse = 0
optimizer = args.optim(model.parameters(),
lr=args.lr,
weight_decay=args.l2_coef)
data_train = DataLoader(args.dict_partition['train'],
batch_size=args.batch_size,
pin_memory=True,
shuffle=args.shuffle)
data_val = DataLoader(args.dict_partition['val'],
batch_size=args.batch_size,
pin_memory=True,
shuffle=args.shuffle)
for epoch in tqdm_notebook(range(args.epoch), desc='Epoch'):
model.train()
epoch_train_loss = 0
for i, batch in enumerate(data_train):
list_feature = torch.tensor(batch[0]).cuda()
list_adj = torch.tensor(batch[1]).cuda()
list_logP = torch.tensor(batch[2]).cuda()
list_logP = list_logP.view(-1,1)
optimizer.zero_grad()
list_pred_logP = model(list_feature, list_adj)
list_pred_logP.require_grad = False
train_loss = args.criterion(list_pred_logP, list_logP)
epoch_train_loss += train_loss.item()
train_loss.backward()
optimizer.step()
list_train_loss.append(epoch_train_loss/len(data_train))
model.eval()
epoch_val_loss = 0
with torch.no_grad():
for i, batch in enumerate(data_val):
list_feature = torch.tensor(batch[0]).cuda()
list_adj = torch.tensor(batch[1]).cuda()
list_logP = torch.tensor(batch[2]).cuda()
list_logP = list_logP.view(-1,1)
list_pred_logP = model(list_feature, list_adj)
val_loss = args.criterion(list_pred_logP, list_logP)
epoch_val_loss += val_loss.item()
list_val_loss.append(epoch_val_loss/len(data_val))
data_test = DataLoader(args.dict_partition['test'],
batch_size=args.batch_size,
pin_memory=True,
shuffle=args.shuffle)
model.eval()
with torch.no_grad():
logP_total = list()
pred_logP_total = list()
for i, batch in enumerate(data_val):
list_feature = torch.tensor(batch[0]).cuda()
list_adj = torch.tensor(batch[1]).cuda()
list_logP = torch.tensor(batch[2]).cuda()
logP_total += list_logP.tolist()
list_logP = list_logP.view(-1,1)
list_pred_logP = model(list_feature, list_adj)
pred_logP_total += list_pred_logP.tolist()
mse = mean_squared_error(logP_total, pred_logP_total)
But on the Process Manager of Windows, whenever I start training, only CPU usage goes up to 25% and GPU usage remains 0. How can I fix this???
I had a similar problem with using PyTorch on Cuda. After looking for possible solutions, I found the following post by Soumith himself that found it very helpful.
https://discuss.pytorch.org/t/gpu-supposed-to-be-used-but-isnt/2883
The bottom line is, at least in my case, I could not put enough load on GPUs. There was a bottleneck in my application. Try another example, or increase batch size; it should be OK.
Related
I was training a model in Colab, but, I shut down my computer and this training stoped. Every 5 epochs I save the weights. I think it is but I don't know how. How it's possible to continue the training with the weights previously saved?
Thanks.
When training a model in colab, training doesn't stop when you close you computer, it stops some time afterwards.
If you are saving the weights in colab, when colab closes everything is deleted.
If you have mounted your gdrive in colab and you save weights in gdrive, your weights will be there.
If your weights are in your gdrive you can continue training by loading your stored weights to your keras model simple by
model.load_weights('path_to_weights')
Thank you for your answer, #Ioannis Nasios. Yes, my weights are in 'gdrive'. I'm training a GAN network and I trying to figure out how to load these weights and continue the training. I saved the discriminator and generator weights and also gan_loss and discriminator_loss. Well, do I have to compile generator and discriminator networks, load weights and compile gan network with their loss? I think it could be a stupid question. It is my first time training a GAN network.
Here I post the code:
# Combined network
def get_gan_network(discriminator, shape, generator, optimizer, loss):
discriminator.trainable = False
gan_input = Input(shape=shape)
x = generator(gan_input)
gan_output = discriminator(x)
gan = Model(inputs=gan_input, outputs=[x,gan_output])
gan.compile(loss=[loss, "binary_crossentropy"],
loss_weights=[1., 1e-3],
optimizer=optimizer)
return gan
def train(x_train_lr, x_train_hr, x_test_lr, x_test_hr, epochs, batch_size, output_dir, model_save_dir, weights_save_dir):
loss = VGG_LOSS(image_shape)
batch_count = int(x_train_hr.shape[0] / batch_size)
#### SI LAS IMAGENES NO SON CUADRADAS ESTO DEBERIA CAMBIAR
shape_lr = (image_shape[0]//downscale_factor, image_shape[1]//downscale_factor, image_shape[2])
shape_hr = x_train_hr[0].shape
####
generator = Generator(shape_lr, shape_hr).generator()
discriminator = Discriminator(image_shape).discriminator()
optimizer = Utils_model.get_optimizer()
generator.compile(loss=loss.vgg_loss, optimizer=optimizer)
discriminator.compile(loss="binary_crossentropy", optimizer=optimizer)
gan = get_gan_network(discriminator, shape_lr, generator, optimizer, loss.vgg_loss)
loss_file = open(model_save_dir + '/losses.txt' , 'w+')
loss_file.close()
for e in range(1, epochs+1):
print ('-'*15, 'Epoch %d' % e, '-'*15)
for _ in tqdm(range(batch_count)):
rand_nums = np.random.randint(0, x_train_hr.shape[0], size=batch_size)
image_batch_hr = x_train_hr[rand_nums]
image_batch_lr = x_train_lr[rand_nums]
generated_images_sr = generator.predict(image_batch_lr)
real_data_Y = np.ones(batch_size) - np.random.random_sample(batch_size)*0.2
fake_data_Y = np.random.random_sample(batch_size)*0.2
discriminator.trainable = True
d_loss_real = discriminator.train_on_batch(image_batch_hr, real_data_Y)
d_loss_fake = discriminator.train_on_batch(generated_images_sr, fake_data_Y)
discriminator_loss = 0.5 * np.add(d_loss_fake, d_loss_real)
rand_nums = np.random.randint(0, x_train_hr.shape[0], size=batch_size)
image_batch_hr = x_train_hr[rand_nums]
image_batch_lr = x_train_lr[rand_nums]
gan_Y = np.ones(batch_size) - np.random.random_sample(batch_size)*0.2
discriminator.trainable = False
gan_loss = gan.train_on_batch(image_batch_lr, [image_batch_hr,gan_Y])
print("discriminator_loss : %f" % discriminator_loss)
print("gan_loss :", gan_loss)
gan_loss = str(gan_loss)
loss_file = open(model_save_dir + 'losses.txt' , 'a')
loss_file.write('epoch%d : gan_loss = %s ; discriminator_loss = %f\n' %(e, gan_loss, discriminator_loss) )
loss_file.close()
if e == 1 or e % 5 == 0:
Utils.plot_generated_images(output_dir, e, generator, x_test_hr, x_test_lr)
generator.save_weights(weights_save_dir + '%d_gen_weights.h5' % e)
discriminator.save_weights(weights_save_dir + '%d_dis_weights.h5' % e)
if e % 500 == 0 or e == epochs+1:
generator.save(model_save_dir + 'gen_model%d.h5' % e)
discriminator.save(model_save_dir + 'dis_model%d.h5' % e)
I am currently playing around and learning about distributed tensorflow.
I recently created a cluster with One GPU server(two cards) - One CPU server
I was browsing through various articles and in the TensorFlow distributed guide I saw that distribution happened across cards by explicitly calling them with names.
https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py
but here no cluster are being created.
Can i create a TensorFlow cluster and then specify which card the code should run on?
If yes, does the below look correct?
In one github question who's link i dont have right now but the code below, the card is specified under with tf.device(replica_device_setter) but when i try to do that my code throws an error stating "Cannot assign a device for operation 'dummy_queue_Close_1': Could not satisfy explicit device specification '/job:ps/task:0/device:GPU:0' because no supported kernel for GPU devices is available."
Is this because i am assinging tasks which were supposed to happen on a CPU but instead as i gave with tf.device('/gpu:0/') it throws the error ?
Also I cant share my official code but it looks very similar to the below code which i took for reference.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy
import tensorflow as tf
tf.app.flags.DEFINE_string("ps_hosts", "localhost:2222", "...")
tf.app.flags.DEFINE_string("worker_hosts", "localhost:2223", "...")
tf.app.flags.DEFINE_string("job_name", "", "...")
tf.app.flags.DEFINE_integer("task_index", 0, "...")
tf.app.flags.DEFINE_integer('gpu_cards', 4, 'Number of GPU cards in a machine to use.')
FLAGS = tf.app.flags.FLAGS
def dense_to_one_hot(labels_dense, num_classes = 10) :
"""Convert class labels from scalars to one-hot vectors."""
num_labels = labels_dense.shape[0]
index_offset = numpy.arange(num_labels) * num_classes
labels_one_hot = numpy.zeros((num_labels, num_classes))
labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
return labels_one_hot
def run_training(server, cluster_spec, num_workers) :
is_chief = (FLAGS.task_index == 0)
with tf.Graph().as_default():
with tf.device(tf.train.replica_device_setter(cluster = cluster_spec)) :
with tf.device('/cpu:0') :
global_step = tf.get_variable('global_step', [],
initializer = tf.constant_initializer(0), trainable = False)
with tf.device('/gpu:%d' % (FLAGS.task_index % FLAGS.gpu_cards)) :
# Create the model
x = tf.placeholder("float", [None, 784])
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.nn.softmax(tf.matmul(x, W) + b)
# Define loss and optimizer
y_ = tf.placeholder("float", [None, 10])
cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
opt = tf.train.GradientDescentOptimizer(0.01)
opt = tf.train.SyncReplicasOptimizer(opt, replicas_to_aggregate = num_workers,
replica_id = FLAGS.task_index, total_num_replicas = num_workers)
train_step = opt.minimize(cross_entropy, global_step = global_step)
# Test trained model
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
init_token_op = opt.get_init_tokens_op()
chief_queue_runner = opt.get_chief_queue_runner()
init = tf.initialize_all_variables()
sv = tf.train.Supervisor(is_chief = is_chief,
init_op = init,
global_step = global_step)
# Create a session for running Ops on the Graph.
config = tf.ConfigProto(allow_soft_placement = True)
sess = sv.prepare_or_wait_for_session(server.target, config = config)
if is_chief:
sv.start_queue_runners(sess, [chief_queue_runner])
sess.run(init_token_op)
for i in range(100000):
source_data = numpy.random.normal(loc = 0.0, scale = 1.0, size = (100, 784))
labels_dense = numpy.clip(numpy.sum(source_data, axis = 1) / 5 + 5, 0, 9).astype(int)
labels_one_hot = dense_to_one_hot(labels_dense)
_, cost, acc, step = sess.run([train_step, cross_entropy, accuracy, global_step], feed_dict = { x: source_data, y_ : labels_one_hot })
print("[%d]: cost=%.2f, accuracy=%.2f" % (step, cost, acc))
def main(_) :
ps_hosts = FLAGS.ps_hosts.split(",")
worker_hosts = FLAGS.worker_hosts.split(",")
num_workers = len(worker_hosts)
print("gup_cards=%d; num_worders=%d" % (FLAGS.gpu_cards, num_workers))
cluster_spec = tf.train.ClusterSpec({ "ps":ps_hosts, "worker" : worker_hosts })
server = tf.train.Server(cluster_spec, job_name = FLAGS.job_name, task_index = FLAGS.task_index)
if FLAGS.job_name == "ps":
server.join()
elif FLAGS.job_name == "worker" :
run_training(server, cluster_spec, num_workers)
if __name__ == '__main__' :
tf.app.run()
I found a way to do this it sounds very simple, and it is very simple.
I created a TensorFlow cluster in the same way and passed the n_workers parameter to the cluster, and I called different instances of the code with an extra parameter for CUDA_VISIBLE_DEVICES.
CUDA_VISIBLE_DEVICES is an environment variable which can be used to restrict the vision of TensorFlow or any DL framework to a limited number of cards.
CUDA_VISIBLE_DEVICES value can range from -1 to n (where n is the number of GPUs).
-1 indicates no cards to use
n indicates nth card to use
I hope someone who is looking for a similar answer can find this useful.
We are using tensorflow library for face recognition. Our code works fine for single images. But when we run it as an API, the prediction time increases for every subsequent request. This happens because it searches for previously predicted images as well which should ideally not happen. Please find below the code I am using.
def train:
with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as sess:
test_set = _get_test_data(input_directory)
images, labels = _load_images_and_labels(test_set, image_size=160, batch_size=batch_size,
num_threads=4, num_epochs=1)
_load_model(model_filepath=model_path)
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
sess.run(init_op)
images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
embedding_layer = tf.get_default_graph().get_tensor_by_name("embeddings:0")
phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess,coord=coord)
emb_array, label_array = _create_embeddings(embedding_layer, images, labels, images_placeholder,
phase_train_placeholder, sess)
classifier_filename = classifier_output_path
class_name, prob = _evaluate_classifier(emb_array, label_array, classifier_filename)
coord.request_stop()
coord.join(threads)
def _create_embeddings(embedding_layer, images, labels, images_placeholder, phase_train_placeholder, sess):
emb_array = None
label_array = None
try:
i = 0
while True:
print("batch images")
batch_images, batch_labels = sess.run([images, labels])
print('Processing iteration {} batch of size: {}'.format(i, len(batch_labels)))
emb = sess.run(embedding_layer,
feed_dict={images_placeholder: batch_images, phase_train_placeholder: False})
emb_array = np.concatenate([emb_array, emb]) if emb_array is not None else emb
label_array = np.concatenate([label_array, batch_labels]) if label_array is not None else batch_labels
i += 1
except tf.errors.OutOfRangeError:
pass
return emb_array, label_array
It searches for previously predicted images at
`batch_images, batch_labels = sess.run([images, labels])`
in the create embedding function. I think this is the problem of some unclosed threads because of which sess.run runs for all queued threads. Can anyone help me with this
During debugging I found that previously predicted images information was in default graph which is picked during execution of following lines
images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
embedding_layer = tf.get_default_graph().get_tensor_by_name("embeddings:0")
so by resetting graph before start of session will solve the problem of scanning previously predicted images as
tf.reset_default_graph()
I have been using TensorFlow for a reasonable length of time now. and believed I had a thorough understanding of how a TensorFlow graph works and executes within a session. However, I have written all of my TensorFlow models in a script-like fashion as such:
import tensorflow as tf
import DataWorker
import Constants
x = tf.placeholder(tf.float32, [None, Constants.sequenceLength, DataWorker.numFeatures])
y = tf.placeholder(tf.float32, [None, 1])
xTensors = tf.unstack(x, axis=1) # [seqLength tensors of shape (batchSize, numFeatures)]
W = tf.Variable(tf.random_normal([Constants.numHidden, 1])) # Weighted matrix
b = tf.Variable(tf.random_normal([1])) # Bias
cell = tf.contrib.rnn.BasicLSTMCell(Constants.numHidden, forget_bias=Constants.forgetBias)
outputs, finalState = tf.nn.static_rnn(cell, xTensors, dtype=tf.float32)
# predictions = [tf.add(tf.matmul(output, W), b) for output in outputs] # List of predictions after each time step
prediction = tf.add(tf.matmul(outputs[-1], W), b) # Prediction after final time step
prediction = tf.tanh(prediction) # Activation
mse = tf.losses.mean_squared_error(predictions=prediction, labels=y) # Mean loss over entire batch
accuracy = tf.reduce_mean(1 - (tf.abs(y - prediction) / DataWorker.labelRange)) # Accuracy over entire batch
optimiser = tf.train.AdamOptimizer(Constants.learningRate).minimize(mse) # Backpropagation
with tf.Session() as session:
session.run(tf.global_variables_initializer())
# #############################################
# TRAINING
# #############################################
for epoch in range(Constants.numEpochs):
print("***** EPOCH:", epoch + 1, "*****\n")
IDPointer, TSPointer = 0, 0 # Pointers to current ID and timestamp
epochComplete = False
batchNum = 0
while not epochComplete:
batchNum += 1
batchX, batchY, IDPointer, TSPointer, epochComplete = DataWorker.generateBatch(IDPointer, TSPointer, isTraining=True)
dict = {x: batchX, y: batchY}
session.run(optimiser, dict)
if batchNum % 1000 == 0 or epochComplete:
batchLoss = session.run(mse, dict)
batchAccuracy = session.run(accuracy, dict)
print("Iteration:", batchNum)
print(batchLoss)
print(str("%.2f" % (batchAccuracy * 100) + "%\n"))
# #############################################
# TESTING
# #############################################
testX, testY, _, _, _ = DataWorker.generateBatch(0, 0, isTraining=False)
testAccuracy = session.run(accuracy, {x: testX, y: testY})
print("Testing Accuracy:", str("%.2f" % (testAccuracy * 100) + "%"))
But now, for practicality and readability, I want to implement my model as a class, but have encountered many problems with initializing my variables, etc.
This is the closest I have got to implementing the above example using my own LSTM class
Model.py
import tensorflow as tf
import Constants
import DataWorker # Remove this dependency
class LSTM():
"""docstring."""
def __init__(self,
inputDimensionList,
outputDimensionList,
numLayers=Constants.numLayers,
numHidden=Constants.numHidden,
learningRate=Constants.learningRate,
forgetBias=Constants.forgetBias
):
"""docstring."""
self.batchInputs = tf.placeholder(tf.float32, [None] + inputDimensionList)
self.batchLabels = tf.placeholder(tf.float32, [None] + outputDimensionList)
self.weightedMatrix = tf.Variable(tf.random_normal([numHidden] + outputDimensionList))
self.biasMatrix = tf.Variable(tf.random_normal(outputDimensionList))
self.cell = tf.contrib.rnn.BasicLSTMCell(numHidden, forget_bias=forgetBias)
self.numLayers = numLayers
self.numHidden = numHidden
self.learningRate = learningRate
self.forgetBias = forgetBias
self.batchDict = {}
self.batchInputTensors = None
self.batchOutputs = None # All needed as instance variables?
self.batchFinalStates = None
self.batchPredictions = None
self.batchLoss = None
self.batchAccuracy = None
self.initialised = False
self.session = tf.Session()
# Take in activation, loss and optimiser FUNCTIONS as args
def execute(self, command):
"""docstring."""
return self.session.run(command, self.batchDict)
def setBatchDict(self, inputs, labels):
"""docstring."""
self.batchDict = {self.batchInputs: inputs, self.batchLabels: labels}
self.batchInputTensors = tf.unstack(self.batchInputs, axis=1)
def processBatch(self):
"""docstring."""
self.batchOutputs, self.batchFinalState = tf.nn.static_rnn(self.cell, self.batchInputTensors, dtype=tf.float32)
pred = tf.tanh(tf.add(tf.matmul(self.batchOutputs[-1], self.weightedMatrix), self.biasMatrix))
mse = tf.losses.mean_squared_error(predictions=pred, labels=self.batchLabels)
optimiser = tf.train.AdamOptimizer(self.learningRate).minimize(mse)
if not self.initialised:
self.session.run(tf.global_variables_initializer())
self.initialised = True
with tf.variable_scope("model") as scope:
if self.initialised:
scope.reuse_variables()
self.execute(optimiser)
self.batchPredictions = self.execute(pred)
self.batchLoss = self.execute(tf.losses.mean_squared_error(predictions=self.batchPredictions, labels=self.batchLabels))
self.batchAccuracy = self.execute(tf.reduce_mean(1 - (tf.abs(self.batchLabels - self.batchPredictions) / DataWorker.labelRange)))
return self.batchPredictions, self.batchLabels, self.batchLoss, self.batchAccuracy
def kill(self):
"""docstring."""
self.session.close()
This class is quite messy, especially processBatch() as I have just been trying to get it to work before refining it.
I then run my model here:
Main.py
import DataWorker
import Constants
from Model import LSTM
inputDim = [Constants.sequenceLength, DataWorker.numFeatures]
outputDim = [1]
lstm = LSTM(inputDimensionList=inputDim, outputDimensionList=outputDim)
# #############################################
# TRAINING
# #############################################
for epoch in range(Constants.numEpochs):
print("***** EPOCH:", epoch + 1, "*****\n")
IDPointer, TSPointer = 0, 0 # Pointers to current ID and timestamp
epochComplete = False
batchNum = 0
while not epochComplete:
batchNum += 1
batchX, batchY, IDPointer, TSPointer, epochComplete = DataWorker.generateBatch(IDPointer, TSPointer, isTraining=True)
lstm.setBatchDict(batchX, batchY)
batchPredictions, batchLabels, batchLoss, batchAccuracy = lstm.runBatch()
if batchNum % 1000 == 0 or epochComplete:
print("Iteration:", batchNum)
print("Pred:", batchPredictions[-1], "\tLabel:", batchLabels[-1])
print("Loss:", batchLoss)
print("Accuracy:", str("%.2f" % (batchAccuracy * 100) + "%\n"))
# #############################################
# TESTING
# #############################################
testX, testY, _, _, _ = DataWorker.generateBatch(0, 0, isTraining=False)
lstm.setBatchDict(testX, testY)
_, _, _, testAccuracy = lstm.runBatch()
print("Testing Accuracy:", str("%.2f" % (testAccuracy * 100) + "%"))
lstm.kill()
A single passthrough of the graph is executed fine, when all the variables are initialized, but it is on the second iteration where I get the error
ValueError: Variable rnn/basic_lstm_cell/kernel/Adam/ already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:
optimiser = tf.train.AdamOptimizer(self.learningRate).minimize(mse)
I Googled this problem and learned that using scope.reuse_variables() should stop it trying to initialize the AdamOptimizer a second time, but cleary this isn't working how I have implemented it. How can I fix this issue?
As a side note, is my method of creating the TensorFlow session as an instance variable within my LSTM class acceptable, or should I create the session in Main and then pass it into the LSTM instance?
In general I wrap anything that creates variables under the hood with tf.make_template when doing object oriented model building.
However, you should avoid adding ops to the graph in a training loop, which looks like it's happening here. They will build up and cause problems, and likely give you incorrect results. Instead, define the graph (with inputs from tf.data, placeholders, or queues) and only loop over a session.run call. Even better, structure your code as an Estimator and this will be enforced.
show codes:
cluster = tf.train.ClusterSpec({"ps": self.train_args['ps_hosts'], "worker": self.train_args['worker_hosts']})
server = tf.train.Server(cluster, job_name=self.job_name, task_index=self.task_index)
if self.job_name == "ps":
log.info('server is running..............')
server.join()
else:
log.info('worker: {} is running..............'.format(self.task_index))
is_chief = self.task_index == 0
with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:{}".format(self.task_index), cluster=cluster)):
self.cnn = self.init_model_class()
self.global_step = tf.Variable(0, name="global_step", trainable=False)
lr = tf.train.exponential_decay(learning_rate=0.005, global_step=self.global_step,
decay_steps=self.train_args['lr_decay_steps'],
decay_rate=0.96, staircase=True, name='learn_rate')
optimizer = tf.train.AdamOptimizer(lr)
grads_and_vars = optimizer.compute_gradients(self.cnn.loss)
if self.train_args['is_sync']:
rep_op = tf.train.SyncReplicasOptimizer(optimizer, replicas_to_aggregate=len(self.train_args['worker_hosts']),
total_num_replicas=len(self.train_args['worker_hosts']),
use_locking=False)
self.train_op = rep_op.apply_gradients(grads_and_vars, global_step=self.global_step)
if is_chief:
init_token_op = rep_op.get_init_tokens_op()
chief_queue_runner = rep_op.get_chief_queue_runner()
else:
self.train_op = optimizer.apply_gradients(grads_and_vars, global_step=self.global_step)
saver = tf.train.Saver(tf.global_variables(), save_relative_paths=True, max_to_keep=10)
all_summary_op = tf.summary.merge_all()
init_op = tf.global_variables_initializer()
if not os.path.exists(self.checkpoint_dir):
os.makedirs(self.checkpoint_dir)
sv = tf.train.Supervisor(is_chief=(self.task_index == 0),
logdir=self.checkpoint_dir,
init_op=init_op,
summary_op=all_summary_op,
saver=saver,
global_step=self.global_step,
save_model_secs=60)
session_conf = tf.ConfigProto(allow_soft_placement=self.train_args['allow_soft_placement'],
log_device_placement=self.train_args['log_device_placement'],
device_filters=['/job:ps', '/job:worker/task:{}'.format(self.task_index)])
with sv.managed_session(server.target, config=session_conf) as self.sess:
if self.task_index == 0 and self.train_args['is_sync']:
sv.start_queue_runners(self.sess, [chief_queue_runner])
self.sess.run(init_token_op)
batches = data_process.batch_iter_train(self.train_args['batch_size'], self.train_args['num_epochs'])
local_step = 0
for x_batch, y_batch in batches:
local_step += 1
current_step = self.train_step(x_batch, y_batch)
if current_step % self.train_args['evaluate_every'] == 0:
log.info('worker: {} global step: {} local step: {}'.format(self.task_index, current_step, local_step))
x_dev, y_dev = data_process.batch_iter_dev(self.train_args['dev_batch_size'])
self.dev_step(x_dev, y_dev, current_step)
sv.stop()
log.info('****************************finish training***********************************')
I use 4 machines, 3 for worker(same hardware: 24 cpus), 1 for ps
question:
when i use asynchronous distribute, worker 0 and 1 use 50% cpu, but worker 2 just spent 15% cpu, why?
when i use Synchronous distributed, worker 0 blocked after trains 1 step, and other workers blocked too without doing nothing, i try to start worker 1,2 before worker 0, but nothing change, why?
does all checkpoints are stored on worker 0? with my codes, why the checkpoint only store one checkpoint, i set save last 10.
saver = tf.train.Saver(tf.global_variables(), save_relative_paths=True, max_to_keep=10)
how about summary? store on diff machine or just worker 0, i find 3 workers have summary.
anyone can give me some advices, thanks very much! did i use it with wrong way?