Optimize RAM usage when training a GRU - Python

The code below is the nested training loop for a GRU in Python 2.7, but it consumes a lot of RAM. feats_tensor and dec_padded_text are very large objects, and loading both at the same time gives me an out-of-memory error. Any idea how to optimize this code for RAM usage?
for epoch in xrange(0, 13):
    print("Starting New Epoch: %d" % epoch)
    np.random.shuffle(order)
    del feats_tensor, dec_text_tensor
    if cuda:
        torch.cuda.empty_cache()
    feats_tensor = torch.tensor(feats[order], requires_grad=False)
    dec_text_tensor = torch.tensor(dec_padded_text[order], requires_grad=False)
    if cuda:
        feats_tensor = feats_tensor.cuda(device=device)
        dec_text_tensor = dec_text_tensor.cuda(device=device)
    for i in xrange(num_batches):
        s = i * BATCH_SIZE
        e = (i + 1) * BATCH_SIZE
        enc.zero_grad()
        dec.zero_grad()
        hid_enc = enc.forward(feats_tensor[s:e]).unsqueeze(0)
        out_dec, hid_dec = dec.forward(dec_text_tensor[s:e, :-1], hid_enc)
        out_perm = out_dec.permute(0, 2, 1)
        loss = lossfunc(out_perm, dec_text_tensor[s:e, 1:])
        if sm_loss is None:
            sm_loss = loss.data
        else:
            sm_loss = sm_loss*0.95 + 0.05*loss.data
        loss.backward()
        enc_optim.step()
        dec_optim.step()
        if i % 100 == 0:
            print("Epoch: %.3f" % (i/float(num_batches) + epoch,), "Loss:", sm_loss)
            # print("GEN:", untokenize(torch.argmax(out_dec, dim=2)[0, :], dec_idx_to_word))
            # print("GT:", untokenize(dec_text_tensor[s, :], dec_idx_to_word))
            print("--------------")
    save_state(enc, dec, enc_optim, dec_optim, dec_idx_to_word, dec_word_to_idx, epoch)
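
One way to cut the peak usage, sketched below on the assumption that feats and dec_padded_text are NumPy arrays held on the CPU: instead of materialising full shuffled copies of both arrays (plus their GPU duplicates), index one mini-batch at a time through the shuffled order and move only that slice to the GPU.

# Sketch, not the original code: per-batch slicing through the shuffled order.
for epoch in xrange(0, 13):
    np.random.shuffle(order)
    for i in xrange(num_batches):
        idx = order[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]
        feats_batch = torch.tensor(feats[idx])             # small per-batch copy
        dec_batch = torch.tensor(dec_padded_text[idx])
        if cuda:
            feats_batch = feats_batch.cuda(device=device)  # only one batch on the GPU
            dec_batch = dec_batch.cuda(device=device)
        # ... forward/backward exactly as before, using feats_batch and dec_batch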

Related

How to resolve this problem in Google Colab?

def cw_l2_attack(model, images, labels, targeted=False, c=1e-4, kappa=0, max_iter=1000, learning_rate=0.01):
    images = images.to(device)
    labels = labels.to(device)

    # Define f-function
    def f(x):
        outputs = model(x)
        one_hot_labels = torch.eye(len(outputs[0]))[labels].to(device)
        i, _ = torch.max((1-one_hot_labels)*outputs, dim=1)
        j = torch.masked_select(outputs, one_hot_labels.byte())
        # If targeted, optimize for making the other class most likely
        if targeted:
            return torch.clamp(i-j, min=-kappa)
        # If untargeted, optimize for making the other class most likely
        else:
            return torch.clamp(j-i, min=-kappa)

    w = torch.zeros_like(images, requires_grad=True).to(device)
    optimizer = optim.Adam([w], lr=learning_rate)
    prev = 1e10
    for step in range(max_iter):
        a = 1/2*(nn.Tanh()(w) + 1)
        loss1 = nn.MSELoss(reduction='sum')(a, images)
        loss2 = torch.sum(c*f(a))
        cost = loss1 + loss2
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()
        # Early stop when loss does not converge.
        if step % (max_iter//10) == 0:
            if cost > prev:
                print('Attack Stopped due to CONVERGENCE....')
                return a
            prev = cost
        print('- Learning Progress : %2.2f %%' % ((step+1)/max_iter*100), end='\r')
    attack_images = 1/2*(nn.Tanh()(w) + 1)
    return attack_images

print("Attack Image & Predicted Label")
model.eval()
correct = 0
total = 0
for images, labels in normal_loader:
    images = cw_l2_attack(model, images, labels, targeted=False, c=0.1)
    labels = labels.to(device)
    outputs = model(images)
    _, pre = torch.max(outputs.data, 1)  # torch.max returns (values, indices)
    total += 1
    correct += (pre == labels).sum()
    imshow(torchvision.utils.make_grid(images.cpu().data, normalize=True), [normal_data.classes[i] for i in pre])
print('Accuracy of test text: %f %%' % (100 * float(correct) / total))
Running this raises: RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)
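
A minimal sketch of one likely fix, assuming the mismatch comes from indexing the CPU tensor produced by torch.eye with CUDA labels: build the identity matrix directly on the same device as the model outputs before indexing.

# Hypothetical rewrite of f(): create the one-hot basis on the outputs'
# device, so the index tensor and the indexed tensor match.
def f(x):
    outputs = model(x)
    one_hot_labels = torch.eye(outputs.shape[1], device=outputs.device)[labels]
    i, _ = torch.max((1 - one_hot_labels) * outputs, dim=1)
    j = torch.masked_select(outputs, one_hot_labels.bool())  # .bool() replaces the deprecated .byte()
    if targeted:
        return torch.clamp(i - j, min=-kappa)
    return torch.clamp(j - i, min=-kappa)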

[Theano] TypeError: cost must be a scalar

I am working on a research project that requires me to write a regularizer for a DNN.
import lasagne
from lasagne.nonlinearities import leaky_rectify, softmax
import theano, theano.tensor as T
import numpy as np
import sklearn.datasets, sklearn.preprocessing, sklearn.model_selection
import matplotlib.pyplot as plt
from tabulate import tabulate
import time
import math

# psi function that will be used in the penalty function
def psi(g, l):
    m = g.shape[1]
    C = (1/T.pow(2, m))*(1/T.pow(math.pi, ((m-1)/2))) / (T.gamma((m+1)/2))
    logDens = T.log(C) + m*T.log(l) - l*T.sqrt(T.sum(g**2))
    dens = T.exp(logDens)
    return(dens)

# pStar function that will be used in the penalty function
def pStar(g, lambda1, lambda0, theta):
    psi1 = psi(g, lambda1)
    psi0 = psi(g, lambda0)
    ## if a coefficient is really large then both these will numerically be zero
    if theta*psi1 == 0 and (1-theta)*psi0 == 0:
        p = 1
    else:
        p = (theta*psi1) / (theta*psi1 + (1 - theta)*psi0)
    return p

# Separable
def pen_S(l):
    theta = 0.5
    lambda1 = 1
    lambda0 = 12
    for j in range(len(l)):
        t = l[j]
        m = t.shape[1]
        n = t.shape[0].eval()
        cost = T.zeros((1, 1))
        for i in range(n):
            g = t[i]
            temp = -lambda1*T.sum(g**2) + T.log(pStar(T.zeros((1, m)), lambda1, lambda0, theta)/pStar(g, lambda1, lambda0, theta))
            cost = cost + temp
    return cost

# Number of simulations
N_runs = 1
# Maximum number of epochs
max_epochs = 1500
# Define number of layers and number of neurons
H_layers = np.asarray([40, 20])
# Minibatch size
batch_size = 300
# Lasagne Regularizers to be tested
regularizers = [pen_S]
# Define the regularization factors for each algorithm
reg_factors = [10**-3.5]
# Define the names (for display purposes)
names = ['SSGL_Sep']
# Load the dataset (DIGITS)
digits = sklearn.datasets.load_digits()
X = digits.data
y = digits.target
# MNIST
# mnist = sklearn.datasets.fetch_mldata('MNIST original', data_home='C:/Users/ISPAMM/Downloads')
# X = mnist.data
# y = mnist.target
# Preprocessing (input)
scaler = sklearn.preprocessing.MinMaxScaler()
X = scaler.fit_transform(X)
# Output structures
tr_errors = np.zeros((len(regularizers), N_runs))
tst_errors = np.zeros((len(regularizers), N_runs))
tr_times = np.zeros((len(regularizers), N_runs))
tr_obj = np.zeros((len(regularizers), N_runs, max_epochs))
sparsity_weights = np.zeros((len(regularizers), N_runs, len(H_layers)+1))
sparsity_neurons = np.zeros((len(regularizers), N_runs, len(H_layers)+1))
# Define the input and output symbolic variables
input_var = T.matrix(name='X')
target_var = T.ivector(name='y')

# Utility function for minibatches
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]

for k in np.arange(0, N_runs):
    print("Run ", k+1, " of ", N_runs, "...\n", end="")
    # Split the data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.25)
    # Define the network structure
    network = lasagne.layers.InputLayer((None, X.shape[1]), input_var)
    for h in H_layers:
        network = lasagne.layers.DenseLayer(network, h, nonlinearity=leaky_rectify, W=lasagne.init.GlorotNormal())
    network = lasagne.layers.DenseLayer(network, len(np.unique(y)), nonlinearity=softmax, W=lasagne.init.GlorotNormal())
    params_original = lasagne.layers.get_all_param_values(network)
    params = lasagne.layers.get_all_params(network, trainable=True)
    # Define the loss function
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    # Define the test function
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)
    test_fn = theano.function([input_var, target_var], test_acc, allow_input_downcast=True)
    for r in np.arange(0, len(regularizers)):
        # Set to original parameters
        lasagne.layers.set_all_param_values(network, params_original)
        # Define the regularized loss function
        loss_reg = loss.mean() + reg_factors[r] * lasagne.regularization.regularize_network_params(network, regularizers[r])
        # Update function
        # updates_reg = lasagne.updates.nesterov_momentum(loss_reg, params, learning_rate=0.01)
        updates_reg = lasagne.updates.adam(loss_reg, params)
        # Training function
        train_fn = theano.function([input_var, target_var], loss_reg, updates=updates_reg, allow_input_downcast=True)
        # Train network
        print("\tTraining with ", names[r], " regularization, epoch: ", end="")
        start = time.time()
        for epoch in range(max_epochs):
            loss_epoch = 0
            batches = 0
            if np.mod(epoch, 10) == 0:
                print(epoch, "... ", end="")
            for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
                input_batch, target_batch = batch
                loss_epoch += train_fn(input_batch, target_batch)
                batches += 1
            tr_obj[r,k,epoch] = loss_epoch/batches
        end = time.time()
        tr_times[r,k] = end - start
        print(epoch, ".")
        # Final test with accuracy
        print("\tTesting the network with ", names[r], " regularization...")
        tr_errors[r,k] = test_fn(X_train, y_train)
        tst_errors[r,k] = test_fn(X_test, y_test)
        # Check sparsity
        params_trained = lasagne.layers.get_all_param_values(network, trainable=True)
        sparsity_weights[r,k,:] = [1-(x.round(decimals=3).ravel().nonzero()[0].shape[0]/x.size) for x in params_trained[0::2]]
        sparsity_neurons[r,k,:] = [x.round(decimals=3).sum(axis=1).nonzero()[0].shape[0] for x in params_trained[0::2]]

tr_obj_mean = np.mean(tr_obj, axis=1)
# Plot the average loss
plt.figure()
plt.title('Training objective')
for r in np.arange(0, len(regularizers)):
    plt.semilogy(tr_obj_mean[r, :], label=names[r])
plt.legend()
# Print the results
print(tabulate([['Tr. accuracy [%]'] + np.mean(tr_errors, axis=1).round(decimals=4).tolist(),
                ['Test. accuracy [%]'] + np.mean(tst_errors, axis=1).round(decimals=4).tolist(),
                ['Tr. times [secs.]'] + np.mean(tr_times, axis=1).round(decimals=4).tolist(),
                ['Sparsity [%]'] + np.mean(sparsity_weights, axis=1).round(decimals=4).tolist(),
                ['Neurons'] + np.mean(sparsity_neurons, axis=1).round(decimals=4).tolist()],
               headers=['']+names))
Here is my defined regularizer pen_S(l), but when I run the code to train the network, I get 'TypeError: cost must be a scalar.' I think the output of pen_S is already a scalar.
Can anyone help me with this?
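
A minimal sketch of one possible fix, assuming the error comes from the accumulator: cost starts as T.zeros((1, 1)), so pen_S returns a (1, 1) tensor rather than the 0-d scalar Theano requires. Reducing it before returning keeps the rest of the function unchanged.

# Hypothetical minimal change: collapse the (1, 1) accumulator to a scalar.
def pen_S(l):
    theta = 0.5
    lambda1 = 1
    lambda0 = 12
    for j in range(len(l)):
        t = l[j]
        m = t.shape[1]
        n = t.shape[0].eval()
        cost = T.zeros((1, 1))
        for i in range(n):
            g = t[i]
            cost = cost + (-lambda1*T.sum(g**2)
                           + T.log(pStar(T.zeros((1, m)), lambda1, lambda0, theta)
                                   / pStar(g, lambda1, lambda0, theta)))
    return T.sum(cost)  # 0-d scalar, as 'cost must be a scalar' demands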

Memory keeps accumulating while training with PyTorch

I am training a deep learning model using PyTorch. For unknown reasons, memory keeps accumulating, which gets the session killed before 30 epochs and leads to underfitting.
Some thoughts here:
Wondering if it's caused by matplotlib, so I added plt.close('all'); didn't work.
Added gc.collect(); didn't work.
Wondering if it's caused by cv2.imwrite(), but I don't know how to inspect this. Any suggestions?
PyTorch issues?
Others...
model.train()
for epo in range(epoch):
    for i, data in enumerate(trainloader, 0):
        inputs = data
        inputs = Variable(inputs)
        optimizer.zero_grad()
        top = model.upward(inputs + white(inputs))
        outputs = model.downward(top, shortcut=True)
        loss = criterion(inputs, outputs)
        loss.backward()
        optimizer.step()
        # Print generated pictures every 100 iters
        if i % 100 == 0:
            inn = inputs[0].view(128, 128).detach().numpy() * 255
            cv2.imwrite("/home/tk/Documents/recover/" + str(epo) + "_" + str(i) + ".png", inn)
            out = outputs[0].view(128, 128).detach().numpy() * 255
            cv2.imwrite("/home/tk/Documents/recover/" + str(epo) + "_" + str(i) + "_re.png", out)
        # Print loss every 50 iters
        if i % 50 == 0:
            print('[%d, %5d] loss: %.3f' % (epo, i, loss.item()))
        gc.collect()
        plt.close("all")
===================================================================
20181222 Update
Dataset & DataLoader
class MSourceDataSet(Dataset):
    def __init__(self, clean_dir):
        for i in cleanfolder:
            with open(clean_dir + '{}'.format(i)) as f:
                clean_list.append(torch.Tensor(json.load(f)))
        cleanblock = torch.cat(clean_list, 0)
        self.spec = cleanblock

    def __len__(self):
        return self.spec.shape[0]

    def __getitem__(self, index):
        spec = self.spec[index]
        return spec

trainset = MSourceDataSet(clean_dir)
trainloader = torch.utils.data.DataLoader(dataset=trainset,
                                          batch_size=4,
                                          shuffle=True)
The model is really complicated and long... plus the memory accumulation issue didn't happen before (using the same model), so I will not post it here...
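
One way to narrow this down (a diagnostic sketch, not a fix): count the tensors the garbage collector still tracks at the end of each epoch; a steadily growing count means some reference is keeping graphs or activations alive.

import gc
import torch

def count_live_tensors():
    # Count tensors still reachable by the garbage collector. Call this once
    # per epoch; if the number keeps growing, something is holding references.
    n = 0
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj):
                n += 1
        except Exception:
            pass
    return n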

Speed up network training using the Dataset API in TensorFlow

I have a small dataset that fits nicely in GPU RAM.
My goal is to better utilize my GPU (currently at around 70% utilization) and thus decrease training time, using the new Dataset API in TensorFlow v1.4.
I would like to increase the GPU's utilization without adding more layers or increasing the batch size. How is this possible with the Dataset API?
Below is a simplified example of my current implementation:
import numpy as np
from time import time
import tensorflow as tf

"""
Simple regression example with Dataset API.
The training and val sets are small enough to fit in GPU ram.
"""

TRAIN_SET_SIZE = 130000
VAL_SET_SIZE = 30000
TRAIN_BATCH_SIZE = 100
VAL_BATCH_SIZE = 1000
TRAIN_PREFETCH = 200
VAL_PREFETCH = 1
INPUT_FEATURES = 120
LAYERS = [500, 500, 500, 500, 1]  # last layer size should be 1

def fc_layer(in_tensor, in_dim, out_dim, name, act_fun=tf.nn.relu):
    with tf.variable_scope(name):
        sd = 1.0 / np.sqrt(in_dim)
        W_fc = tf.Variable(tf.truncated_normal([in_dim, out_dim], stddev=sd), name='weights')
        b_fc = tf.Variable(tf.truncated_normal([out_dim], stddev=sd), name='bias')
        z_fc = tf.matmul(in_tensor, W_fc) + b_fc
        if act_fun is None:
            return z_fc
        else:
            return act_fun(z_fc)

# Create dummy data
train_set_x = np.random.uniform(low=-1, high=1, size=(TRAIN_SET_SIZE, INPUT_FEATURES)).astype(np.float32)
train_set_y = np.random.uniform(low=-1, high=2, size=(TRAIN_SET_SIZE)).astype(np.float32)
val_set_x = np.random.uniform(low=-1, high=1, size=(VAL_SET_SIZE, INPUT_FEATURES)).astype(np.float32)
val_set_y = np.random.uniform(low=-1, high=2, size=(VAL_SET_SIZE)).astype(np.float32)

# Reset graph
tf.reset_default_graph()

with tf.device('/gpu:0'):
    # Dummy train data
    train_set = tf.data.Dataset.from_tensor_slices((train_set_x, train_set_y))
    # TODO First batch and then prefetch or inverse the order?
    # TODO TRAIN_PREFETCH value?
    train_set = train_set.shuffle(buffer_size=1000).batch(TRAIN_BATCH_SIZE).prefetch(TRAIN_PREFETCH)
    # Dummy val data
    val_set = tf.data.Dataset.from_tensor_slices((val_set_x, val_set_y))
    # TODO VAL_PREFETCH value?
    val_set = val_set.batch(VAL_BATCH_SIZE).prefetch(VAL_PREFETCH)
    # Iterator
    iterator = tf.data.Iterator.from_structure(train_set.output_types, train_set.output_shapes)
    train_init_op = iterator.make_initializer(train_set)
    val_init_op = iterator.make_initializer(val_set)
    x, truth = iterator.get_next()
    # Build graph
    activations = []
    activations.append(fc_layer(x, INPUT_FEATURES, LAYERS[0], name='fc0'))
    for layer_ix in range(1, len(LAYERS) - 1):
        activations.append(fc_layer(activations[-1], LAYERS[layer_ix - 1], LAYERS[layer_ix],
                                    name='fc' + str(layer_ix)))
    activations.append(fc_layer(activations[-1], LAYERS[-2], LAYERS[-1], act_fun=None,
                                name='fc' + str(len(LAYERS) - 1)))
    prediction = activations[-1]
    loss = tf.reduce_mean(tf.square(truth - prediction))
    global_step = tf.Variable(0, name='global_step', trainable=False)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    train_step = optimizer.minimize(loss, global_step=global_step, name='train_step')

sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True),
                                        log_device_placement=True,
                                        allow_soft_placement=True))
sess.run(tf.global_variables_initializer())

for e in range(1, 6):  # epochs
    epoch_start_time = time()
    # Train set
    sess.run(train_init_op)
    print('\nTrain init op time: %.4f' % (time() - epoch_start_time))
    while True:
        try:
            batch_start_time = time()
            batch_loss, step, _ = sess.run([loss, global_step, train_step])
            # if step % 1000 == 0:
            #     print('Step: %5d Loss: %.2f, Batch Time : %.5f sec' % (step, batch_loss, time() - batch_start_time))
        except tf.errors.OutOfRangeError:
            break
    # print('Epoch time (without computing val set loss): %.2f' % (time() - epoch_start_time))
    # Val set
    sess.run(val_init_op)
    pred_err = np.ndarray([VAL_SET_SIZE])
    ix = 0
    while True:
        try:
            p, t = sess.run([prediction, truth])
            pred_err[ix:ix + VAL_BATCH_SIZE] = p.reshape([-1]) - t
            ix += VAL_BATCH_SIZE
        except tf.errors.OutOfRangeError:
            val_loss = np.mean(pred_err ** 2)
            print('Epoch: %2d, Loss: %.2f, Epoch time: %.2f sec' % (e, val_loss, time() - epoch_start_time))
            break
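
One common adjustment worth trying (a sketch under the assumption, stated in the question, that the whole set fits in memory): cache the dataset and keep prefetch small, so the input pipeline stays one step ahead of the compute without spending time filling a huge prefetch buffer.

# Hypothetical variant of the train pipeline: cache the small dataset and
# prefetch a single batch ahead so input preparation overlaps the train step.
train_set = (tf.data.Dataset.from_tensor_slices((train_set_x, train_set_y))
             .cache()                          # dataset is small, keep it in memory
             .shuffle(buffer_size=TRAIN_SET_SIZE)
             .batch(TRAIN_BATCH_SIZE)
             .prefetch(1))                     # one batch ahead is usually enough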

LSTM setting/resetting the state when using a variable batch size

I have built this LSTM class:
import tensorflow as tf
import Constants

class LSTM():
    def __init__(self,
                 inputShape,
                 outputShape,
                 numLayers=Constants.numLayers,
                 numHidden=Constants.numHidden,
                 learningRate=Constants.learningRate,
                 forgetBias=Constants.forgetBias):
        self.inputs = tf.placeholder(tf.float32, [None] + inputShape)
        self.labels = tf.placeholder(tf.float32, [None] + outputShape)
        self.inputTensors = tf.unstack(self.inputs, axis=1)
        self.weights = tf.Variable(tf.random_normal([numHidden] + outputShape))
        self.bias = tf.Variable(tf.random_normal(outputShape))
        layers = [tf.contrib.rnn.LSTMCell(numHidden, forget_bias=forgetBias, state_is_tuple=True)] * numLayers
        self.cell = tf.contrib.rnn.MultiRNNCell(layers, state_is_tuple=True)
        self.optimiser = tf.train.GradientDescentOptimizer(learningRate)
        self.forgetBias = forgetBias
        self.batchDict = None
        self.outputs = None
        self.finalStates = None
        self.predictions = None
        self.loss = None
        self.accuracy = None
        self.optimise = None
        self.session = tf.Session()
        self.__buildGraph()

    def __buildGraph(self):
        outputs, finalStates = tf.nn.static_rnn(self.cell, self.inputTensors, dtype=tf.float32)
        predictions = tf.add(tf.matmul(outputs[-1], self.weights), self.bias)
        self.predictions = tf.minimum(tf.maximum(predictions, 0), 1)
        self.loss = tf.losses.mean_squared_error(predictions=self.predictions, labels=self.labels)
        self.accuracy = tf.reduce_mean(1 - tf.abs(self.labels - self.predictions) / 1.0)
        self.optimise = self.optimiser.minimize(self.loss)
        self.session.run(tf.global_variables_initializer())

    def __execute(self, operation):
        return self.session.run(operation, self.batchDict)

    def setBatch(self, inputs, labels):
        self.batchDict = {self.inputs: inputs, self.labels: labels}

    def batchLabels(self):
        return self.__execute(self.labels)

    def batchPredictions(self):
        return self.__execute(self.predictions)

    def batchLoss(self):
        return self.__execute(self.loss)

    def batchAccuracy(self):
        return self.__execute(self.accuracy)

    def processBatch(self):
        self.__execute(self.optimise)

    def kill(self):
        self.session.close()
and I run it like so:
import DataWorker
import Constants
from Model import LSTM

inputShape = [Constants.sequenceLength, DataWorker.numFeatures]
outputShape = [1]
LSTM = LSTM(inputShape, outputShape)

# #############################################
# TRAINING
# #############################################
for epoch in range(Constants.numEpochs):
    print("***** EPOCH:", epoch + 1, "*****\n")
    IDPointer, TSPointer = 0, 0
    epochComplete = False
    batchNum = 0
    while not epochComplete:
        batchNum += 1
        batchX, batchY, IDPointer, TSPointer, epochComplete = DataWorker.generateBatch(IDPointer, TSPointer)
        LSTM.setBatch(batchX, batchY)
        LSTM.processBatch()
        if batchNum % Constants.printStep == 0 or epochComplete:
            print("Batch:\t\t", batchNum)
            print("Last Pred:\t", LSTM.batchPredictions()[-1][0])
            print("Last Label:\t", LSTM.batchLabels()[-1][0])
            print("Loss:\t\t", LSTM.batchLoss())
            print("Accuracy:\t", str("%.2f" % (LSTM.batchAccuracy() * 100) + "%\n"))

# #############################################
# TESTING
# #############################################
testX, testY = DataWorker.generateTestBatch()
LSTM.setBatch(testX, testY)  # the class defines setBatch, not setBatchDict
testAccuracy = LSTM.batchAccuracy()
print("Testing Accuracy:", str("%.2f" % (testAccuracy * 100) + "%"))
LSTM.kill()
This all works as it should. However, I am using time-series data consisting of financial stocks spanning ranges of timestamps far greater than the number of time steps my LSTM is unrolled for (Constants.sequenceLength). Because of this, it takes many sequential batches for a single stock to be processed, and so the state/memory of my LSTM needs to be passed between batches. As well as this, after a batch that completes the lifespan of an ID, the next batch passes in a new ID starting from the initial timestamp of my dataset, and so I would want to reset the memory.
There are many questions asking something similar, and all of the answers are adequate; however, none seem to address the issue of variable batch sizes, i.e. batch sizes initialised to None and then inferred when a batch is passed in. My batches are usually a constant size but do change under certain circumstances, and I cannot change this. How can I control passing the state between batches, as well as resetting the state, if I have not specified the batch size?
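
One common pattern (a sketch with hypothetical names, not the class above): expose the LSTM state as placeholders whose batch dimension is None, fetch the final state as NumPy arrays after each run, feed it back in for the next batch, and feed zeros shaped to the current batch to reset it.

import numpy as np
import tensorflow as tf

numLayers, numHidden = 2, 128  # assumed sizes, stand-ins for Constants

# One (c, h) placeholder pair per layer; the batch size stays unspecified.
initialState = tuple(
    tf.contrib.rnn.LSTMStateTuple(
        tf.placeholder(tf.float32, [None, numHidden]),
        tf.placeholder(tf.float32, [None, numHidden]))
    for _ in range(numLayers))

# Build the unrolled RNN from that state (inputTensors as in the class):
# outputs, finalStates = tf.nn.static_rnn(cell, inputTensors,
#                                         initial_state=initialState)

def zero_state(batch_size):
    # A fresh all-zeros state at whatever size the next batch has; feed
    # this when a new stock ID starts, to reset the memory.
    return tuple((np.zeros((batch_size, numHidden), np.float32),
                  np.zeros((batch_size, numHidden), np.float32))
                 for _ in range(numLayers))

# Per batch: feed the previous NumPy state into the placeholders, fetch
# finalStates alongside the train op, and carry it over to the next batch.
# feed = {inputs: batchX, labels: batchY}
# for ph, st in zip(initialState, prevState):
#     feed[ph.c], feed[ph.h] = st
# _, prevState = session.run([optimise, finalStates], feed)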
