I am trying to incriment the per_process_gpu_memory_fraction value in my tf.GPUOptions() and then change the Keras session with set_session() however, the memory fraction never actually changes. After the first run of the while loop, 319MB is reserved as shown in nvidia-smi, which
a) never gets released when clear_session() is called, and
b) doesn't go up on the next iteration of the while loop.
import GPUtil
import time
import tensorflow as tf
import numpy as np
from keras.backend.tensorflow_backend import set_session, clear_session, get_session
from tensorflow.python.framework.errors_impl import ResourceExhaustedError, UnknownError
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
def model_trainer():
y_pred = None
errors = 0
total_ram = GPUtil.getGPUs()[0].memoryTotal
total_ram_allowed = GPUtil.getGPUs()[0].memoryTotal * 0.90
mem_amount = 0.005 # intentionally allocated a small amount so it needs to
# increment the mem_amount
x_train = np.empty((10000, 100))
y_train = np.random.randint(0, 9, size=10000)
y_train = to_categorical(y_train, 10)
while y_pred is None:
print("mem", mem_amount)
if total_ram_allowed > total_ram * mem_amount and GPUtil.getGPUs()[0].memoryFree > total_ram * mem_amount:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_amount)
config = tf.ConfigProto(
intra_op_parallelism_threads=2,
inter_op_parallelism_threads=2,
gpu_options=gpu_options)
sess = tf.Session(config=config)
set_session(sess)
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=100))
model.add(Dense(units=1024, activation='relu'))
model.add(Dense(units=1024, activation='relu'))
model.add(Dense(units=1024, activation='relu'))
model.add(Dense(units=1024, activation='relu'))
model.add(Dense(units=1024, activation='relu'))
model.add(Dense(units=10, activation='softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='sgd',
metrics=['accuracy'])
try:
print(sess)
model.fit(x_train, y_train, epochs=5, batch_size=32)
y_pred = model.predict(x_train)
except (ResourceExhaustedError, UnknownError) as e:
if mem_amount > 1.0:
raise ValueError('model too large for vram')
else:
mem_amount += 0.05
clear_session()
errors += 1
pass
else:
clear_session()
if __name__ == "__main__":
model_trainer()
The puzzling thing is that Keras willingly takes the new session (as shown by a get_session() call), but won't apply the new GPUOptions.
In addition to the example above I have tried doing:
clear_session()
del model
clear_session()
del model
gc.collect()
None of this has worked in releasing the VRAM.
My overall goal is to use "trial and error" until the process has enough VRAM to train on, as there seems to be no good way of figuring out how much VRAM is needed for a Keras model without just running it, so that I can run multiple models in parallel on a single GPU. When the ResourceExhaustedError occurs, I want to release the VRAM that is held by Keras and then try again with a larger amount of VRAM. Is there any way to accomplish this?
After searching for a while, I found that Tensorflow will only take VRAM, and will never release it until it dies, even if del model, clear_session() is used. I also tried the method displayed here (https://github.com/keras-team/keras/issues/9379), which uses:
from keras import backend as K
K.clear_session()
from numba import cuda
cuda.select_device(0)
cuda.close()
This resulted in an error for me as when Tensorflow tried to access the GPU again, its pointer to the memory space was invalid (as it was killed with cuda.close()). Thus the only way around it is to use processes, and not threads (tried that too, same issue as before).
The other thing I found is that while there are methods to try to estimate the amount of VRAM a Keras model will use, it is not a very accurate way of doing it. (see: How to determine needed memory of Keras model?) I also tried computing directly from the Keras layers and that varied wildly, so that wasn't accurate either. So that really only leaves you to do trial an error by catching the ResourceExhaustedError and trying again.
Below is my code for running multiple different Keras model on a single GPU.
import GPUtil
import time
import multiprocessing
import tensorflow as tf
import numpy as np
from keras.backend.tensorflow_backend import set_session, clear_session, get_session
from tensorflow.python.framework.errors_impl import ResourceExhaustedError, UnknownError
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
def model_trainer():
mem_amount = 0.05
x_train = np.empty((100000, 100))
y_train = np.random.randint(0, 9, size=100000)
y_train = to_categorical(y_train, 10)
manager = multiprocessing.Manager()
return_dict = manager.dict()
def worker(mem_amount, return_dict):
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_amount)
config = tf.ConfigProto(
intra_op_parallelism_threads=2,
inter_op_parallelism_threads=2,
gpu_options=gpu_options)
sess = tf.Session(config=config)
set_session(sess)
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=100))
model.add(Dense(units=1024, activation='relu'))
model.add(Dense(units=1024, activation='relu'))
model.add(Dense(units=2048, activation='relu'))
model.add(Dense(units=10, activation='softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='sgd',
metrics=['accuracy'])
try:
get_session()
model.fit(x_train, y_train, epochs=5, batch_size=1000)
return_dict["valid"] = True
except (ResourceExhaustedError, UnknownError) as e:
return
while "valid" not in list(return_dict.keys()):
print("mem", mem_amount)
total_ram = GPUtil.getGPUs()[0].memoryTotal
total_ram_allowed = GPUtil.getGPUs()[0].memoryTotal * 0.90
# can add in a for loop to have multiple models
if total_ram_allowed > total_ram * mem_amount and GPUtil.getGPUs()[0].memoryFree > total_ram * mem_amount:
p = multiprocessing.Process(target=worker, args=(mem_amount, return_dict))
p.start()
p.join()
print(return_dict.values())
if "valid" not in list(return_dict.keys()):
if mem_amount > 1.0:
raise ValueError('model too large for vram')
else:
mem_amount += 0.05
else:
break
else:
time.sleep(10)
if __name__ == "__main__":
model_trainer()
Related
I am currently using the skopt (scikit-optimize) package for hyperparameter tuning of a neural network (I am trying to minimize -1* accuracy). It seems to run fine (and successfully prints to the console) for several iterations before it raises Value Error: array must not contain infs or NaNs.
What are some possible causes of this? My data does not contain infs or NaNs and neither do my search parameter ranges. The neural network code is quite long, so for brevity, I will paste the relevant sections:
Imports:
import pandas as pd
import numpy as np
from skopt import gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Categorical, Integer
from tensorflow.python.framework import ops
from sklearn.model_selection import train_test_split
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Dropout, MaxPooling1D, Flatten
from keras import backend as K
Creation of search parameters:
dim_num_filters_L1 = Integer(low=1, high=50, name='num_filters_L1')
#dim_kernel_size_L1 = Integer(low=1, high=70, name='kernel_size_L1')
dim_activation_L1 = Categorical(categories=['relu', 'linear', 'softmax'], name='activation_L1')
dim_num_filters_L2 = Integer(low=1, high=50, name='num_filters_L2')
#dim_kernel_size_L2 = Integer(low=1, high=70, name='kernel_size_L2')
dim_activation_L2 = Categorical(categories=['relu', 'linear', 'softmax'], name='activation_L2')
dim_num_dense_nodes = Integer(low=1, high=28, name='num_dense_nodes')
dim_activation_L3 = Categorical(categories=['relu', 'linear', 'softmax'], name='activation_L3')
dim_dropout_rate = Real(low = 0, high = 0.5, name = 'dropout_rate')
dim_learning_rate = Real(low=1e-4, high=1e-2, name='learning_rate')
dimensions = [dim_num_filters_L1,
#dim_kernel_size_L1,
dim_activation_L1,
dim_num_filters_L2,
#dim_kernel_size_L2,
dim_activation_L2,
dim_num_dense_nodes,
dim_activation_L3,
dim_dropout_rate,
dim_learning_rate,
]
Function that creates all models that will be tested:
def create_model(num_filters_L1, #kernel_size_L1,
activation_L1,
num_filters_L2, #kernel_size_L2,
activation_L2,
num_dense_nodes, activation_L3,
dropout_rate,
learning_rate):
input_shape = (X_train.shape[1], 1)
model = Sequential()
model.add(Conv1D(num_filters_L1, kernel_size = 40, activation = activation_L1, input_shape = input_shape))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(num_filters_L2, kernel_size=20, activation=activation_L2))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(num_dense_nodes, activation = activation_L3))
model.add(Dropout(dropout_rate))
model.add(Dense(y_train.shape[1], activation='linear'))
adam = tensorflow.keras.optimizers.Adam(learning_rate = learning_rate)
model.compile(optimizer=adam, loss='mean_squared_error', metrics=['accuracy'])
return model
Define fitness function:
#use_named_args(dimensions=dimensions)
def fitness(num_filters_L1, #kernel_size_L1,
activation_L1,
num_filters_L2, #kernel_size_L2,
activation_L2,
num_dense_nodes, activation_L3,
dropout_rate,
learning_rate):
model = create_model(num_filters_L1, #kernel_size_L1,
activation_L1,
num_filters_L2, #kernel_size_L2,
activation_L2,
num_dense_nodes, activation_L3,
dropout_rate,
learning_rate)
history_opt = model.fit(x=X_train,
y=y_train,
validation_data=(X_val,y_val),
shuffle=True,
verbose=2,
epochs=10
)
#return the validation accuracy for the last epoch.
accuracy_opt = model.evaluate(X_test,y_test)[1]
# Print the classification accuracy:
print("Experimental Model Accuracy: {0:.2%}".format(accuracy_opt))
# Delete the Keras model with these hyper-parameters from memory:
del model
# Clear the Keras session, otherwise it will keep adding new models to the same TensorFlow graph each time we create model with a different set of hyper-parameters.
K.clear_session()
ops.reset_default_graph()
# the optimizer aims for the lowest score, so return negative accuracy:
return -accuracy # or sum(RMSE)?
Run hyperparameter search:
gp_result = gp_minimize(func=fitness,
dimensions=dimensions)
print("best accuracy was " + str(round(gp_result.fun *-100,2))+"%.")
Your activation function is not converging in a random acquisition function call. I encountered this problem and removed 'relu' function from search space.
When the model is taking sufficiently long to infer (i.e. enough parameters and data big enough), and when profile_batch is on, the TensorBoard callback fails to write the training metrics to the log events (at least they are not visible in Tensorboard).
Here is the code used to get that failure:
import os.path as op
import time
import numpy as np
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Conv2D, Input
from tensorflow.keras.models import Model
size = 512
im = Input((size, size, 1))
im_conv = Conv2D(512, 3, padding='same', activation='relu')(im)
im_conv = Conv2D(1, 3, padding='same', activation='linear')(im_conv)
model = Model(im, im_conv)
model.compile(loss='mse', optimizer='adam', metrics=['mae'])
data = np.random.rand(1, size, size, 1)
run_id = f'{int(time.time())}'
log_dir = op.join('logs', run_id)
tboard_cback = TensorBoard(
log_dir=log_dir,
histogram_freq=0,
write_graph=False,
write_images=False,
profile_batch=2,
)
model.fit(
x=data,
y=data,
validation_data=[data, data],
callbacks=[tboard_cback,],
epochs=100,
verbose=0,
);
Here is the Tensorboard viz I have:
Is there something wrong with the way I am using this callback?
I use Python 3.6.8, tensorflow 2.0.0 on GPU (but the behaviour is the same on CPU).
So apparently, this is due to the profiling done in the callback. We can disable it via profile_batch=0. The issue is ongoing and to be followed here: https://github.com/tensorflow/tensorboard/issues/2084
I have a large dataset 5GB that I want to use for training a neural network model designed using Keras. Although I am using Nvidia Tesla P100 GPU, the training is really slow (each epoch takes ~ 60-70s) (I choose the batch size=10000). After reading and searching, I found out that I can improve the training speed by using keras fit_generator instead of the typical fit. To do so, I coded the following:
from __future__ import print_function
import numpy as np
from keras import Sequential
from keras.layers import Dense
import keras
from sklearn.model_selection import train_test_split
def generator(C, r, batch_size):
samples_per_epoch = C.shape[0]
number_of_batches = samples_per_epoch / batch_size
counter = 0
while 1:
X_batch = np.array(C[batch_size * counter:batch_size * (counter + 1)])
y_batch = np.array(r[batch_size * counter:batch_size * (counter + 1)])
counter += 1
yield X_batch, y_batch
# restart counter to yeild data in the next epoch as well
if counter >= number_of_batches:
counter = 0
if __name__ == "__main__":
X, y = readDatasetFromFile()
X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=.2)
model = Sequential()
model.add(Dense(16, input_dim=X.shape[1]))
model.add(keras.layers.advanced_activations.PReLU())
model.add(Dense(16))
model.add(keras.layers.advanced_activations.PReLU())
model.add(Dense(16))
model.add(keras.layers.advanced_activations.PReLU())
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
batch_size = 1000
model.fit_generator(generator(X_tr, y_tr, batch_size), epochs=200, steps_per_epoch=X.shape[0]/ batch_size,
validation_data=generator(X_ts, y_ts, batch_size * 2),
validation_steps=X.shape[0] / batch_size * 2, verbose=2, use_multiprocessing=True)
loss, accuracy = model.evaluate(X_ts, y_ts, verbose=0)
print(loss, accuracy)
After running with fit_generator, the training time improved a little bit but it is still slow (each epoch now takes ~ 40-50s). When running nvidia-smi in the terminal, I found out that GPU utilization is ~15% only which makes me wonder if my code is wrong. I am posting my code above to kindly ask you if there is a bug causing to slow the performance of GPU.
Thank you,
Just try assigning GPUs forcefully so:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" # or if you want more than 1 GPU set it as "0", "1"
I'm training a series of models in a for loop - to test a certain architecture. While doing so, I run out of memory and the system shuts down the process.
The same problem appears in this question and this question. To try their solutions, I did a test run with a similar loop to the one that is giving me problems. The code is:
def mem_test(n):
train_data = np.random.rand(1000,1500)
train_labels = np.random.randint(2,size= 1000)
mem = []
for i in range(n):
model = keras.Sequential([keras.layers.Dense(1000, activation= tf.nn.relu),
keras.layers.Dense(2,activation = tf.nn.softmax)])
model.compile(optimizer= tf.train.AdamOptimizer(.001), loss = 'sparse_categorical_crossentropy',
metrics = ['accuracy'])
model.fit(train_data,train_labels, epochs = 1)
mem.append(psutil.virtual_memory())
return mem
def mem_test_clear(n):
train_data = np.random.rand(1000,1500)
train_labels = np.random.randint(2,size= 1000)
mem = []
for i in range(n):
model = keras.Sequential([keras.layers.Dense(1000, activation= tf.nn.relu),
keras.layers.Dense(2,activation = tf.nn.softmax)])
model.compile(optimizer= tf.train.AdamOptimizer(.001), loss = 'sparse_categorical_crossentropy',
metrics = ['accuracy'])
model.fit(train_data,train_labels, epochs = 1)
mem.append(psutil.virtual_memory())
keras.backend.clear_session()
tf.reset_default_graph()
return mem
while the latter seems to do slightly better than the former, they both still end up accumulating memory usage. So, for my actual application of this, I'm left without a solution. What do I need to do in order to actually free up memory in this situation? What am I doing wrong?
You have to compile only once the model.
Then you can build a loop for fitting it:
import numpy as np
import psutil
import keras
import tensorflow as tf
def mem_test(n):
train_data = np.random.rand(1000,1500)
train_labels = np.random.randint(2,size= 1000)
mem = []
model = keras.Sequential([keras.layers.Dense(1000, activation= tf.nn.relu),
keras.layers.Dense(2,activation = tf.nn.softmax)])
model.compile(optimizer= tf.train.AdamOptimizer(.001), loss = 'sparse_categorical_crossentropy',
metrics = ['accuracy'])
for i in range(n):
model.fit(train_data,train_labels, epochs = 1)
mem.append(psutil.virtual_memory())
return mem
mem_test(50)
This way it will consume just a tiny amount of memory and will not accumulate anything. Furthermore this is the way how your model will work correctly.
I am running a keras script (no direct call to theano in my script) and I get the following error:
TypeError: ('An update must have the same type as the original shared
variable (shared_var=<TensorType(float32, matrix)>,
shared_var.type=TensorType(float32, matrix),
update_val=Elemwise{add,no_inplace}.0,
update_val.type=TensorType(float64, matrix)).',
'If the difference is related to the broadcast pattern,
you can call the tensor.unbroadcast(var, axis_to_unbroadcast[, ...])
function to remove broadcastable dimensions.')
I have seen the error from folks running theano directly, but not through keras. Not sure what I should do, since I am not dealing with tensors directly.
the problem was that there is a change in keras version (I am currently using keras 0.3.2 with theano 0.8.0) and what used to be fine does not work well with he new keras version.
The following was the original code, and see the fix below.
from keras.models import Sequential
import keras.optimizers
from keras.layers.core import Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers.core import Activation
from keras.optimizers import SGD, Adam
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, RegressorMixin
class NnRegression(BaseEstimator, RegressorMixin):
def __init__(self, apply_standart_scaling=True,
dropx=[0.2, 0.5, 0.5], nb_neuronx=[50, 30], nb_epoch=105, validation_split=0.,
verbose=1):
self.apply_standart_scaling = apply_standart_scaling
self.dropx = dropx
self.nb_neuronx = nb_neuronx
self.nb_epoch = nb_epoch
self.validation_split = validation_split
self.verbose = verbose
def fit(self, X, y):
nb_features = X.shape[1]
self.standart_scaling = StandardScaler() if self.apply_standart_scaling else None
if self.standart_scaling:
X = self.standart_scaling.fit_transform(X)
model = Sequential()
model.add(Dropout(input_shape = (nb_features,),p= self.dropx[0]))
model.add(Dense(output_dim = self.nb_neuronx[0], init='glorot_uniform'))
model.add(PReLU())
model.add(BatchNormalization(self.nb_neuronx[0],)))
model.add(Dropout(self.dropx[1]))
model.add(Dense(self.nb_neuronx[1], init='glorot_uniform'))
model.add(PReLU())
model.add(BatchNormalization(self.nb_neuronx[0],)))
model.add(Dropout(self.dropx[2]))
model.add(Dense(1, init='glorot_uniform'))
nn_verbose = 1 if self.verbose>0 else 0
optz = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=Adam(),loss='mse')
model.fit(X, y, batch_size=16,
nb_epoch=self.nb_epoch, validation_split=self.validation_split, verbose=nn_verbose)
self.model = model
def predict(self, X):
if self.standart_scaling:
X = self.standart_scaling.transform(X)
return self.model.predict_proba(X, verbose=0)
well, it turns out that the problem is this single line of code:
model.add(BatchNormalization(self.nb_neuronx[0],)))
It should actually be:
model.add(BatchNormalization())
because the number of neurons has no business within the normalization layer (however this did not bother in a previous keras version).
This apparently causes theano to generate new weights that are not float32 but float64, and that triggers the message above.