Dropout layer directly in tensorflow: how to train? - python

After I created my model in Keras, I want to get the gradients and apply them directly in Tensorflow with the tf.train.AdamOptimizer class. However, since I am using a Dropout layer, I don't know how to tell to the model whether it is in the training mode or not. The training keyword is not accepted. This is the code:
net_input = Input(shape=(1,))
net_1 = Dense(50)
net_2 = ReLU()
net_3 = Dropout(0.5)
net = Model(net_input, net_3(net_2(net_1(net_input))))
#mycost = ...
optimizer = tf.train.AdamOptimizer()
gradients = optimizer.compute_gradients(mycost, var_list=[net.trainable_weights])
# perform some operations on the gradients
# gradients = ...
trainstep = optimizer.apply_gradients(gradients)
I get the same behavior with and without dropout layer, even with dropout rate=1. How to solve this?

As #Sharky already said you can use training argument while invoking call() method of Dropout class. However, if you want to train in tensorflow graph mode you need to pass a placeholder and feed it boolean value during training. Here is the example of fitting Gaussian blobs applicable to your case:
import tensorflow as tf
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import ReLU
from tensorflow.keras.layers import Input
from tensorflow.keras import Model
x_train, y_train = make_blobs(n_samples=10,
n_features=2,
centers=[[1, 1], [-1, -1]],
cluster_std=1)
x_train, x_test, y_train, y_test = train_test_split(
x_train, y_train, test_size=0.2)
# `istrain` indicates whether it is inference or training
istrain = tf.placeholder(tf.bool, shape=())
y = tf.placeholder(tf.int32, shape=(None))
net_input = Input(shape=(2,))
net_1 = Dense(2)
net_2 = Dense(2)
net_3 = Dropout(0.5)
net = Model(net_input, net_3(net_2(net_1(net_input)), training=istrain))
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=y, logits=net.output)
loss_fn = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(0.01)
grads_and_vars = optimizer.compute_gradients(loss_fn,
var_list=[net.trainable_variables])
trainstep = optimizer.apply_gradients(grads_and_vars)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
l1 = loss_fn.eval({net_input:x_train,
y:y_train,
istrain:True}) # apply dropout
print(l1) # 1.6264652
l2 = loss_fn.eval({net_input:x_train,
y:y_train,
istrain:False}) # no dropout
print(l2) # 1.5676715
sess.run(trainstep, feed_dict={net_input:x_train,
y:y_train,
istrain:True}) # train with dropout

Keras layers inherit from tf.keras.layers.Layer class. Keras API handle this internally with model.fit. In case Keras Dropout is used with pure TensorFlow training loop, it supports a training argument in its call function.
So you can control it with
dropout = tf.keras.layers.Dropout(rate, noise_shape, seed)(prev_layer, training=is_training)
From official TF docs
Note: - The following optional keyword arguments are reserved for
specific uses: * training: Boolean scalar tensor of Python boolean
indicating whether the call is meant for training or inference. *
mask: Boolean input mask. - If the layer's call method takes a mask
argument (as some Keras layers do), its default value will be set to
the mask generated for inputs by the previous layer (if input did come
from a layer that generated a corresponding mask, i.e. if it came from
a Keras layer with masking support.
https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dropout#call

Related

skopt's gp_minimize() function raises ValueError: array must not contain infs or NaNs

I am currently using the skopt (scikit-optimize) package for hyperparameter tuning of a neural network (I am trying to minimize -1* accuracy). It seems to run fine (and successfully prints to the console) for several iterations before it raises Value Error: array must not contain infs or NaNs.
What are some possible causes of this? My data does not contain infs or NaNs and neither do my search parameter ranges. The neural network code is quite long, so for brevity, I will paste the relevant sections:
Imports:
import pandas as pd
import numpy as np
from skopt import gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Categorical, Integer
from tensorflow.python.framework import ops
from sklearn.model_selection import train_test_split
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Dropout, MaxPooling1D, Flatten
from keras import backend as K
Creation of search parameters:
dim_num_filters_L1 = Integer(low=1, high=50, name='num_filters_L1')
#dim_kernel_size_L1 = Integer(low=1, high=70, name='kernel_size_L1')
dim_activation_L1 = Categorical(categories=['relu', 'linear', 'softmax'], name='activation_L1')
dim_num_filters_L2 = Integer(low=1, high=50, name='num_filters_L2')
#dim_kernel_size_L2 = Integer(low=1, high=70, name='kernel_size_L2')
dim_activation_L2 = Categorical(categories=['relu', 'linear', 'softmax'], name='activation_L2')
dim_num_dense_nodes = Integer(low=1, high=28, name='num_dense_nodes')
dim_activation_L3 = Categorical(categories=['relu', 'linear', 'softmax'], name='activation_L3')
dim_dropout_rate = Real(low = 0, high = 0.5, name = 'dropout_rate')
dim_learning_rate = Real(low=1e-4, high=1e-2, name='learning_rate')
dimensions = [dim_num_filters_L1,
#dim_kernel_size_L1,
dim_activation_L1,
dim_num_filters_L2,
#dim_kernel_size_L2,
dim_activation_L2,
dim_num_dense_nodes,
dim_activation_L3,
dim_dropout_rate,
dim_learning_rate,
]
Function that creates all models that will be tested:
def create_model(num_filters_L1, #kernel_size_L1,
activation_L1,
num_filters_L2, #kernel_size_L2,
activation_L2,
num_dense_nodes, activation_L3,
dropout_rate,
learning_rate):
input_shape = (X_train.shape[1], 1)
model = Sequential()
model.add(Conv1D(num_filters_L1, kernel_size = 40, activation = activation_L1, input_shape = input_shape))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(num_filters_L2, kernel_size=20, activation=activation_L2))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(num_dense_nodes, activation = activation_L3))
model.add(Dropout(dropout_rate))
model.add(Dense(y_train.shape[1], activation='linear'))
adam = tensorflow.keras.optimizers.Adam(learning_rate = learning_rate)
model.compile(optimizer=adam, loss='mean_squared_error', metrics=['accuracy'])
return model
Define fitness function:
#use_named_args(dimensions=dimensions)
def fitness(num_filters_L1, #kernel_size_L1,
activation_L1,
num_filters_L2, #kernel_size_L2,
activation_L2,
num_dense_nodes, activation_L3,
dropout_rate,
learning_rate):
model = create_model(num_filters_L1, #kernel_size_L1,
activation_L1,
num_filters_L2, #kernel_size_L2,
activation_L2,
num_dense_nodes, activation_L3,
dropout_rate,
learning_rate)
history_opt = model.fit(x=X_train,
y=y_train,
validation_data=(X_val,y_val),
shuffle=True,
verbose=2,
epochs=10
)
#return the validation accuracy for the last epoch.
accuracy_opt = model.evaluate(X_test,y_test)[1]
# Print the classification accuracy:
print("Experimental Model Accuracy: {0:.2%}".format(accuracy_opt))
# Delete the Keras model with these hyper-parameters from memory:
del model
# Clear the Keras session, otherwise it will keep adding new models to the same TensorFlow graph each time we create model with a different set of hyper-parameters.
K.clear_session()
ops.reset_default_graph()
# the optimizer aims for the lowest score, so return negative accuracy:
return -accuracy # or sum(RMSE)?
Run hyperparameter search:
gp_result = gp_minimize(func=fitness,
dimensions=dimensions)
print("best accuracy was " + str(round(gp_result.fun *-100,2))+"%.")
Your activation function is not converging in a random acquisition function call. I encountered this problem and removed 'relu' function from search space.

keras rl - dqn model update

I am reading through the DQN implementation in keras-rl /rl/agents/dqn.py and see that in the compile() step essentially 3 keras models are instantiated:
self.model : provides q value predictions
self.trainable_model : same as self.model but has the loss function we want to train
self.target_model : target model which provides the q targets and is updated every k steps with the weights from self.model
The only model on which train_on_batch() is called is trainable_model, however - and this is what I don't understand - this also updates the weights of model.
In the definition of trainable_model one of the output tensors y_pred is referencing the output from model:
y_pred = self.model.output
y_true = Input(name='y_true', shape=(self.nb_actions,))
mask = Input(name='mask', shape=(self.nb_actions,))
loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='loss')([y_true, y_pred, mask])
ins = [self.model.input] if type(self.model.input) is not list else self.model.input
trainable_model = Model(inputs=ins + [y_true, mask], outputs=[loss_out, y_pred])
When trainable_model.train_on_batch() is called, BOTH the weights in trainable_model and in model change. I am surprised because even though the two models reference the same output tensor object (trainable_model.y_pred = model.output), the instantiation of trainable_model = Model(...) should also instantiate a new set of weights, no?
Thanks for the help!
This is a small example to show that when you instantiate a new keras.models.Model() using the input and output tensor from another model, then the weights of these two models are shared. They don't get re-initialized.
# keras version: 2.2.4
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.optimizers import SGD
np.random.seed(123)
model1 = Sequential()
model1.add(Dense(1, input_dim=1, activation="linear", name="model1_dense1", weights=[np.array([[10]]),np.array([10])]))
model1.compile(optimizer=SGD(), loss="mse")
model2 = Model(inputs=model1.input, outputs=model1.output)
model2.compile(optimizer=SGD(), loss="mse")
x = np.random.normal(size=2000)
y = 2 * x + np.random.normal(size=2000)
print("model 1 weights", model1.get_weights())
print("model 2 weights", model2.get_weights())
model2.fit(x,y, epochs=3, batch_size=32)
print("model 1 weights", model1.get_weights())
print("model 2 weights", model2.get_weights())
Definitely something to keep in mind. Wasnt intuitive to me.

How to get tf.gradients from keras API model?

I would like to know how to get tf.gradients from a model built using the Keras API.
import Tensorflow as tf
from tensorflow import keras
from sklearn.datasets.samples_generator import make_blobs
# Create the model
inputs = keras.Input(shape=(2,))
x = keras.layers.Dense(12, activation='relu')(inputs)
x = keras.layers.Dense(8, activation='relu')(x)
predictions = keras.layers.Dense(3, activation='softmax')(x)
model = keras.models.Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=tf.train.AdamOptimizer(0.001),
loss='categorical_crossentropy',
metrics=['accuracy'])
# Generate random data
X, y = make_blobs(n_samples=1000, centers=3, n_features=2)
labels = keras.utils.to_categorical(y, num_classes=3)
# Compute the gradients wrt inputs
y_true = tf.convert_to_tensor(labels)
y_pred = tf.convert_to_tensor(np.round(model.predict(X)))
sess = tf.Session()
sess.run(tf.global_variables_initializer())
grads = tf.gradients(model.loss_functions[0](y_true, y_pred),
model.inputs[0])
sess.run(grads, input_dict={model.inputs[0]: X, model.outputs: y})
First attempt above: my grads are None. With my second try below:
sess.run(grads, input_dict={model.inputs: X, model.outputs: y })
I get the following error:
TypeError: unhashable type: 'list'
I think you shouldn't create a new session directly with Tensorflow when using Keras. Instead, it is better to use the session implicitly created by Keras:
import keras.backend as K
sess = K.get_session()
However, I think in this case you don't need to retrieve the session at all. You can easily use backend functions, like K.gradients() and K.function(), to achieve your aim.

TensorFlow how to make results reproducible for `tf.nn.sampled_softmax_loss`

I would like to get reproducible results for my tensorflow runs. The way I'm trying to make this happen is to set up the numpy and tensorflow seeds:
import numpy as np
rnd_seed = 1
np.random.seed(rnd_seed)
import tensorflow as tf
tf.set_random_seed(rnd_seed)
As well as make sure that the weights of the neural network, that I initialized with tf.truncated_normal also use that seed: tf.truncated_normal(..., seed=rnd_seed)
For reasons that are beyond the scope of this question, I'm using the sampled softmax loss function, tf.nn.sampled_softmax_loss, and unfortunately, I'm not able to control the stochasticity of this function with a random seed.
By a look at the TensorFlow documentation of this function (https://www.tensorflow.org/api_docs/python/tf/nn/sampled_softmax_loss), I can see that parameter sampled_values should be the only parameter that affects randomization, but I'm not able to understand how to actually use a seed.
[EDITED]
This is (part of) my script
import numpy as np
# set a seed so that the results are consistent
rnd_seed = 1
np.random.seed(rnd_seed)
import tensorflow as tf
tf.set_random_seed(rnd_seed)
embeddings_ini = np.random.uniform(low=-1, high=1, size=(self.vocabulary_size, self.embedding_size))
with graph.as_default(), tf.device('/cpu:0'):
train_dataset = tf.placeholder(tf.int32, shape=[None, None])
train_labels = tf.placeholder(tf.int32, shape=[None, 1])
valid_dataset = tf.constant(self.valid_examples, dtype=tf.int32)
# Variables.
initial_embeddings = tf.placeholder(tf.float32, shape=(self.vocabulary_size, self.embedding_size))
embeddings = tf.Variable(initial_embeddings)
softmax_weights = tf.Variable(
tf.truncated_normal([self.vocabulary_size, self.embedding_size],
stddev=1.0 / math.sqrt(self.embedding_size), seed=rnd_seed))
softmax_biases = tf.Variable(tf.zeros([self.vocabulary_size]))
# Model.
# Look up embeddings for inputs.
if self.model == "skipgrams":
# Skipgram model
embed = tf.nn.embedding_lookup(embeddings, train_dataset)
elif self.model == "cbow":
# CBOW Model
embeds = tf.nn.embedding_lookup(embeddings, train_dataset)
embed = tf.reduce_mean(embeds, 1, keep_dims=False)
# Compute the softmax loss, using a sample of the negative labels each time.
loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=softmax_weights,
biases=softmax_biases,
inputs=embed,
labels=train_labels,
num_sampled=self.num_sampled,
num_classes=self.vocabulary_size))
I finally, found out how to make results reproducible. Just like #Anis suggested I should've set the graph seed and this can be done by:
with graph.as_default(), tf.device('/cpu:0'):
tf.set_random_seed(1234)

Make predictions using a tensorflow graph from a keras model

I have a model trained using Keras with Tensorflow as my backend, but now I need to turn my model into a tensorflow graph for a certain application. I attempted to do this and make predictions to insure that it is working correctly, but when comparing to the results gathered from model.predict() I get very different values. For instance:
from keras.models import load_model
import tensorflow as tf
model = load_model('model_file.h5')
x_placeholder = tf.placeholder(tf.float32, shape=(None,7214,1))
y = model(x_placeholder)
x = np.ones((1,7214,1))
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
print("Predictions from:\ntf graph: "+str(sess.run(y, feed_dict={x_placeholder:x})))
print("keras predict: "+str(model.predict(x)))
returns:
Predictions from:
tf graph: [[-0.1015993 0.07432419 0.0592984 ]]
keras predict: [[ 0.39339241 0.57949686 -3.67846966]]
The values from keras predict are correct, but the tf graph results are not.
If it helps to know the final intended application, I am creating a jacobian matrix with the tf.gradients() function, but currently it does not return the correct results when comparing to theano's jacobian function, which gives the correct jacobian. Here is my tensorflow jacobian code:
x = tf.placeholder(tf.float32, shape=(None,7214,1))
y = tf.reshape(model(x)[0],[-1])
y_list = tf.unstack(y)
jacobian_list = [tf.gradients(y_, x)[0] for y_ in y_list]
jacobian = tf.stack(jacobian_list)
EDIT: Model code
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, InputLayer, Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
# activation function used following every layer except for the output layers
activation = 'relu'
# model weight initializer
initializer = 'he_normal'
# shape of input data that is fed into the input layer
input_shape = (None,7214,1)
# number of filters used in the convolutional layers
num_filters = [4,16]
# length of the filters in the convolutional layers
filter_length = 8
# length of the maxpooling window
pool_length = 4
# number of nodes in each of the hidden fully connected layers
num_hidden_nodes = [256,128]
# number of samples fed into model at once during training
batch_size = 64
# maximum number of interations for model training
max_epochs = 30
# initial learning rate for optimization algorithm
lr = 0.0007
# exponential decay rate for the 1st moment estimates for optimization algorithm
beta_1 = 0.9
# exponential decay rate for the 2nd moment estimates for optimization algorithm
beta_2 = 0.999
# a small constant for numerical stability for optimization algorithm
optimizer_epsilon = 1e-08
model = Sequential([
InputLayer(batch_input_shape=input_shape),
Conv1D(kernel_initializer=initializer, activation=activation, padding="same", filters=num_filters[0], kernel_size=filter_length),
Conv1D(kernel_initializer=initializer, activation=activation, padding="same", filters=num_filters[1], kernel_size=filter_length),
MaxPooling1D(pool_size=pool_length),
Flatten(),
Dense(units=num_hidden_nodes[0], kernel_initializer=initializer, activation=activation),
Dense(units=num_hidden_nodes[1], kernel_initializer=initializer, activation=activation),
Dense(units=3, activation="linear", input_dim=num_hidden_nodes[1]),
])
# compile model
loss_function = mean squared error
early_stopping_min_delta = 0.0001
early_stopping_patience = 4
reduce_lr_factor = 0.5
reuce_lr_epsilon = 0.0009
reduce_lr_patience = 2
reduce_lr_min = 0.00008
optimizer = Adam(lr=lr, beta_1=beta_1, beta_2=beta_2, epsilon=optimizer_epsilon, decay=0.0)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=early_stopping_min_delta,
patience=early_stopping_patience, verbose=2, mode='min')
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, epsilon=reuce_lr_epsilon,
patience=reduce_lr_patience, min_lr=reduce_lr_min, mode='min', verbose=2)
model.compile(optimizer=optimizer, loss=loss_function)
model.fit(train_x, train_y, validation_data=(cv_x, cv_y),
epochs=max_epochs, batch_size=batch_size, verbose=2,
callbacks=[reduce_lr,early_stopping])
model.save('model_file.h5')
#frankyjuang linked me to here
https://github.com/amir-abdi/keras_to_tensorflow
and combining this with code from
https://github.com/metaflow-ai/blog/blob/master/tf-freeze/load.py
and
https://github.com/tensorflow/tensorflow/issues/675
I have found a solution to both predicting using a tf graph and creating the jacobian function:
import tensorflow as tf
import numpy as np
# Create function to convert saved keras model to tensorflow graph
def convert_to_pb(weight_file,input_fld='',output_fld=''):
import os
import os.path as osp
from tensorflow.python.framework import graph_util
from tensorflow.python.framework import graph_io
from keras.models import load_model
from keras import backend as K
# weight_file is a .h5 keras model file
output_node_names_of_input_network = ["pred0"]
output_node_names_of_final_network = 'output_node'
# change filename to a .pb tensorflow file
output_graph_name = weight_file[:-2]+'pb'
weight_file_path = osp.join(input_fld, weight_file)
net_model = load_model(weight_file_path)
num_output = len(output_node_names_of_input_network)
pred = [None]*num_output
pred_node_names = [None]*num_output
for i in range(num_output):
pred_node_names[i] = output_node_names_of_final_network+str(i)
pred[i] = tf.identity(net_model.output[i], name=pred_node_names[i])
sess = K.get_session()
constant_graph = graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), pred_node_names)
graph_io.write_graph(constant_graph, output_fld, output_graph_name, as_text=False)
print('saved the constant graph (ready for inference) at: ', osp.join(output_fld, output_graph_name))
return output_fld+output_graph_name
Call:
tf_model_path = convert_to_pb('model_file.h5','/model_dir/','/model_dir/')
Create function to load the tf model as a graph:
def load_graph(frozen_graph_filename):
# We load the protobuf file from the disk and parse it to retrieve the
# unserialized graph_def
with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
# Then, we can use again a convenient built-in function to import a graph_def into the
# current default Graph
with tf.Graph().as_default() as graph:
tf.import_graph_def(
graph_def,
input_map=None,
return_elements=None,
name="prefix",
op_dict=None,
producer_op_list=None
)
input_name = graph.get_operations()[0].name+':0'
output_name = graph.get_operations()[-1].name+':0'
return graph, input_name, output_name
Create a function to make model predictions using the tf graph
def predict(model_path, input_data):
# load tf graph
tf_model,tf_input,tf_output = load_graph(model_path)
# Create tensors for model input and output
x = tf_model.get_tensor_by_name(tf_input)
y = tf_model.get_tensor_by_name(tf_output)
# Number of model outputs
num_outputs = y.shape.as_list()[0]
predictions = np.zeros((input_data.shape[0],num_outputs))
for i in range(input_data.shape[0]):
with tf.Session(graph=tf_model) as sess:
y_out = sess.run(y, feed_dict={x: input_data[i:i+1]})
predictions[i] = y_out
return predictions
Make predictions:
tf_predictions = predict(tf_model_path,test_data)
Jacobian function:
def compute_jacobian(model_path,input_data):
tf_model,tf_input,tf_output = load_graph(model_path)
x = tf_model.get_tensor_by_name(tf_input)
y = tf_model.get_tensor_by_name(tf_output)
y_list = tf.unstack(y)
num_outputs = y.shape.as_list()[0]
jacobian = np.zeros((num_outputs,input_data.shape[0],input_data.shape[1]))
for i in range(input_data.shape[0]):
with tf.Session(graph=tf_model) as sess:
y_out = sess.run([tf.gradients(y_, x)[0] for y_ in y_list], feed_dict={x: input_data[i:i+1]})
jac_temp = np.asarray(y_out)
jacobian[:,i:i+1,:]=jac_temp[:,:,:,0]
return jacobian
Compute Jacobian Matrix:
jacobians = compute_jacobian(tf_model_path,test_data)

Categories

Resources