How to use BERT for performing regression? - python

Here is the code and dataset
This is the function used for building the model:
def build_regression_model():
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
encoder_inputs = preprocessing_layer(text_input)
encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
outputs = encoder(encoder_inputs)
net = outputs['pooled_output']
net = tf.keras.layers.Dense(units=1)(net)
return tf.keras.Model(text_input, net)
regression_model = build_regression_model()
The loss is:
loss='mean_absolute_error'
The optimizer is:
optimizer=tf.optimizers.Adam(learning_rate=0.1)
Can someone please provide tips to solve this problem. Thank you!

Related

Preprocessing Layers and KerasTuner Cooperation

I'm having troubles making the Preprocessing layers and the Keras Tuner cooperate.
I am referring to this tutorial Load CSV data | TensorFlow Core for the Preprocessing part, and to the Getting started with KerasTuner documentation for the Keras Tuner part.
Briefly, here's the code.
He loads the data:
titanic = pd.read_csv("https://storage.googleapis.com/tf-datasets/titanic/train.csv")
titanic_features = titanic.copy()
titanic_labels = titanic_features.pop('survived')
He creates the symbolic tensors of the features in a dictionary
inputs = {}
for name, column in titanic_features.items():
dtype = column.dtype
if dtype == object:
dtype = tf.string
else:
dtype = tf.float32
inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)
inputs
Then he applies normalization on the numerical features:
numeric_inputs = {name:input for name,input in inputs.items()
if input.dtype==tf.float32}
x = layers.Concatenate()(list(numeric_inputs.values()))
norm = layers.Normalization()
norm.adapt(np.array(titanic[numeric_inputs.keys()]))
all_numeric_inputs = norm(x)
all_numeric_inputs
He creates a list
preprocessed_inputs = [all_numeric_inputs]
He one hot encodes the categorical features:
for name, input in inputs.items():
if input.dtype == tf.float32:
continue
lookup = layers.StringLookup(vocabulary=np.unique(titanic_features[name]))
one_hot = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())
x = lookup(input)
x = one_hot(x)
preprocessed_inputs.append(x)
and then He concatenates it:
preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)
titanic_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)
Now what I want to do is insert this preprocessing part in KerasTuner. I tried this:
def titanic_model(units, activation):
model_inputs = tf.keras.Input(shape=28)
dense_1 = layers.Dense(units=units, activation=activation)(model_inputs)
dense_output = layers.Dense(1)(dense_1)
body = tf.keras.Model(inputs = model_inputs, outputs = dense_output)
return body
def build_model(hp,preprocessing_head, inputs):
units = hp.Int("units", min_value=32, max_value=512, step=32)
activation = hp.Choice("activation", ["relu", "tanh"])
preprocessed_inputs = preprocessing_head(inputs)
result = titanic_model(units,
activation)(preprocessed_inputs)
model = tf.keras.Model(inputs, result)
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
optimizer=tf.keras.optimizers.Adam(),
metrics = ['accuracy'])
return model
titanic_model = build_model(keras_tuner.HyperParameters(),titanic_preprocessing, inputs)
but it gives me the following error:
Inputs to a layer should be tensors. Got: <keras_tuner.engine.hyperparameters.HyperParameters
object at 0x7ff52844da30>
I cannot understand if I am close to the solution, or this is not the right way to proceed.
However, the workaround i found was to insert directly in the build_model function the preprocessing layer (titanic_preprocessing) and the inputs dictionary, without passing it as an argument of the function.
Hence:
def titanic_model(units, activation):
model_inputs = tf.keras.Input(shape=28)
dense_1 = layers.Dense(units=units, activation=activation)(model_inputs)
dense_output = layers.Dense(1)(dense_1)
body = tf.keras.Model(inputs = model_inputs, outputs = dense_output)
return body
def build_model(hp):
units = hp.Int("units", min_value=32, max_value=512, step=32)
activation = hp.Choice("activation", ["relu", "tanh"])
preprocessed_inputs = titanic_preprocessing(inputs)
result = titanic_model(units,
activation)(preprocessed_inputs)
model = tf.keras.Model(inputs, result)
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
optimizer=tf.keras.optimizers.Adam(),
metrics = ['accuracy'])
return model
titanic_model = build_model()
In this case it seems to work, and by setting the tuner
tuner = keras_tuner.RandomSearch(
hypermodel = build_model,
objective=keras_tuner.Objective("accuracy", direction="max"),
max_trials = 1,
overwrite = True,
directory = "tuner_dir",
project_name = "regression_tuner")
and searching it works:
tuner.search(x=titanic_features_dict, y=titanic_labels, epochs=10)
However, I have doubts about this solution and would appreciate feedback on this.
Thank you!

BERT Encoder layer is non-trainable

I am trying to fine-tune a BERT model from TensorFlow hub. I loaded the preprocessing layer and the encoder as follow :
bert_preprocess_model = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3')
bert_model = hub.KerasLayer('https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1')
And this is my model definition :
def build_classifier_model():
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessing_layer = hub.KerasLayer(bert_preprocess_model, name='preprocessing')
encoder_inputs = preprocessing_layer(text_input)
encoder = hub.KerasLayer(bert_model, trainable=True, name='BERT_encoder')
outputs = encoder(encoder_inputs)
net = outputs['pooled_output']
net = tf.keras.layers.Dropout(0.1)(net)
net = tf.keras.layers.Dense(3, activation='softmax', name='classifier')(net)
return tf.keras.Model(text_input, net)
classifier_model = build_classifier_model()
But I get the following error : ERROR:absl:hub.KerasLayer is trainable but has zero trainable weights.
In the official website, the model is fine-tunable.
I found the solution, simply add trainable = True :
bert_model = hub.KerasLayer('https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',trainable=True)

Re-implementing TF 1.0 sampled_softmax_loss funtion for seq2seq model in to TF 2 Keras model

I have a TF 1.0.1 code of seq2seq model. I am trying to rewrite it using Tensorflow Keras.
TF 1.0.1 code has following decoder architecure:
with tf.variable_scope("decoder_scope") as decoder_scope:
# output projection
# we need to specify output projection manually, because sampled softmax needs to have access to the the projection matrix
output_projection_w_t = tf.get_variable("output_projection_w", [vocabulary_size, state_size], dtype=DTYPE)
output_projection_w = tf.transpose(output_projection_w_t)
output_projection_b = tf.get_variable("output_projection_b", [vocabulary_size], dtype=DTYPE)
decoder_cell = tf.contrib.rnn.LSTMCell(num_units=state_size)
decoder_cell = DtypeDropoutWrapper(cell=decoder_cell, output_keep_prob=tf_keep_probabiltiy, dtype=DTYPE)
decoder_cell = contrib_rnn.MultiRNNCell(cells=[decoder_cell] * num_lstm_layers, state_is_tuple=True)
# define decoder train netowrk
decoder_outputs_tr, _ , _ = dynamic_rnn_decoder(
cell=decoder_cell,
decoder_fn= simple_decoder_fn_train(last_encoder_state, name=None),
inputs=decoder_inputs,
sequence_length=decoder_sequence_lengths,
parallel_iterations=None,
swap_memory=False,
time_major=False)
# define decoder inference network
decoder_scope.reuse_variables()
Here is how the sampled_softmax_loss is calculated:
decoder_forward_outputs = tf.reshape(decoder_outputs_tr,[-1, state_size])
decoder_target_labels = tf.reshape(decoder_labels ,[-1, 1]) #decoder_labels is target sequnce of decoder
sampled_softmax_losses = tf.nn.sampled_softmax_loss(
weights = output_projection_w_t,
biases = output_projection_b,
inputs = decoder_forward_outputs,
labels = decoder_target_labels ,
num_sampled = 500,
num_classes=vocabulary_size,
num_true = 1,
)
total_loss_op = tf.reduce_mean(sampled_softmax_losses)
And, this is my decoder in Keras:
decoder_inputs = tf.keras.Input(shape=(None,), name='decoder_input')
emb_layer = tf.keras.layers.Embedding(vocabulary_size, state_size)
x_d = emb_layer(decoder_inputs)
d_lstm_layer = tf.keras.layers.LSTM(embed_dim, return_sequences=True)
d_lstm_out = d_lstm_layer(x_d, initial_state=encoder_states)
This is my sampled_softmax_loss function I use for Keras model:
class SampledSoftmaxLoss(object):
def __init__(self, model):
self.model = model
output_layer = model.layers[-1]
self.input = output_layer.input
self.weights = output_layer.weights
def loss(self, y_true, y_pred, **kwargs):
loss = tf.nn.sampled_softmax_loss(
weights=self.weights[0],
biases=self.weights[1],
labels=tf.reshape(y_true ,[-1, 1]),
inputs=tf.reshape(d_lstm_out,[-1, state_size]),
num_sampled = 500,
num_classes = vocabulary_size
)
But, it does not work.
Can anyone help me to implement sampled_loss_funtion in Keras correctly.

Restore keras seq2seq model

I'm working with the keras seq2seq example here:
https://github.com/keras-team/keras/blob/master/examples/lstm_seq2seq.py
I would like to persist the vocabulary and decoder so that I can load it again later, and apply it to new sequences.
While the code calls model.save(), this is insufficient because I can see the decoding setup referencing a number of other variables which are deep pointers into the trained model:
encoder_model = Model(encoder_inputs, encoder_states)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
[decoder_inputs] + decoder_states_inputs,
[decoder_outputs] + decoder_states)
I would like to translate this code to determine encoder_inputs, encoder_states, latent_dim, decoder_inputs from a model loaded from disk. It's ok to assume I know the model architecture in advance. Is there a straightforward way to do this?
Update:
I have made some progress using the decoder construction code and pulling out the layer inputs/outputs as needed.
encoder_inputs = model.input[0] #input_1
decoder_inputs = model.input[1] #input_2
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output # lstm_1
_, state_h_dec, state_c_dec = model.layers[3].output # lstm_2
decoder_outputs = model.layers[4].output # dense_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)
latent_dim = 256 # TODO: infer this from the model. Should match lstm_1 outputs.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_states = [state_h_dec, state_c_dec]
decoder_model = Model(
[decoder_inputs] + decoder_states_inputs,
[decoder_outputs] + decoder_states)
However, when I try to construct the decoder model, I encounter this error:
RuntimeError: Graph disconnected: cannot obtain value for tensor Tensor("input_1:0", shape=(?, ?, 96), dtype=float32) at layer "input_1". The following previous layers were accessed without issue: []
As a test I tried Model(decoder_inputs,decoder_outputs) with the same result. It's not clear to me what is disconnected from the graph, since these layers are loaded from the model.
Ok, I solved this problem and the decoder is producing reasonable results. In my code above I missed a couple details in the decoder step, specifically that it call()s the LSTM and Dense layers in order to wire them up. In addition, the new decoder inputs need unique names so they don't collide with input_1 and input_2 (this detail smells like a keras bug).
encoder_inputs = model.input[0] #input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)
decoder_inputs = model.input[1] #input_2
decoder_state_input_h = Input(shape=(latent_dim,),name='input_3')
decoder_state_input_c = Input(shape=(latent_dim,),name='input_4')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[4]
decoder_outputs=decoder_dense(decoder_outputs)
decoder_model = Model(
[decoder_inputs] + decoder_states_inputs,
[decoder_outputs] + decoder_states)
A big drawback with this code is the fact we know the full architecture in advance. I would like to eventually be able to load an architecture-agnostic decoder.
At a point in the code of the Keras seq2seq example you will have a finished encoder and decoder model. You can save the architecture and weights of these models to disk and load them later. The following works for me:
Save the models to disk:
with open('encoder_model.json', 'w', encoding='utf8') as f:
f.write(encoder_model.to_json())
encoder_model.save_weights('encoder_model_weights.h5')
with open('decoder_model.json', 'w', encoding='utf8') as f:
f.write(decoder_model.to_json())
decoder_model.save_weights('decoder_model_weights.h5')
Later load the encoder and decoder:
def load_model(model_filename, model_weights_filename):
with open(model_filename, 'r', encoding='utf8') as f:
model = model_from_json(f.read())
model.load_weights(model_weights_filename)
return model
encoder = load_model('encoder_model.json', 'encoder_model_weights.h5')
decoder = load_model('decoder_model.json', 'decoder_model_weights.h5')
During prediction you will also need a number of other data, like number of encoder/decoder tokens, dictionaries mapping char to index etc. You can just save these to file after training and load them later, just like with the models.

Saving and restoring Keras BLSTM CTC model

I have been working on speech emotion recognition deep neural network. I have used keras Bidirectional LSTM with CTC loss. i trained the model and saved it
model_json = model.to_json()
with open("ctc_model.json", "w") as json_file:
json_file.write(model_json)
model.save_weights("ctc_weights.h5")
The problem is i can not use this model to test on on unseen data because the model accepts 4 argument as input and calculates the ctc loss..just build the model and train. so how can i save a model in such away that in require only one input. not the labels, and length. Basically how can i save a model as this function test_func = K.function([net_input], [output])
def ctc_lambda_func(args):
y_pred, labels, input_length, label_length = args
shift = 2
y_pred = y_pred[:, shift:, :]
input_length -= shift
return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
def build_model(nb_feat, nb_class, optimizer='Adadelta'):
net_input = Input(name="the_input", shape=(200, nb_feat))
forward_lstm1 = LSTM(output_dim=64,
return_sequences=True,
activation="tanh"
)(net_input)
backward_lstm1 = LSTM(output_dim=64,
return_sequences=True,
activation="tanh",
go_backwards=True
)(net_input)
blstm_output1 = Merge(mode='concat')([forward_lstm1, backward_lstm1])
forward_lstm2 = LSTM(output_dim=64,
return_sequences=True,
activation="tanh"
)(blstm_output1)
backward_lstm2 = LSTM(output_dim=64,
return_sequences=True,
activation="tanh",
go_backwards=True
)(blstm_output1)
blstm_output2 = Merge(mode='concat')([forward_lstm2, backward_lstm2])
hidden = TimeDistributed(Dense(512, activation='tanh'))(blstm_output2)
output = TimeDistributed(Dense(nb_class + 1, activation='softmax')) (hidden)
labels = Input(name='the_labels', shape=[1], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name="ctc")([output, labels, input_length, label_length])
model = Model(input=[net_input, labels, input_length, label_length], output=[loss_out])
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer, metrics=[])
test_func = K.function([net_input], [output])
return model, test_func
model, test_func = build_model(nb_feat=nb_feat, nb_class=nb_class, optimizer=optimizer)
for epoch in range(number_epoches):
inputs_train = {'the_input': X_train[i:i+batch_size],
'the_labels': y_train[i:i+batch_size],
'input_length': np.sum(X_train_mask[i:i+batch_size], axis=1, dtype=np.int32),
'label_length': np.squeeze(y_train_mask[i:i+batch_size]),
}
outputs_train = {'ctc': np.zeros([inputs_train["the_labels"].shape[0]])}
ctcloss = model.train_on_batch(x=inputs_train, y=outputs_train)
total_ctcloss += ctcloss * inputs_train["the_input"].shape[0] * 1.
loss_train[epoch] = total_ctcloss / X_train.shape[0]
Here is the my model summary
Try the following solution:
import keras.backend as K
def get_prediction_function(model):
input_tensor = model.layers[0].input
output_tensor = model.layers[-5].output
net_function = K.function([input_tensor, K.learning_phase()], [output_tensor])
def _result_function(x):
return net_function([x, 0])[0]
return _result_function
Now your network function might be obtained by:
test_function = get_prediction_function(model)

Categories

Resources