Tensorflow gradients are None ('No gradients provided for any variable') - python

When I run the following script I get the error message 'No gradients provided for any variable'. The 'grads' variable is a list of 'None' values. What could be going wrong in such a simple script?
import tensorflow as tf
import numpy as np

tf.enable_eager_execution()

class Model(tf.keras.Model):
    def __init__(self):
        super(Model, self).__init__()
        self.layer = tf.keras.layers.Dense(4, activation="linear")

    def call(self, x):
        y = self.layer(x)
        return y

model = Model()
model._set_inputs(tf.zeros((1, 5)))
optimizer = tf.train.GradientDescentOptimizer(0.5)

# gibberish data
x_train = np.array([[0, 0, 0, 0, 1]], dtype=np.float32)
y_train = np.array([[0.1, 0.1, 0.4, 0.4]])

y_pred = model.call(x_train)

with tf.GradientTape() as tape:
    loss = tf.losses.mean_squared_error(y_train, y_pred)

grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))

The model prediction line y_pred = model.call(x_train) has to be inside the with tf.GradientTape() as tape: scope; otherwise the forward pass is never recorded on the tape, so tape.gradient returns None for every variable.
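As a minimal sketch of the corrected step (reusing the model, optimizer, and data from the question), the forward pass simply moves inside the tape:

with tf.GradientTape() as tape:
    y_pred = model(x_train)  # forward pass is now recorded on the tape
    loss = tf.losses.mean_squared_error(y_train, y_pred)

grads = tape.gradient(loss, model.trainable_variables)  # no longer a list of None
optimizer.apply_gradients(zip(grads, model.trainable_variables))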


Slicing Error when using custom cost function in Keras

I have example data that is used to set the predictor (x) and response (y) variables: x has an Age_Years feature, and y has Age_Years and target features.
I have a custom cost function that accepts multiple parameters:
# this gives the y_pred values
def calc_prob(param1, param2, param3, age):
    prob = (((100*param1*pow((100/param3), -(pow((age/param2), param1))))*pow(age/param2, param1)*math.log(100/param3))/age)/100
    return prob

# this serves as the custom cost function
def brier_score(y_pred, y_true):
    prob = calc_prob(y_pred[:, 0], y_pred[:, 1], y_pred[:, 2], y_true['Age_Years'])
    brier_score = tf.reduce_mean((prob - y_true['target']) ** 2, axis=1)
    return brier_score
Ultimately, I want an output that builds a 3-parameter model that minimizes the brier_score() function. If I try to build and run the model, I get an error:
from keras import models
from keras import layers

def build_model():
    model = models.Sequential()
    model.add(layers.Dense(1, activation='relu', input_shape=(x.shape[1],)))
    model.add(layers.Dense(5, activation='relu'))
    model.add(layers.Dense(3, activation='softmax'))
    model.compile(loss=losses, optimizer='adam', metrics=['accuracy'])
    return model

model = build_model()
losses = [brier_score]
model.compile(loss=losses, optimizer='adam', metrics=['accuracy'])

# this line causes the error
model.fit(x=x, y=y, epochs=100, verbose=0)
ValueError: slice index 2 of dimension 1 out of bounds. for '{{node brier_score/strided_slice_2}} = StridedSlice[Index=DT_INT32, T=DT_FLOAT, begin_mask=1, ellipsis_mask=0, end_mask=1, new_axis_mask=0, shrink_axis_mask=2](Cast, brier_score/strided_slice_2/stack, brier_score/strided_slice_2/stack_1, brier_score/strided_slice_2/stack_2)' with input shapes: [?,2], [2], [2], [2] and with computed input tensors: input[1] = <0 2>, input[2] = <0 3>, input[3] = <1 1>.
You are mixing up the order of y_true and y_pred in brier_score. Here is a working example:
import tensorflow as tf
from keras import models
from keras import layers
import pandas as pd
import numpy as np

x = np.random.random((500, 1))
y = {'Age_Years': np.squeeze(x, axis=1), 'target': np.random.randint(2, size=500)}

def calc_prob(param1, param2, param3, age):
    prob = (((100*param1*tf.math.pow((100/param3), -(tf.math.pow((age/param2), param1))))*tf.math.pow(age/param2, param1)*tf.math.log(100/param3))/age)/100
    return prob

# this serves as the custom cost function
def brier_score(y_true, y_pred):
    prob = calc_prob(y_pred[:, 0], y_pred[:, 1], y_pred[:, 2], tf.cast(y_true['Age_Years'], dtype=tf.float32))
    brier_score = tf.reduce_mean((prob - tf.cast(y_true['target'], dtype=tf.float32)) ** 2, axis=-1, keepdims=True)
    return brier_score

def build_model():
    model = models.Sequential()
    model.add(layers.Dense(1, activation='relu', input_shape=(x.shape[1],)))
    model.add(layers.Dense(5, activation='relu'))
    model.add(layers.Dense(3, activation='softmax'))
    return model

model = build_model()
optimizer = tf.keras.optimizers.Adam()
batch_size = 10
dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size)

epochs = 2
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    for step, (x_batch_train, y_batch_train) in enumerate(dataset):
        with tf.GradientTape() as tape:
            logits = model(x_batch_train, training=True)
            loss_value = brier_score(y_batch_train, logits)
        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
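To see why the swapped signature produces the strided-slice error in the first place (a small illustration, not part of the original answer): Keras calls a custom loss as loss_fn(y_true, y_pred), so with the arguments reversed the two-column y_true gets sliced at index 2, which is out of bounds.

import numpy as np
import tensorflow as tf

y_true_batch = tf.constant(np.random.random((4, 2)), dtype=tf.float32)  # only 2 columns
try:
    y_true_batch[:, 2]  # what the swapped brier_score effectively does
except Exception as e:
    print(e)  # slice index 2 of dimension 1 out of bounds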

How do I add noise to the weights when calculating the loss with Keras?

I am new to Keras and am trying to customize my training step in Keras.
Questions:
How do I create the new variable weights_right using weights_right = weights - (lr + alpha) * gradients in Keras when customizing a training loop?
How do I feed the NN forward with weights as formal parameters? Could I customize the forward function in Keras like the code below?
Background:
In the stochastic gradient descent algorithm, after feeding a mini-batch forward and getting the gradients on that mini-batch, I would like to perturb the weights and create a new variable weights_right = weights - (lr + alpha) * gradients (alpha is a constant), and then feed the NN forward with weights_right to get the new loss.
The relevant parts of my Python code are the following:
class Network(object):

    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a, weights=None, biases=None):
        """Return the output of the network if ``a`` is input."""
        if weights is None:
            weights = self.weights
        if biases is None:
            biases = self.biases
        # !!! Note the output layer has no activation for regression.
        for b, w in zip(biases[:-1], weights[:-1]):
            a = sigmoid(np.dot(w, a) + b)
        a = np.dot(weights[-1], a) + biases[-1]
        return a

# -----------------------------------
# The following is the important one.
# -----------------------------------
def customSGD():
    for epoch in range(epochs):
        random.shuffle(training_data)
        mini_batches = [training_data[k:k+mini_batch_size]
                        for k in range(0, len(training_data), mini_batch_size)]
        for mini_batch in mini_batches:
            gradients_on_mini_batch = get_gradients(mini_batch)
            # ---------------------------------------
            # The following two steps are what
            # I would like to achieve in Keras
            # ---------------------------------------
            # create new variable called weights_right
            weights_right = weights - (lr + alpha) * gradients_on_mini_batch
            # feed the NN with weights_right; note that the params
            # in the current NN are still weights, not weights_right.
            pred_right = feedforward(training_data, weights_right)
            loss_right = loss_func(pred_right, training_labels)
            ......
            # update weights
            weights = weights - lr * gradients_on_mini_batch
The code above is mostly adapted from Michael Nielsen's online book.
Any help would be appreciated. Thank you so much!
In a custom training loop, you can do whatever you like with the gradients and weights.
@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        logits = model(inputs)
        loss = loss_object(labels, logits)
    weights = model.trainable_variables
    # add manipulation of weights here
    gradients = tape.gradient(loss, weights)
    opt.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_acc(labels, logits)
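The comment "add manipulation of weights here" is where the perturbation from the question would go. Below is one possible sketch of it (not part of the original answer; alpha and lr are assumed constants, and the original weights are saved and restored around the extra forward pass with weights_right):

alpha = 0.01  # assumed constant from the question
lr = 1e-3     # assumed learning rate

def perturbed_loss(inputs, labels):
    # gradients of the loss at the current weights
    with tf.GradientTape() as tape:
        logits = model(inputs)
        loss = loss_object(labels, logits)
    weights = model.trainable_variables
    gradients = tape.gradient(loss, weights)

    # weights_right = weights - (lr + alpha) * gradients
    originals = [tf.identity(w) for w in weights]
    for w, g in zip(weights, gradients):
        w.assign_sub((lr + alpha) * g)

    # forward pass with the perturbed weights
    loss_right = loss_object(labels, model(inputs))

    # restore the original weights before the real update
    for w, original in zip(weights, originals):
        w.assign(original)
    return loss_right, gradients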
Here's the full running example:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)

dataset = tf.data.Dataset.from_tensor_slices((X, y)).shuffle(150)

train_dataset = dataset.take(120).batch(4)
test_dataset = dataset.skip(120).take(30).batch(4)

class DenseModel(Model):
    def __init__(self):
        super(DenseModel, self).__init__()
        self.dens1 = Dense(8, activation='elu')
        self.dens2 = Dense(16, activation='relu')
        self.dens3 = Dense(3)

    def call(self, inputs, training=None, **kwargs):
        x = self.dens1(inputs)
        x = self.dens2(x)
        x = self.dens3(x)
        return x

model = DenseModel()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

train_loss = tf.keras.metrics.Mean()
test_loss = tf.keras.metrics.Mean()

train_acc = tf.keras.metrics.SparseCategoricalAccuracy()
test_acc = tf.keras.metrics.SparseCategoricalAccuracy()

opt = tf.keras.optimizers.Adam(learning_rate=1e-3)

@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        logits = model(inputs)
        loss = loss_object(labels, logits)
    weights = model.trainable_variables
    # add manipulation of weights here
    gradients = tape.gradient(loss, weights)
    opt.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_acc(labels, logits)

@tf.function
def test_step(inputs, labels):
    logits = model(inputs)
    loss = loss_object(labels, logits)
    test_loss(loss)
    test_acc(labels, logits)

for epoch in range(10):
    template = 'Epoch {:>2} Train Loss {:.3f} Test Loss {:.3f} ' \
               'Train Acc {:.2f} Test Acc {:.2f}'

    train_loss.reset_states()
    test_loss.reset_states()
    train_acc.reset_states()
    test_acc.reset_states()

    for X_train, y_train in train_dataset:
        train_step(X_train, y_train)

    for X_test, y_test in test_dataset:
        test_step(X_test, y_test)

    print(template.format(
        epoch + 1,
        train_loss.result(),
        test_loss.result(),
        train_acc.result(),
        test_acc.result()
    ))

Weighted categorical cross entropy

I'm trying to build an NLP classifier on top of BERT, but I'm struggling with data imbalance. I'm looking for an implementation of a weighted CategoricalCrossentropy. I've already seen a solution using the class_weight parameter of the fit function, but it doesn't "fit" well with my data (I've one-hot encoded it, and it actually throws an error because the dict elements don't match).
Can someone please give me an implementation from scratch of a WeightedCategoricalCrossEntropy function, allowing me to add weights manually to TensorFlow's native CategoricalCrossentropy?
The __call__ method of tf.losses.CategoricalCrossentropy accepts three arguments:
y_true
y_pred
sample_weight
And the sample_weight acts as a coefficient for the loss. If a scalar is provided, then the loss is simply scaled by the given value. If sample_weight is a tensor of size [batch_size], then the total loss for each sample of the batch is rescaled by the corresponding element in the sample_weight vector. You can use it as such:
def compute_loss(model, x, y, training):
    out = model(inputs=x, training=training)
    sample_weight = tf.random.uniform((tf.shape(x)[0], 1),
                                      minval=0,
                                      maxval=1,
                                      dtype=tf.float32)
    loss = loss_object(y_true=y, y_pred=out,
                       sample_weight=sample_weight)
    return loss
These are random values but you can change the values depending on y so it becomes a class weight rather than a sample weight. Here's a full example of a running training loop with custom sample weights:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow import keras as K
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPooling2D, Dropout
from tensorflow import nn as nn
from functools import partial

(xtrain, ytrain), (xtest, ytest) = tf.keras.datasets.mnist.load_data()

train = tf.data.Dataset.from_tensor_slices((xtrain, ytrain))
test = tf.data.Dataset.from_tensor_slices((xtest, ytest))

def prepare(inputs, outputs):
    inputs = tf.divide(x=inputs, y=255)
    inputs = tf.expand_dims(inputs, -1)
    targets = tf.one_hot(indices=outputs, depth=10)
    return inputs, targets

train = train.take(5_000).batch(4).map(prepare)
test = test.take(1_000).batch(4).map(prepare)

class MyCNN(K.Model):
    def __init__(self):
        super(MyCNN, self).__init__()
        Conv = partial(Conv2D, kernel_size=(3, 3), activation=nn.relu)
        MaxPool = partial(MaxPooling2D, pool_size=(2, 2))

        self.conv1 = Conv(filters=8)
        self.maxp1 = MaxPool()
        self.conv2 = Conv(filters=16)
        self.maxp2 = MaxPool()
        self.conv3 = Conv(filters=32)
        self.maxp3 = MaxPool()
        self.flatt = Flatten()
        self.dens1 = Dense(64, activation=nn.relu)
        self.drop1 = Dropout(.5)
        self.dens2 = Dense(10, activation=nn.softmax)

    def call(self, x, training=None, **kwargs):
        x = self.conv1(x)
        x = self.maxp1(x)
        x = self.conv2(x)
        x = self.maxp2(x)
        x = self.conv3(x)
        x = self.maxp3(x)
        x = self.flatt(x)
        x = self.dens1(x)
        x = self.drop1(x)
        x = self.dens2(x)
        return x

model = MyCNN()

loss_object = tf.losses.CategoricalCrossentropy()

def compute_loss(model, x, y, training):
    out = model(inputs=x, training=training)
    sample_weight = tf.random.uniform((tf.shape(x)[0], 1),
                                      minval=0,
                                      maxval=1,
                                      dtype=tf.float32)
    loss = loss_object(y_true=y, y_pred=out, sample_weight=sample_weight)
    return loss

def get_grad(model, x, y):
    with tf.GradientTape() as tape:
        loss = compute_loss(model, x, y, training=False)
    return loss, tape.gradient(loss, model.trainable_variables)

optimizer = tf.optimizers.Adam()

verbose = "Epoch {:2d}" \
          " Loss: {:.3f} TLoss: {:.3f} Acc: {:.3%} TAcc: {:.3%}"

for epoch in range(1, 10 + 1):
    train_loss = tf.metrics.Mean()
    train_acc = tf.metrics.CategoricalAccuracy()
    test_loss = tf.metrics.Mean()
    test_acc = tf.metrics.CategoricalAccuracy()

    for x, y in train:
        loss_value, grads = get_grad(model, x, y)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        train_loss.update_state(loss_value)
        train_acc.update_state(y, model(x, training=True))

    for x, y in test:
        loss_value, _ = get_grad(model, x, y)
        test_loss.update_state(loss_value)
        test_acc.update_state(y, model(x, training=False))
    print(verbose.format(epoch,
                         train_loss.result(),
                         test_loss.result(),
                         train_acc.result(),
                         test_acc.result()))
Just to complement the answer: to go from a sample weight to a class weight, you can do something like this.
First, an example without weights:
y_true = [[0, 1, 0], [0, 0, 1]]
y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]
# Using 'auto'/'sum_over_batch_size' reduction type.
cce = tf.keras.losses.CategoricalCrossentropy()
cce(y_true, y_pred).numpy()
Now with the class-weight implementation:
y_true = [[0, 1, 0], [0, 0, 1]]
y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]

select_one_sum_indexes = tf.where(tf.equal(y_true, 1))
Class_index = tf.gather(select_one_sum_indexes, 1, axis=1)
Class_index = tf.cast(Class_index, tf.int32)
Define your class weights here:
weight = tf.constant([1.2, 1., 10.])

Dict mapping each class index to its weight:
table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant([0, 1, 2]),
        values=weight
    ), default_value=1.)
weight_sample_class = table.lookup(Class_index)

Loss with the class weights:
cce = tf.keras.losses.CategoricalCrossentropy()
cce(y_true, y_pred, sample_weight=weight_sample_class).numpy()
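To wrap the same idea into a reusable loss for model.compile (a hypothetical sketch, not from either answer: a fixed per-class weight vector is picked out by the one-hot y_true and passed on as sample_weight):

def weighted_categorical_crossentropy(class_weights):
    class_weights = tf.constant(class_weights, dtype=tf.float32)
    cce = tf.keras.losses.CategoricalCrossentropy()

    def loss(y_true, y_pred):
        # the one-hot y_true selects the weight of the true class for each sample
        sample_weight = tf.reduce_sum(tf.cast(y_true, tf.float32) * class_weights, axis=-1)
        return cce(y_true, y_pred, sample_weight=sample_weight)

    return loss

# e.g. model.compile(loss=weighted_categorical_crossentropy([1.2, 1., 10.]), optimizer='adam')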

Difference about "BinaryCrossentropy" and "binary_crossentropy" in tf.keras.losses?

I'm training a model in TensorFlow 2.0 using tf.GradientTape(), but I find that the model's accuracy is 95% if I use tf.keras.losses.BinaryCrossentropy, and degrades to 75% if I use tf.keras.losses.binary_crossentropy. So I'm confused about the difference between what should be the same loss here.
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

def read_data():
    red_wine = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", sep=";")
    white_wine = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep=";")
    red_wine["type"] = 1
    white_wine["type"] = 0
    wines = red_wine.append(white_wine)
    return wines

def get_x_y(df):
    x = df.iloc[:, :-1].values.astype(np.float32)
    y = df.iloc[:, -1].values.astype(np.int32)
    return x, y

def build_model():
    inputs = layers.Input(shape=(12,))
    dense1 = layers.Dense(12, activation="relu", name="dense1")(inputs)
    dense2 = layers.Dense(9, activation="relu", name="dense2")(dense1)
    outputs = layers.Dense(1, activation="sigmoid", name="outputs")(dense2)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

def generate_dataset(df, batch_size=32, shuffle=True, train_or_test="train"):
    x, y = get_x_y(df)
    ds = tf.data.Dataset.from_tensor_slices((x, y))
    if shuffle:
        ds = ds.shuffle(10000)
    if train_or_test == "train":
        ds = ds.batch(batch_size)
    else:
        ds = ds.batch(len(df))
    return ds

# loss_object = tf.keras.losses.binary_crossentropy
loss_object = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

def train_step(model, optimizer, x, y):
    with tf.GradientTape() as tape:
        pred = model(x, training=True)
        loss = loss_object(y, pred)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

def train_model(model, train_ds, epochs=10):
    for epoch in range(epochs):
        print(epoch)
        for x, y in train_ds:
            train_step(model, optimizer, x, y)

def main():
    data = read_data()
    train, test = train_test_split(data, test_size=0.2, random_state=23)
    train_ds = generate_dataset(train, 32, True, "train")
    test_ds = generate_dataset(test, 32, False, "test")
    model = build_model()
    train_model(model, train_ds, 10)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.evaluate(test_ds)

main()
They should indeed work the same; BinaryCrossentropy uses binary_crossentropy under the hood, and the difference shows up in the docstring descriptions: the former is intended for two class labels, whereas the latter supports an arbitrary class count. However, if targets are passed in the expected format, both apply the same preprocessing before calling the backend's binary_crossentropy, which does the actual computation.
The difference you observe is likely a reproducibility issue; ensure you set the random seed - see function below. For a more complete answer on reproducibility, see here.
Function
def reset_seeds(reset_graph_with_backend=None):
    if reset_graph_with_backend is not None:
        K = reset_graph_with_backend
        K.clear_session()
        tf.compat.v1.reset_default_graph()
        print("KERAS AND TENSORFLOW GRAPHS RESET")  # optional

    np.random.seed(1)
    random.seed(2)
    tf.compat.v1.set_random_seed(3)
    print("RANDOM SEEDS RESET")  # optional
Usage:
import random
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K

reset_seeds(K)
Thanks, I found the reasons for the inconsistent accuracy:
The shape of the model's outputs is (None, 1), but the label fed in has shape (None,), which silently gives the wrong result through broadcasting.
In the source code of tf.keras.losses.BinaryCrossentropy(), while calculating the loss, both y_pred and y_true are processed through a function called squeeze_or_expand_dimensions, which is missing from tf.keras.losses.binary_crossentropy.
Note: take care that the shapes of the labels and the model outputs are consistent.
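A minimal illustration of the first point (a sketch, not from the original post): with a (batch,) label against a (batch, 1) prediction, the function variant silently broadcasts to (batch, batch) before averaging, while aligning the shapes gives the intended per-sample losses.

import tensorflow as tf

y_true = tf.constant([0., 1., 1.])           # shape (3,)
y_pred = tf.constant([[0.1], [0.8], [0.9]])  # shape (3, 1)

# label broadcast against prediction: every label is paired with every prediction
print(tf.keras.losses.binary_crossentropy(y_true, y_pred).numpy())

# shapes aligned: the per-sample losses you actually want
print(tf.keras.losses.binary_crossentropy(y_true[:, None], y_pred).numpy())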

Computing gradients wrt model inputs in Tensorflow eager mode

I am interested in calculating gradients wrt the inputs of a Keras model in TensorFlow. I understand that previously this could be done by building a graph and using tf.gradients, for example here. However, I would like to achieve this while experimenting in eager mode (possibly using GradientTape). Specifically, if my network has two inputs (x, y) and predicts (u, v, p), I want to calculate e.g. du/dx for use in the loss.
Snippet below; full code at this gist.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(20, activation=tf.nn.relu, input_shape=(2,)),  # input shape required
    tf.keras.layers.Dense(20, activation=tf.nn.relu),
    tf.keras.layers.Dense(20, activation=tf.nn.relu),
    tf.keras.layers.Dense(20, activation=tf.nn.relu),
    tf.keras.layers.Dense(3)
])

def loss(model: tf.keras.Model, inputs, outputs):
    u_true, v_true = outputs[:, 0], outputs[:, 1]

    prediction = model(inputs)
    u_pred, v_pred = prediction[:, 0], prediction[:, 1]

    loss_value = tf.reduce_mean(tf.square(u_true - u_pred)) + \
                 tf.reduce_mean(tf.square(v_true - v_pred))
    return loss_value, u_pred, v_pred

def grad(model: tf.keras.Model, inputs, outputs):
    """
    :param inputs: (batch_size, 2) -> x, y
    :param outputs: (batch_size, 3) -> vx, vy, p
    :return:
    """
    with tf.GradientTape() as tape:
        loss_value, u_pred, v_pred = loss(model, inputs, outputs)
        # AttributeError: 'DeferredTensor' object has no attribute '_id'
        print(tape.gradient(u_pred, model.input))
    grads = tape.gradient(loss_value, model.trainable_variables)
    return loss_value, grads
I've tried a few things, e.g. tape.gradient(u_pred, model.input) or tape.gradient(model.output, model.input) but these throw:
AttributeError: 'DeferredTensor' object has no attribute '_id'
Is there a way to achieve this within eager mode and if so how?
Here is an example of retrieving the gradients of the predictions with respect to the inputs using eager execution.
Basically, you need to use tape.watch(inputs) (I am using features in my example, i.e. whatever you call your x) for TensorFlow to record the change in the model output (you can do the same with the loss) with respect to the inputs, and make sure to call tape.gradient outside of the with tf.GradientTape() context.
Look at the get_gradients function below ...
Hope this helps!
model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation=tf.nn.relu, input_shape=(len(numeric_headers),)),  # input shape required
    tf.keras.layers.Dense(10, activation=tf.nn.relu),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
])

# model = MyModel()

loss_object = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam()

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

def get_gradients(model, features):
    with tf.GradientTape() as tape:
        tape.watch(features)
        predictions = model(features)
    gradients = tape.gradient(predictions, features)
    return gradients

def train_step(features, label):
    with tf.GradientTape() as tape:
        predictions = model(features)
        loss = loss_object(label, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(label, predictions)

def test_step(features, label):
    predictions = model(features)
    t_loss = loss_object(label, predictions)

    test_loss(t_loss)
    test_accuracy(label, predictions)

EPOCHS = 5

for epoch in range(EPOCHS):
    for features, labels in train_ds:
        train_step(features, labels)

    for features, labels in train_ds:
        test_step(features, labels)

    template = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
    print(template.format(epoch + 1,
                          train_loss.result(),
                          train_accuracy.result() * 100,
                          test_loss.result(),
                          test_accuracy.result() * 100))

    if epoch == EPOCHS - 1:
        for features, labels in train_ds:
            print('-')
            print(get_gradients(model, features))
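For the original (x, y) -> (u, v, p) setup, the same pattern gives du/dx directly. Below is a hypothetical sketch with a stand-in model (the tape is made persistent only so it could be queried more than once):

import tensorflow as tf

# stand-in model: inputs are (x, y), outputs are (u, v, p)
model = tf.keras.Sequential([
    tf.keras.layers.Dense(20, activation=tf.nn.relu, input_shape=(2,)),
    tf.keras.layers.Dense(3)
])

inputs = tf.random.normal((8, 2))  # columns: x, y

with tf.GradientTape(persistent=True) as tape:
    tape.watch(inputs)           # record changes with respect to the inputs
    pred = model(inputs)
    u_pred = pred[:, 0]

# d(u)/d(inputs) has shape (batch, 2): column 0 is du/dx, column 1 is du/dy
du_dinputs = tape.gradient(u_pred, inputs)
du_dx = du_dinputs[:, 0]
print(du_dx)
del tape  # release the persistent tape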
