I'm training a model in TensorFlow 2.0 using tf.GradientTape(), and I find that the model's accuracy is 95% if I use tf.keras.losses.BinaryCrossentropy, but it degrades to 75% if I use tf.keras.losses.binary_crossentropy. So I'm confused: what is the difference, given these are supposed to be the same loss?
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
def read_data():
red_wine = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", sep=";")
white_wine = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep=";")
red_wine["type"] = 1
white_wine["type"] = 0
wines = red_wine.append(white_wine)
return wines
def get_x_y(df):
x = df.iloc[:, :-1].values.astype(np.float32)
y = df.iloc[:, -1].values.astype(np.int32)
return x, y
def build_model():
inputs = layers.Input(shape=(12,))
dense1 = layers.Dense(12, activation="relu", name="dense1")(inputs)
dense2 = layers.Dense(9, activation="relu", name="dense2")(dense1)
outputs = layers.Dense(1, activation = "sigmoid", name="outputs")(dense2)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
return model
def generate_dataset(df, batch_size=32, shuffle=True, train_or_test = "train"):
x, y = get_x_y(df)
ds = tf.data.Dataset.from_tensor_slices((x, y))
if shuffle:
ds = ds.shuffle(10000)
if train_or_test == "train":
ds = ds.batch(batch_size)
else:
ds = ds.batch(len(df))
return ds
# loss_object = tf.keras.losses.binary_crossentropy
loss_object = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
def train_step(model, optimizer, x, y):
with tf.GradientTape() as tape:
pred = model(x, training=True)
loss = loss_object(y, pred)
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
def train_model(model, train_ds, epochs=10):
for epoch in range(epochs):
print(epoch)
for x, y in train_ds:
train_step(model, optimizer, x, y)
def main():
data = read_data()
train, test = train_test_split(data, test_size=0.2, random_state=23)
train_ds = generate_dataset(train, 32, True, "train")
test_ds = generate_dataset(test, 32, False, "test")
model = build_model()
train_model(model, train_ds, 10)
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy']
)
model.evaluate(test_ds)
main()
They should indeed work the same; BinaryCrossentropy uses binary_crossentropy, with the difference apparent in the docstring descriptions: the former is intended for two class labels, whereas the latter supports an arbitrary class count. However, if targets are passed in the expected format, both apply the same preprocessing before calling the backend's binary_crossentropy, which does the actual computation.
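As a quick illustrative check (my addition, not part of the original answer), the two do agree once y_true and y_pred have matching shapes:
import tensorflow as tf
y_true = tf.constant([[0.], [1.], [1.], [0.]])
y_pred = tf.constant([[0.1], [0.8], [0.6], [0.3]])
# the class reduces to a scalar mean; the function returns one value per sample
print(tf.keras.losses.BinaryCrossentropy()(y_true, y_pred).numpy())
print(tf.reduce_mean(tf.keras.losses.binary_crossentropy(y_true, y_pred)).numpy())
# both print the same mean binary cross-entropy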
The difference you observe is likely a reproducibility issue; ensure you set the random seeds - see the function below. For a more complete answer on reproducibility, see here.
Function
def reset_seeds(reset_graph_with_backend=None):
if reset_graph_with_backend is not None:
K = reset_graph_with_backend
K.clear_session()
tf.compat.v1.reset_default_graph()
print("KERAS AND TENSORFLOW GRAPHS RESET") # optional
np.random.seed(1)
random.seed(2)
tf.compat.v1.set_random_seed(3)
print("RANDOM SEEDS RESET") # optional
Usage:
import random            # used inside reset_seeds
import numpy as np       # used inside reset_seeds
import tensorflow as tf
import tensorflow.keras.backend as K
reset_seeds(K)
Thanks, I found the reasons for the inconsistent accuracy:
The shape of the model's outputs is (None, 1), but the labels fed in have shape (None,), which leads to an unintended broadcast when the loss is computed.
In the source code of tf.keras.losses.BinaryCrossentropy(), while calculating the loss, both y_pred and y_true are passed through a function called squeeze_or_expand_dimensions, which is missing from tf.keras.losses.binary_crossentropy.
Note: take care that the shapes of the labels and the model outputs are consistent; a sketch of one possible fix is below.
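Here is a minimal sketch of one possible fix (my addition, assuming the (None,) integer labels returned by get_x_y above): align the label shape with the (None, 1) sigmoid output before calling the functional loss, so no unintended broadcasting occurs.
loss_object = tf.keras.losses.binary_crossentropy

def train_step(model, optimizer, x, y):
    # expand (None,) labels to (None, 1) so they match the model output's shape
    y = tf.expand_dims(tf.cast(y, tf.float32), axis=-1)
    with tf.GradientTape() as tape:
        pred = model(x, training=True)                      # shape (None, 1)
        loss = tf.reduce_mean(loss_object(y, pred))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))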
Related
I am facing a weird problem. I am training my TF model using custom training loops. If I use only dense layers as my first layer, the model does not seem to train (I am using the flattened MNIST dataset for training). If I use a Flatten layer on top of my already flattened dataset, the model trains fine.
Note - The reason why I am using a Flatten layer on an already flattened dataset is to show that using only a dense layer as the first layer does not work. If I use a Conv2D layer on non-flattened data, the model also trains perfectly. The issue seems to lie with the dense layers for some reason.
I can't seem to find the issue.
Tensorflow version - 2.9.1
Python version - 3.8.6
Model that works
import tensorflow as tf
from tensorflow import keras

class CustomModel(keras.Model):
def __init__(self, num_classes, name = None):
super().__init__(name = name)
self._flatten = tf.keras.layers.Flatten()
self._dense1 = tf.keras.layers.Dense(64)
self._dense2 = tf.keras.layers.Dense(num_classes)
@tf.function
def call(self, X, training=False):
X = self._flatten(X)
X = tf.nn.relu(self._dense1(X))
return self._dense2(X)
Model that does not work
class CustomModel(keras.Model):
def __init__(self, num_classes, name = None):
super().__init__(name = name)
self._dense1 = tf.keras.layers.Dense(64)
self._dense2 = tf.keras.layers.Dense(num_classes)
@tf.function
def call(self, X, training=False):
X = tf.nn.relu(self._dense1(X))
return self._dense2(X)
Dataset Used
import tensorflow_datasets as tfds
(ds_train, ds_test), ds_info = tfds.load(
"mnist",
split = ["train", "test"],
shuffle_files = True,
as_supervised = True,
with_info = True
)
def normalize_img(image, label):
return tf.cast(image, tf.float32) / 255.0, label
def flatten_img(image, label):
return tf.reshape(image, [-1, 28 * 28]), label
AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 64
# # Train Dataset
ds_train = ds_train.map(normalize_img, num_parallel_calls = AUTOTUNE)
ds_train = ds_train.map(flatten_img, num_parallel_calls = AUTOTUNE)
ds_train = ds_train.cache()
ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples)
ds_train = ds_train.batch(BATCH_SIZE)
ds_train = ds_train.prefetch(AUTOTUNE)
# # Test Dataset
ds_test = ds_test.map(normalize_img, num_parallel_calls = AUTOTUNE)
ds_test = ds_test.map(flatten_img, num_parallel_calls = AUTOTUNE)
ds_test = ds_test.batch(BATCH_SIZE)
ds_test = ds_test.prefetch(AUTOTUNE)
Custom Training Loop
from tqdm import tqdm  # used for the progress bar in the training loop below

model = CustomModel(10)
num_epochs = 5
optimizer = keras.optimizers.Adam()
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
acc_metric = keras.metrics.SparseCategoricalAccuracy()
@tf.function
def train_epoch(x, y):
with tf.GradientTape() as tape:
y_pred = model(x, training = True)
loss = loss_fn(y, y_pred)
# Getting Gradients
gradients = tape.gradient(loss, model.trainable_weights)
# Back Prop
optimizer.apply_gradients(zip(gradients, model.trainable_weights))
acc_metric.update_state(y, y_pred)
return loss
# Training Loop
for epoch in range(num_epochs):
print(f"\nStart of Training Epoch {epoch + 1}")
for batch_idx, (x_batch, y_batch) in tqdm(enumerate(ds_train), total=len(ds_train)):
loss = train_epoch(x_batch, y_batch)
print(f"Accuracy :- {acc_metric.result()}, Loss :- {loss}")
acc_metric.reset_states()
Flatten Layer Model Stats
No Flatten Layer Model Stats
There are two errors in your flatten_img function. First of all, as Frightera pointed out, you don't need to divide by 255 again. Second, you have one dimension too many: flatten_img outputs a tensor with shape (1, 784) instead of (784,), so after batching each example still carries an extra axis. That's why the model needs a Flatten() layer in order to work. Replace the reshape with this and it will work just fine:
return tf.reshape(image, [28 * 28]), label
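As an illustrative shape check (my addition, not from the original answer), the extra leading axis is easy to see on a single example:
import tensorflow as tf
image = tf.zeros([28, 28, 1])              # one MNIST example, as yielded by tfds
bad = tf.reshape(image, [-1, 28 * 28])     # shape (1, 784)  -> batches to (batch, 1, 784)
good = tf.reshape(image, [28 * 28])        # shape (784,)    -> batches to (batch, 784)
print(bad.shape, good.shape)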
I am new to Keras and am trying to customize my training step in Keras.
Questions:
How do I create the new variable weights_right using weights_right = weights - (lr + alpha) * gradients in Keras when customizing the training loop?
How do I feed the NN forward with the weights as formal parameters? Could I customize the forward function in Keras like the code below?
Background:
In the stochastic gradient descent algorithm, after feeding a mini-batch of data forward and getting the gradients on this mini-batch, I would like to perturb the weights and create a new variable called weights_right, weights_right = weights - (lr + alpha) * gradients (alpha is a constant), and then feed the NN forward with weights_right to get the new loss.
Some parts of code in python are the following:
class Network(object):
def __init__(self, sizes):
self.num_layers = len(sizes)
self.sizes = sizes
self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]
self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
def feedforward(self, a, weights=None, biases=None):
"""Return the output of the network if ``a`` is input."""
if weights is None:
weights=self.weights
if biases is None:
biases=self.biases
#!!! Note the output layer has no activation for regression.
for b, w in zip(biases[:-1], weights[:-1]):
a = sigmoid(np.dot(w, a)+b)
a=np.dot(weights[-1],a)+biases[-1]
return a
#-----------------------------------
# The following is the important one.
#-----------------------------------
def customSGD():
for epoch in range(epochs):
random.shuffle(training_data)
mini_batches= [training_data[k:k+mini_batch_size] for k in range(0, len(training_data), mini_batch_size)]
for mini_batch in mini_batches:
gradients_on_mini_batch = get_gradients(mini_batch)
#---------------------------------------
# The following two steps are what
# I would like to archive in Keras
#---------------------------------------
# Creat new variable called weights_right
weights_right = weights-(lr+alpha)*gradients_on_mini_batch
# feed the NN with weights_right, note that the params
#in current NN are still weights, not weights_right.
pred_right = feedforward(training_data, weights_right)
loss_right = loss_func(pred_right, training_labels)
......
# update weights
weights = weights-lr*gradients_on_mini_batch
The code above is mainly from Michael Nielsen's online book.
Any help would be appreciated. Thank you so much!
In a custom training loop, you can do whatever you like with the gradients and weights.
@tf.function
def train_step(inputs, labels):
with tf.GradientTape() as tape:
logits = model(inputs)
loss = loss_object(labels, logits)
weights = model.trainable_variables
# add manipulation of weights here
gradients = tape.gradient(loss, weights)
opt.apply_gradients(zip(gradients, model.trainable_variables))
train_loss(loss)
train_acc(labels, logits)
Here's the full running example:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
dataset = tf.data.Dataset.from_tensor_slices((X, y)).shuffle(150)
train_dataset = dataset.take(120).batch(4)
test_dataset = dataset.skip(120).take(30).batch(4)
class DenseModel(Model):
def __init__(self):
super(DenseModel, self).__init__()
self.dens1 = Dense(8, activation='elu')
self.dens2 = Dense(16, activation='relu')
self.dens3 = Dense(3)
def call(self, inputs, training=None, **kwargs):
x = self.dens1(inputs)
x = self.dens2(x)
x = self.dens3(x)
return x
model = DenseModel()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
train_loss = tf.keras.metrics.Mean()
test_loss = tf.keras.metrics.Mean()
train_acc = tf.keras.metrics.SparseCategoricalAccuracy()
test_acc = tf.keras.metrics.SparseCategoricalAccuracy()
opt = tf.keras.optimizers.Adam(learning_rate=1e-3)
@tf.function
def train_step(inputs, labels):
with tf.GradientTape() as tape:
logits = model(inputs)
loss = loss_object(labels, logits)
weights = model.trainable_variables
# add manipulation of weights here
gradients = tape.gradient(loss, weights)
opt.apply_gradients(zip(gradients, model.trainable_variables))
train_loss(loss)
train_acc(labels, logits)
@tf.function
def test_step(inputs, labels):
logits = model(inputs)
loss = loss_object(labels, logits)
test_loss(loss)
test_acc(labels, logits)
for epoch in range(10):
template = 'Epoch {:>2} Train Loss {:.3f} Test Loss {:.3f} ' \
'Train Acc {:.2f} Test Acc {:.2f}'
train_loss.reset_states()
test_loss.reset_states()
train_acc.reset_states()
test_acc.reset_states()
for X_train, y_train in train_dataset:
train_step(X_train, y_train)
for X_test, y_test in test_dataset:
test_step(X_test, y_test)
print(template.format(
epoch + 1,
train_loss.result(),
test_loss.result(),
train_acc.result(),
test_acc.result()
))
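To make the "# add manipulation of weights here" step concrete for the question's weights_right = weights - (lr + alpha) * gradients idea, here is a hedged sketch (my extension of the example above; alpha and lr are placeholder values): it temporarily loads the perturbed weights, evaluates the loss with them, restores the originals, and then applies the usual update.
alpha = 0.01   # the constant from the question; the value here is arbitrary
lr = 1e-3      # matches the Adam learning rate used above

@tf.function
def train_step_perturbed(inputs, labels):
    weights = model.trainable_variables
    with tf.GradientTape() as tape:
        logits = model(inputs)
        loss = loss_object(labels, logits)
    gradients = tape.gradient(loss, weights)

    # weights_right = weights - (lr + alpha) * gradients
    originals = [tf.identity(w) for w in weights]        # snapshot of the current weights
    for w, g in zip(weights, gradients):
        w.assign(w - (lr + alpha) * g)                    # temporarily load weights_right
    loss_right = loss_object(labels, model(inputs))       # loss under the perturbed weights
    for w, original in zip(weights, originals):
        w.assign(original)                                # restore the unperturbed weights

    opt.apply_gradients(zip(gradients, weights))          # ordinary update with the original gradients
    return loss, loss_right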
I would like to know how to add metrics like accuracy and precision, save the model, and write TensorBoard logs for this logistic regression:
from tensorflow.keras.datasets import fashion_mnist
from sklearn.model_selection import train_test_split
import tensorflow as tf
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
x_train, x_test = x_train/255., x_test/255.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.15)
x_train = tf.reshape(x_train, shape=(-1, 784))
x_test = tf.reshape(x_test, shape=(-1, 784))
weights = tf.Variable(tf.random.normal(shape=(784, 10), dtype=tf.float64))
biases = tf.Variable(tf.random.normal(shape=(10,), dtype=tf.float64))
def logistic_regression(x):
lr = tf.add(tf.matmul(x, weights), biases)
#return tf.nn.sigmoid(lr)
return lr
def cross_entropy(y_true, y_pred):
y_true = tf.one_hot(y_true, 10)
loss = tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred)
return tf.reduce_mean(loss)
def accuracy(y_true, y_pred):
y_true = tf.cast(y_true, dtype=tf.int32)
preds = tf.cast(tf.argmax(y_pred, axis=1), dtype=tf.int32)
preds = tf.equal(y_true, preds)
return tf.reduce_mean(tf.cast(preds, dtype=tf.float32))
def grad(x, y):
with tf.GradientTape() as tape:
y_pred = logistic_regression(x)
loss_val = cross_entropy(y, y_pred)
return tape.gradient(loss_val, [weights, biases])
n_batches = 10000
learning_rate = 0.01
batch_size = 128
dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
dataset = dataset.repeat().shuffle(x_train.shape[0]).batch(batch_size)
optimizer = tf.optimizers.SGD(learning_rate)
for batch_numb, (batch_xs, batch_ys) in enumerate(dataset.take(n_batches), 1):
gradients = grad(batch_xs, batch_ys)
optimizer.apply_gradients(zip(gradients, [weights, biases]))
y_pred = logistic_regression(batch_xs)
loss = cross_entropy(batch_ys, y_pred)
acc = accuracy(batch_ys, y_pred)
print("Batch number: %i, loss: %f, accuracy: %f" % (batch_numb, loss, acc))
I'm new to TensorFlow and have only written logs in TensorFlow 1.x.
Now that with tf.Session() as sess is gone from TensorFlow, I get lost with the other ways of writing the code.
Your code will look something like this using plain TensorFlow 2:
Start with the model creation. Logistic regression can be seen as a single-layer perceptron with sigmoid activation, so we add an input layer with as many inputs as there are features, and one output unit with sigmoid activation per output class.
input = tf.keras.Input(shape=(nfeatures,))
output = tf.keras.layers.Dense(nclasses,activation='sigmoid')(input)
model = tf.keras.Model(inputs=input,outputs=output,name='MyLinearRegression')
Then we create the optimizer and the loss function:
opt = tf.keras.optimizers.Adadelta()
lss = tf.keras.losses.categorical_crossentropy
met = tf.keras.metrics.Accuracy()
You have to use categorical_crossentropy or sparse_categorical_crossentropy depending on the labels (one-hot encoded or not). For this loss you may want to change the activation to softmax.
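For instance, since the Fashion-MNIST labels in the question are plain integers rather than one-hot vectors, one option (a sketch, my addition) is the sparse loss with a softmax output:
output = tf.keras.layers.Dense(nclasses, activation='softmax')(input)
model = tf.keras.Model(inputs=input, outputs=output, name='MyLogisticRegression')
lss = tf.keras.losses.sparse_categorical_crossentropy
met = tf.keras.metrics.SparseCategoricalAccuracy()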
Now we can "compile" the model this way:
model.compile(optimizer=opt, loss=lss, metrics=[met])
model.summary()
So now we can create the TensorBoard callback:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir,write_graph=True,update_freq='batch')
And then train like this:
model.fit(train,epochs=100,callbacks=[tensorboard_callback],validation_data = val)
If your dataset is a numpy dataset you can create a TF dataset like this:
dataset = tf.data.Dataset.from_tensor_slices((features, labels))
train = dataset.take(train_size).batch(batchsize)
rest = dataset.skip(train_size)                 # everything after the training examples
test = rest.take(test_size).batch(batchsize)
val = rest.skip(test_size).batch(batchsize)
Where train is your train dataset, val the validation one and test the test dataset.
First of all you have to create a callback function to update Tensorboard:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir,write_graph=True,update_freq='batch')
Where logdir is a string pointing to an existing directory.
then you can compile your model like this:
model.compile(optimizer=opt, loss=lss, metrics=[met])
Where obviously opt is your optimizer, lss your loss function and optionally met your metrics.
Now you can train the model like this:
model.fit(train,epochs=100,callbacks=[tensorboard_callback],validation_data = val)
model.fit will write the logs to logdir; run tensorboard --logdir pointing at that directory, then open the address shown in the terminal in your browser to see the dashboards.
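Since the question also asks about precision and about saving the model, here is a hedged sketch (my addition, assuming one-hot encoded labels and the probability output above) of how those pieces could be added:
met = ['accuracy', tf.keras.metrics.Precision(name='precision')]
model.compile(optimizer=opt, loss=lss, metrics=met)
model.fit(train, epochs=100, callbacks=[tensorboard_callback], validation_data=val)
model.save('logistic_regression_model')   # SavedModel directory; the metrics also show up in TensorBoard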
I see that you are still using the "old" TensorFlow v1 way of compiling and creating the model. Using model.compile and model.fit is easier and faster (in my opinion) if you aren't using particular training methods. (You can't create GANs this way, for example.)
I am trying to calculate the recall in both binary and multi-class (one-hot encoded) classification scenarios for each class after each epoch in a model that uses TensorFlow 2's Keras API. E.g. for binary classification I'd like to be able to do something like
import tensorflow as tf
model = tf.keras.Sequential()
model.add(...)
model.add(tf.keras.layers.Dense(1))
model.compile(metrics=[binary_recall(label=0), binary_recall(label=1)], ...)
history = model.fit(...)
plt.plot(history.history['binary_recall_0'])
plt.plot(history.history['binary_recall_1'])
plt.show()
or in a multi class scenario I'd like to do something like
model = tf.keras.Sequential()
model.add(...)
model.add(tf.keras.layers.Dense(3))
model.compile(metrics=[recall(label=0), recall(label=1), recall(label=2)], ...)
history = model.fit(...)
plt.plot(history.history['recall_0'])
plt.plot(history.history['recall_1'])
plt.plot(history.history['recall_2'])
plt.show()
I'm working on a classifier for an unbalanced dataset and want to be able to see at what point the recall for my minority class(es) starts to degrade.
I found an implementation of precision for a specific class in a multi-class classifier here: https://stackoverflow.com/a/41717938/373655. I am trying to adapt this into what I need, but keras.backend is still pretty foreign to me, so any help would be greatly appreciated.
I am also not clear on if I can use Keras metrics (as they are calculated at the end of each batch and then averaged) or if I need to use Keras callbacks (which can run at the end of each epoch). It seems to me like it shouldn't make a difference for recall (e.g. 8/10 == (3/5 + 5/5) / 2), but this is why recall was removed in Keras 2, so maybe I'm missing something (https://github.com/keras-team/keras/issues/5794).
Edit - partial solution (multi-class classification)
@mujjiga's solution works for both binary classification and multi-class classification, but as @P-Gn pointed out, TensorFlow 2's Recall metric supports this out of the box for multi-class classification, e.g.
from tensorflow.keras.metrics import Recall
model = ...
model.compile(loss='categorical_crossentropy', metrics=[
    Recall(class_id=0, name='recall_0'),
    Recall(class_id=1, name='recall_1'),
    Recall(class_id=2, name='recall_2'),
])
history = model.fit(...)
plt.plot(history.history['recall_2'])
plt.plot(history.history['val_recall_2'])
plt.show()
We can use sklearn's classification_report and a Keras Callback to achieve this.
Working code sample (with comments)
import tensorflow as tf
import keras
from tensorflow.python.keras.layers import Dense, Input
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.callbacks import Callback
from sklearn.metrics import recall_score, classification_report
from sklearn.datasets import make_classification
import numpy as np
import matplotlib.pyplot as plt
# Model -- Binary classifier
binary_model = Sequential()
binary_model.add(Dense(16, input_shape=(2,), activation='relu'))
binary_model.add(Dense(8, activation='relu'))
binary_model.add(Dense(1, activation='sigmoid'))
binary_model.compile('adam', loss='binary_crossentropy')
# Model -- Multiclass classifier
multiclass_model = Sequential()
multiclass_model.add(Dense(16, input_shape=(2,), activation='relu'))
multiclass_model.add(Dense(8, activation='relu'))
multiclass_model.add(Dense(3, activation='softmax'))
multiclass_model.compile('adam', loss='categorical_crossentropy')
# callback to find metrics at epoch end
class Metrics(Callback):
def __init__(self, x, y):
self.x = x
self.y = y if (y.ndim == 1 or y.shape[1] == 1) else np.argmax(y, axis=1)
self.reports = []
def on_epoch_end(self, epoch, logs={}):
y_hat = np.asarray(self.model.predict(self.x))
y_hat = np.where(y_hat > 0.5, 1, 0) if (y_hat.ndim == 1 or y_hat.shape[1] == 1) else np.argmax(y_hat, axis=1)
report = classification_report(self.y,y_hat,output_dict=True)
self.reports.append(report)
return
# Utility method
def get(self, metrics, of_class):
return [report[str(of_class)][metrics] for report in self.reports]
# Generate some train data (2 class) and train
x, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
random_state=1, n_clusters_per_class=1)
metrics_binary = Metrics(x,y)
binary_model.fit(x, y, epochs=30, callbacks=[metrics_binary])
# Generate some train data (3 class) and train
x, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
random_state=1, n_clusters_per_class=1, n_classes=3)
y = keras.utils.to_categorical(y,3)
metrics_multiclass = Metrics(x,y)
multiclass_model.fit(x, y, epochs=30, callbacks=[metrics_multiclass])
# Plotting
plt.close('all')
plt.plot(metrics_binary.get('recall',0), label='Class 0 recall')
plt.plot(metrics_binary.get('recall',1), label='Class 1 recall')
plt.plot(metrics_binary.get('precision',0), label='Class 0 precision')
plt.plot(metrics_binary.get('precision',1), label='Class 1 precision')
plt.plot(metrics_binary.get('f1-score',0), label='Class 0 f1-score')
plt.plot(metrics_binary.get('f1-score',1), label='Class 1 f1-score')
plt.legend(loc='lower right')
plt.show()
plt.close('all')
for m in ['recall', 'precision', 'f1-score']:
for c in [0,1,2]:
plt.plot(metrics_multiclass.get(m,c), label='Class {0} {1}'.format(c,m))
plt.legend(loc='lower right')
plt.show()
Output
Advantages:
classification_report provides lots of metrics
Metrics can be calculated on validation data or on training data by passing the corresponding arrays to the Metrics constructor (see the sketch below).
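For example (an illustrative sketch, not from the original answer), to compute the per-epoch report on a held-out validation split instead of the training data:
from sklearn.model_selection import train_test_split

# reusing the 3-class x, y generated above (y is already one-hot encoded)
x_tr, x_val, y_tr, y_val = train_test_split(x, y, test_size=0.2, random_state=1)
metrics_val = Metrics(x_val, y_val)        # the report is now computed on validation data
multiclass_model.fit(x_tr, y_tr, epochs=30, callbacks=[metrics_val])
plt.plot(metrics_val.get('recall', 0), label='Class 0 validation recall')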
In TF2, tf.keras.metrics.Recall gained a class_id argument that lets you do just that. Example using FashionMNIST:
import tensorflow as tf
(x_train, y_train), _ = tf.keras.datasets.fashion_mnist.load_data()
x_train = x_train[..., None].astype('float32') / 255
y_train = tf.keras.utils.to_categorical(y_train)
input_shape = x_train.shape[1:]
model = tf.keras.Sequential([
tf.keras.layers.Conv2D(filters=64, kernel_size=2, padding='same', activation='relu', input_shape=input_shape),
tf.keras.layers.MaxPool2D(pool_size=2),
tf.keras.layers.Dropout(0.3),
tf.keras.layers.Conv2D(filters=32, kernel_size=2, padding='same', activation='relu'),
tf.keras.layers.MaxPool2D(pool_size=2),
tf.keras.layers.Dropout(0.3),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(units=256, activation='relu'),
tf.keras.layers.Dropout(0.5),
tf.keras.layers.Dense(units=10, activation='softmax')])
model.compile(loss='categorical_crossentropy', optimizer='Adam',
metrics=[tf.keras.metrics.Recall(class_id=i) for i in range(10)])
model.fit(x_train, y_train, batch_size=128, epochs=50)
In TF 1.13, tf.keras.metrics.Recall does not have this class_id argument, but it can be added by subclassing (something that, somewhat surprisingly, seems impossible in the alpha release of TF2).
class Recall(tf.keras.metrics.Recall):
def __init__(self, *, class_id, **kwargs):
super().__init__(**kwargs)
self.class_id= class_id
def update_state(self, y_true, y_pred, sample_weight=None):
y_true = y_true[:, self.class_id]
y_pred = tf.cast(tf.equal(
tf.math.argmax(y_pred, axis=-1), self.class_id), dtype=tf.float32)
return super().update_state(y_true, y_pred, sample_weight)
There are multiple ways to do this, but using a callback seems the best and most Keras-like way of doing it. One side note before I show you how:
I am also not clear on if I can use Keras metrics (as they are
calculated at the end of each batch and then averaged) or if I need to
use Keras callbacks (which can run at the end of each epoch).
This is not true. Keras' callbacks can use the following methods:
on_epoch_begin: called at the beginning of every epoch.
on_epoch_end: called at the end of every epoch.
on_batch_begin: called at the beginning of every batch.
on_batch_end: called at the end of every batch.
on_train_begin: called at the beginning of model training.
on_train_end: called at the end of model training.
This is true regardless of whether you are using keras or tf.keras.
Below you can find my implementation of a custom callback.
class RecallHistory(keras.callbacks.Callback):
def on_train_begin(self, logs={}):
self.recall = {}
def on_epoch_end(self, epoch, logs={}):
# Compute and store recall for each class here.
self.recall[...] = 42
history = RecallHistory()
model.fit(..., callbacks=[history])
print(history.recall)
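One possible way to fill in the body (my sketch, not the answer's; it assumes a validation set is passed in explicitly and that keras is already imported) is to lean on sklearn's recall_score:
import numpy as np
from sklearn.metrics import recall_score

class RecallHistory(keras.callbacks.Callback):
    def __init__(self, x_val, y_val):
        super().__init__()
        self.x_val, self.y_val = x_val, y_val
        self.recall = {}

    def on_epoch_end(self, epoch, logs={}):
        probs = self.model.predict(self.x_val)
        # threshold a single sigmoid output, argmax a softmax output
        if probs.shape[-1] == 1:
            y_pred = (probs.ravel() > 0.5).astype(int)
        else:
            y_pred = np.argmax(probs, axis=-1)
        y_true = np.asarray(self.y_val)
        if y_true.ndim > 1 and y_true.shape[-1] > 1:
            y_true = np.argmax(y_true, axis=-1)   # undo one-hot encoding
        else:
            y_true = y_true.ravel()
        # recall_score(..., average=None) returns one recall value per class
        self.recall[epoch] = recall_score(y_true, y_pred, average=None)

history = RecallHistory(x_val, y_val)
model.fit(..., callbacks=[history])
print(history.recall)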
For several days now, I've been trying to replicate my Keras training results with PyTorch. Whatever I do, the PyTorch model overfits far earlier and more strongly on the validation set than in Keras. For PyTorch I use the same Xception code from https://github.com/Cadene/pretrained-models.pytorch.
The data loading, the augmentation, the validation, the training schedule etc. are equivalent. Am I missing something obvious? There must be a general problem somewhere. I tried thousands of different module constellations, but nothing seems to come even close to the Keras training. Can somebody help?
Keras model: val accuracy > 90%
from keras import applications
from keras.layers import GlobalMaxPooling2D, Dense, Dropout
from keras.models import Model

# base model
base_model = applications.Xception(weights='imagenet', include_top=False, input_shape=(img_width, img_height, 3))
# top model
x = base_model.output
x = GlobalMaxPooling2D()(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(4, activation='softmax')(x)
# this is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)
# Compile model
from keras import optimizers
adam = optimizers.Adam(lr=0.0001)
model.compile(loss='categorical_crossentropy',
optimizer=adam, metrics=['accuracy'])
# LROnPlateau etc. with equivalent settings as pytorch
Pytorch model: val accuracy ~81%
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.nn.functional as F
from xception import xception
# modified from https://github.com/Cadene/pretrained-models.pytorch
class XCeption(nn.Module):
def __init__(self, num_classes):
super(XCeption, self).__init__()
original_model = xception(pretrained="imagenet")
self.features=nn.Sequential(*list(original_model.children())[:-1])
self.last_linear = nn.Sequential(
nn.Linear(original_model.last_linear.in_features, 512),
nn.ReLU(),
nn.Dropout(p=0.5),
nn.Linear(512, num_classes)
)
def logits(self, features):
x = F.relu(features)
x = F.adaptive_max_pool2d(x, (1, 1))
x = x.view(x.size(0), -1)
x = self.last_linear(x)
return x
def forward(self, input):
x = self.features(input)
x = self.logits(x)
return x
device = torch.device("cuda")
model=XCeption(len(class_names))
if torch.cuda.device_count() > 1:
print("Let's use", torch.cuda.device_count(), "GPUs!")
# dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
model = nn.DataParallel(model)
model.to(device)
criterion = nn.CrossEntropyLoss(size_average=False)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5, cooldown=5)
Thank you very much!
Update:
Settings:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5, cooldown=5)
model = train_model(model, train_loader, val_loader,
criterion, optimizer, scheduler,
batch_size, trainmult=8, valmult=10,
num_epochs=200, epochs_top=0)
Cleaned training function:
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, batch_size, trainmult=1, valmult=1, num_epochs=None, epochs_top=0):
for epoch in range(num_epochs):
for phase in ['train', 'val']:
running_loss = 0.0
running_acc = 0
total = 0
# Iterate over data.
if phase=="train":
model.train(True) # Set model to training mode
for i in range(trainmult):
for data in train_loader:
# get the inputs
inputs, labels = data
inputs, labels = inputs.to(torch.device("cuda")), labels.to(torch.device("cuda"))
# zero the parameter gradients
optimizer.zero_grad()
# forward
outputs = model(inputs) # notinception
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
loss.backward()
optimizer.step()
# statistics
total += labels.size(0)
running_loss += loss.item()*labels.size(0)
running_acc += torch.sum(preds == labels)
train_loss=(running_loss/total)
train_acc=(running_acc.double()/total)
else:
model.train(False) # Set model to evaluate mode
with torch.no_grad():
for i in range(valmult):
for data in val_loader:
# get the inputs
inputs, labels = data
inputs, labels = inputs.to(torch.device("cuda")), labels.to(torch.device("cuda"))
# zero the parameter gradients
optimizer.zero_grad()
# forward
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels.data)
# statistics
total += labels.size(0)
running_loss += loss.item()*labels.size(0)
running_acc += torch.sum(preds == labels)
val_loss=(running_loss/total)
val_acc=(running_acc.double()/total)
scheduler.step(val_loss)
return model
It may be because of the type of weight initialization you are using; otherwise this should not happen. Try using the same initializer in both models.
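For context (my addition, not part of this answer): Keras Dense layers default to Glorot (Xavier) uniform initialization, while torch.nn.Linear uses a Kaiming-style uniform init, so one way to follow this suggestion is to re-initialize the new PyTorch head to match, e.g.:
import torch.nn as nn

def init_like_keras(m):
    # Glorot/Xavier uniform weights and zero biases, as Keras Dense uses by default
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)

model = XCeption(len(class_names))
model.last_linear.apply(init_like_keras)   # do this before DataParallel / .to(device)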
self.features=nn.Sequential(*list(original_model.children())[:-1])
Are you sure that this line re-instantiates your model in exactly the same way? You're using an nn.Sequential instead of the original Xception model's forward function. If there's anything in that forward function that isn't exactly the same as using an nn.Sequential, it will not reproduce the same performance.
Instead of wrapping it in a Sequential, you could just change the head directly:
# load the pretrained weights before you change the architecture
my_model = xception(pretrained="imagenet")
in_features = my_model.last_linear.in_features   # grab this before replacing the layer
# overwrite the original's last_linear with your own head
my_model.last_linear = nn.Sequential(
    nn.Linear(in_features, 512),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(512, num_classes)
)