I'm new to TensorFlow and I'm trying to rebuild a simple network that I've built in Keras (TF backend) with TensorFlow's Python API. It is a simple function approximator (z = sin(x + y)).
I've tried different architectures, optimizers and learning rates, but I can't get the new network to train properly. However, to my eye the networks seem to be identical. Both get the exact same feature vectors and labels:
import numpy as np

# making training data: all (x, y) pairs on a 1000x1000 grid over [0, 2*pi)
start = 0
end = 2 * np.pi
samp = 1000
num_samp = samp**2
step = end / samp
x_train = np.arange(start, end, step)
y_train = np.arange(start, end, step)
data = np.array(np.meshgrid(x_train, y_train)).T.reshape(-1, 2)
z_label = np.sin(data[:, 0] + data[:, 1])
Here is the Keras model:
from time import time
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint, TensorBoard

# start model
model = Sequential()
# stack layers
model.add(Dense(units=128, activation='sigmoid', input_dim=2, name='dense_1'))
model.add(Dense(units=64, activation='sigmoid', name='dense_2'))
model.add(Dense(units=1, activation='linear', name='output'))
# compile model
model.compile(loss='mean_squared_error',
              optimizer='sgd',
              metrics=['accuracy'])
checkpointer = ModelCheckpoint(filepath='./weights/weights.h5',
                               verbose=1, save_best_only=True)
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
model.fit(data, z_label, epochs=20, batch_size=32,
          shuffle=True, validation_data=(data_val, z_label_val),
          callbacks=[checkpointer, tensorboard])
Here is the new network, built with TensorFlow's Python API:
import tensorflow as tf

# hyperparameters
n_inputs = 2
n_hidden1 = 128
n_hidden2 = 64
n_outputs = 1
learning_rate = 0.01

# construction phase
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='input')
y = tf.placeholder(tf.float32, shape=(None), name="target")
hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1", activation=tf.nn.sigmoid)
hidden2 = tf.layers.dense(hidden1, n_hidden2, name="hidden2", activation=tf.nn.sigmoid)
logits = tf.layers.dense(hidden2, n_outputs, activation=None, name='output')  # None = linear; tf.layers.dense expects a callable, not the string 'linear'
loss = tf.reduce_mean(tf.square(logits - y), name='loss')
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(loss, name='train')
init = tf.global_variables_initializer()
saver = tf.train.Saver()
# --- execution phase ---
n_epochs = 40
batch_size = 32
n_batches = int(num_samp / batch_size)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        print("Epoch: ", epoch, " Running...")
        loss_arr = np.array([])
        for iteration in range(n_batches):
            start = iteration * batch_size
            end = start + batch_size
            sess.run(training_op, feed_dict={X: data[start:end], y: z_label[start:end]})
            loss_arr = np.append(loss_arr, loss.eval(feed_dict={X: data[start:end, :], y: z_label[start:end]}))
        mean_loss = np.mean(loss_arr)
        print("Epoch: ", epoch, " Calculated ==> Loss: ", mean_loss)
While the Keras model trains properly, with a decreasing loss and sensible test results, the new model converges quickly and then stops learning. Accordingly, the results are completely useless.
Am I building/training the model incorrectly, or is Keras doing something in the background that I'm not aware of?
Solved this issue. The problem was the shape of the label vector: it was a flat, rank-1 array with shape (1000000,). While Keras is apparently capable of dealing with different shapes of output and label vectors, in plain TensorFlow the loosely specified placeholder let the mismatch through, and the loss function
loss = tf.reduce_mean(tf.square(logits - y), name='loss')
no longer made sense: the subtraction logits - y broadcast the (batch_size, 1) output against the flat (batch_size,) labels into a (batch_size, batch_size) matrix, and thus training failed. Adding
z_label = z_label.reshape(-1,1)
reshaped the label vector to (1000000, 1) and solved it. Alternatively, one can specify the shape of the placeholder more precisely:
y = tf.placeholder(tf.float32, shape=(None,1), name="target")
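A quick NumPy sketch of the broadcasting pitfall (illustrative shapes only):
import numpy as np

logits = np.zeros((4, 1))  # network output: shape (batch, 1)
y = np.zeros(4)            # flat labels: shape (batch,)
print((logits - y).shape)  # (4, 4) -- broadcasting, not elementwise subtraction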
I'm trying to develop a small network for the aerial cactus identification challenge.
This is a binary classification challenge (0: no cactus, 1: cactus), but my network always outputs the same cost.
I built a simple network that works when implemented with Keras, but I'm trying to use a plain TensorFlow training loop for learning purposes and can't make it work.
My network architecture:
2 Conv 64x3 + Maxpooling
2 Conv 128x3 + Maxpooling
Flatten
Dense 1024
Dense 512
Dense 1
Here is my code:
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

def process_one_batch(x, y):
    inputs = Conv2D(64, 3, activation='relu')(x)
    inputs = Conv2D(64, 3, activation='relu')(inputs)
    inputs = MaxPooling2D(pool_size=2, strides=2)(inputs)
    inputs = Conv2D(128, 3, activation='relu')(inputs)
    inputs = Conv2D(128, 3, activation='relu')(inputs)
    inputs = MaxPooling2D(pool_size=2, strides=2)(inputs)
    flat = Flatten()(inputs)  # was Flatten()(block2); block2 was never defined
    dense1 = Dense(1024, activation="relu")(flat)
    dense2 = Dense(512, activation="relu")(dense1)
    dense2 = Dense(1, activation='sigmoid')(dense2)
    res = dense2
    return res
NB_EPOCHS = 5

def create_dataset(X, y, batch_size=BATCH_SIZE, nb_epochs=NB_EPOCHS, batch=True):
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    dataset = dataset.map(my_process_path)
    if batch:
        dataset = dataset.batch(batch_size)
    dataset = dataset.repeat(nb_epochs)
    dataset = dataset.prefetch(buffer_size=2)
    iterator = tf.data.make_one_shot_iterator(dataset)
    # iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()
    y_ = process_one_batch(next_element[0], next_element[1])
    return dataset, next_element, y_
train_ds, (train_x, train_y), prediction = create_dataset(X_train.values, y_train.values)
test_ds, (test_x, test_y), test_prediction = create_dataset(X_test.values, y_test.values, batch=True)

cross_entropy = tf.reduce_mean(tf.keras.losses.binary_crossentropy(
    train_y, tf.reshape(tf.transpose(prediction), [-1]), from_logits=True
))
optimiser = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cross_entropy)

init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    train_steps = int(len(X_train.values) / BATCH_SIZE)
    val_steps = int(len(X_test.values) / BATCH_SIZE)
    # initialise the variables
    sess.run(init_op)
    print('Init')
    for epoch in range(NB_EPOCHS):
        avg_cost = 0
        train_acc = 0
        for i in range(train_steps):
            _, c, ac = sess.run([optimiser, cross_entropy, accuracy_train])
            print(c)
            avg_cost += c  # was `avg_cost += 1`, which only counted steps
            train_acc += ac
        print('train_acc: ', train_acc / train_steps)
        print(train_steps, val_steps)
        print("Epoch:", (epoch + 1), "cost =", "{:.3f}".format(avg_cost / train_steps))
print("\nTraining complete!")
I have tried debugging the model input; it's not always the same, but after the 3rd batch the cost is stuck at 0.6931472. Right afterwards I ran a simple Keras model on the dataset and everything worked fine, so I don't think it's a data-related problem.
Any ideas would be greatly appreciated.
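For what it's worth, 0.6931472 is exactly ln 2: the binary cross-entropy of a model that always predicts probability 0.5, i.e. one that has learned nothing. A quick check:
import numpy as np

# ln(2): the binary cross-entropy of a constant 0.5 prediction
print(-np.log(0.5))  # 0.6931471805599453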
I wish to view the final output of training a tf.keras model. In this case it would be an array of predictions from the softmax function, e.g. [0,0,0,1,0,1].
Other threads on here have suggested using model.predict(training_data), but this won't work for my situation, since I am using dropout at training and validation time, so neurons are randomly dropped and predicting again with the same data will give a different result.
def get_model():
    inputs = tf.keras.layers.Input(shape=(input_dims,))
    x = tf.keras.layers.Dropout(rate=dropout_rate)(inputs, training=True)
    x = tf.keras.layers.Dense(units=29, activation='relu')(x)
    x = tf.keras.layers.Dropout(rate=dropout_rate)(x, training=True)
    x = tf.keras.layers.Dense(units=15, activation='relu')(x)
    outputs = tf.keras.layers.Dense(2, activation='softmax')(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])
    return model

myModel = get_model()
myModel.summary()
myModel.fit(X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            validation_data=(X_val, y_val))
In plain TensorFlow, you can grab the output of a model after training quite easily. Here is an example from a GitHub repo:
input = tf.placeholder(tf.float32, shape=[None, INPUT_DIMS])
labels = tf.placeholder(tf.float32, shape=[None])

hidden = tf.nn.tanh(make_nn_layer(normalized, NUM_HIDDEN))
logits = make_nn_layer(hidden, NUM_CLASSES)
outputs = tf.argmax(logits, 1)

int_labels = tf.to_int64(labels)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=int_labels, logits=logits, name='xentropy')
train_step = tf.train.AdamOptimizer().minimize(cross_entropy)

correct_prediction = tf.equal(outputs, int_labels)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())

    validation_dict = {
        input: validation_data[:, 0:7],
        labels: validation_data[:, 7],
    }

    for i in range(NUM_BATCHES):
        batch = training_data[numpy.random.choice(training_size, BATCH_SIZE, False), :]
        train_step.run({input: batch[:, 0:7], labels: batch[:, 7]})

        if i % 100 == 0 or i == NUM_BATCHES - 1:
            print('Accuracy %.2f%% at step %d' % (accuracy.eval(validation_dict) * 100, i))

    output_data = outputs.eval({input: data_vector[:, 0:7]})
The only output I can get from the trained model appears to be a History object. There is also myModel.output, but it is a tensor that I can't evaluate without feeding data into it. Any ideas?
As far as I know, you can't turn off the dropout after passing training=True when calling the layers (unless you transfer the weights to a new model with the same architecture). However, you can instead build and train your model in the normal way (i.e. without using the training argument in the calls) and then selectively turn the dropout layers on and off in the test phase by defining a backend function (i.e. keras.backend.function()) and setting the learning phase (i.e. keras.backend.learning_phase()):
# build your model normally (i.e. without using `training=True` argument)
# train your model...
from keras import backend as K
func = K.function(model.inputs + [K.learning_phase()], model.outputs)
# run the model with dropout layers being active, i.e. learning_phase == 1
preds = func(list_of_input_arrays + [1])
# run the model with dropout layers being inactive, i.e. learning_phase == 0
preds = func(list_of_input_arrays + [0])
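For instance, to approximate Monte Carlo dropout you could average several stochastic forward passes (a sketch, reusing the X_train from the question; func is the backend function defined above):
import numpy as np

# average 10 forward passes with dropout active (learning_phase == 1)
mc_preds = np.mean([func([X_train, 1])[0] for _ in range(10)], axis=0)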
Update: As I suggested above, another approach is to define a new model with the same architecture but without setting training=True, and then transfer the weights from the trained model to this new model. To achieve this, I just add a training argument to your get_model() function:
def get_model(training=None):
    inputs = tf.keras.layers.Input(shape=(input_dims,))
    x = tf.keras.layers.Dropout(rate=dropout_rate)(inputs, training=training)
    x = tf.keras.layers.Dense(units=29, activation='relu')(x)
    x = tf.keras.layers.Dropout(rate=dropout_rate)(x, training=training)
    x = tf.keras.layers.Dense(units=15, activation='relu')(x)
    outputs = tf.keras.layers.Dense(2, activation='softmax')(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])
    return model
# build a model with dropout layers active in both training and test phases
myModel = get_model(training=True)
# train the model
myModel.fit(...)
# build a clone of the model with dropouts deactivated in test phase
myTestModel = get_model() # note: the `training` is `None` by default
# transfer the weights from the trained model to this model
myTestModel.set_weights(myModel.get_weights())
# use the new model in test phase; the dropouts would not be active
myTestModel.predict(...)
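A quick sanity check (a sketch, assuming the X_val from the question): with the dropout layers inactive, repeated predictions on the same inputs should now be identical:
import numpy as np

p1 = myTestModel.predict(X_val)
p2 = myTestModel.predict(X_val)
assert np.allclose(p1, p2)  # deterministic now that dropout is off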
I'm converting a basic LSTM many-to-one architecture, which predicts the next single element in a sequence, from Keras to PyTorch. The NN architecture is the following (the whole code can be found here):
model = Sequential()
model.add(LSTM(
    512,
    input_shape=(network_input.shape[1], network_input.shape[2]),
    return_sequences=True
))
model.add(Dropout(0.3))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(512))
model.add(Dense(256))
model.add(Dropout(0.3))
model.add(Dense(n_vocab))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
Running both models with the same data (yes, I've explicitly checked that), both start with a loss value of ~4, but after 100 epochs or so Keras has already reached a loss of ~0.02, which gives the desired results.
However, the PyTorch model is stuck at ~3.4 after 20 epochs. I've tried many things:
Playing with the LR: it explodes when the LR is too high, so at least the parameters are being updated.
Different optimizers (SGD, Adam, RMSprop), but same results with all.
Swapping between .view(), .squeeze_() and indexing when accessing the last sequence element.
Adding, removing and modifying non-linear activation functions and dropout.
Removing manual initialization for x_0 and h_0.
Here is the code for my model:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NNP_RNN(nn.Module):
    def __init__(self):
        super(NNP_RNN, self).__init__()
        self.lstm_1 = nn.LSTM(input_size=1, hidden_size=512, batch_first=True)
        self.lstm_2 = nn.LSTM(input_size=512, hidden_size=512, batch_first=True)
        self.lstm_3 = nn.LSTM(input_size=512, hidden_size=512, batch_first=True)
        self.dense_1 = nn.Linear(in_features=512, out_features=256)
        self.dense_2 = nn.Linear(in_features=256, out_features=58)

    def forward(self, x):
        batch_size = x.size(0)
        h_0 = NNP_RNN.init_hidden((1, batch_size, 512))
        c_0 = NNP_RNN.init_hidden((1, batch_size, 512))
        x, _ = self.lstm_1(x, (h_0, c_0))
        x = F.dropout(x, 0.3)
        x, _ = self.lstm_2(x, (h_0, c_0))
        x = F.dropout(x, 0.2)
        _, (x, _) = self.lstm_3(x, (h_0, c_0))
        x = x.squeeze_(0)
        x = self.dense_1(x)
        x = F.dropout(x, 0.1)
        x = self.dense_2(x)
        return x

    @staticmethod
    def init_hidden(dims):
        return torch.zeros(dims, device=device)
And the training process:
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, verbose=True, patience=5)
criterion = nn.CrossEntropyLoss()

for epoch in range(1, epochs + 1):
    epoch_loss = 0
    epoch_corrects = 0
    for features, labels in tqdm(data, ncols=800):
        features = features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        batch_size = features.size(0)
        output = model(features)

        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        corrects = torch.argmax(output, dim=1)
        corrects = torch.eq(corrects, labels).sum().item()
        epoch_corrects += corrects

        epoch_loss += loss.clone() * batch_size

    epoch_loss /= len(data.dataset)
    epoch_corrects /= len(data.dataset)

    print(f'Loss epoch #{epoch} = {epoch_loss:.10f}, Accuracy = {epoch_corrects}')
    scheduler.step(epoch_loss)
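One Keras/PyTorch difference worth double-checking in the model above (an observation, not necessarily the root cause): the functional F.dropout defaults to training=True, so the dropout calls in forward() stay active even after model.eval(), unlike Keras Dropout layers or PyTorch's nn.Dropout modules. A minimal demonstration:
import torch
import torch.nn.functional as F

x = torch.ones(3, 4)
print(F.dropout(x, 0.3))                  # still drops (and rescales): training defaults to True
print(F.dropout(x, 0.3, training=False))  # identity, as expected in eval mode
Passing training=self.training through the functional calls, or using nn.Dropout modules, restores the usual train/eval behaviour.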
For several days now, I've been trying to replicate my Keras training results with PyTorch. Whatever I do, the PyTorch model overfits far earlier and more strongly on the validation set than in Keras. For PyTorch I use the same Xception code from https://github.com/Cadene/pretrained-models.pytorch.
The data loading, the augmentation, the validation, the training schedule etc. are equivalent. Am I missing something obvious? There must be a general problem somewhere. I've tried countless different configurations, but nothing seems to come even close to the Keras training. Can somebody help?
Keras model: val accuracy > 90%
# base model
base_model = applications.Xception(weights='imagenet', include_top=False, input_shape=(img_width, img_height, 3))

# top model
x = base_model.output
x = GlobalMaxPooling2D()(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(4, activation='softmax')(x)

# this is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

# compile model
from keras import optimizers
adam = optimizers.Adam(lr=0.0001)
model.compile(loss='categorical_crossentropy',
              optimizer=adam, metrics=['accuracy'])
# LROnPlateau etc. with equivalent settings as pytorch
PyTorch model: val accuracy ~81%
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from xception import xception

# modified from https://github.com/Cadene/pretrained-models.pytorch
class XCeption(nn.Module):
    def __init__(self, num_classes):
        super(XCeption, self).__init__()
        original_model = xception(pretrained="imagenet")
        self.features = nn.Sequential(*list(original_model.children())[:-1])
        self.last_linear = nn.Sequential(
            nn.Linear(original_model.last_linear.in_features, 512),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(512, num_classes)
        )

    def logits(self, features):
        x = F.relu(features)
        x = F.adaptive_max_pool2d(x, (1, 1))
        x = x.view(x.size(0), -1)
        x = self.last_linear(x)
        return x

    def forward(self, input):
        x = self.features(input)
        x = self.logits(x)
        return x

device = torch.device("cuda")
model = XCeption(len(class_names))
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    model = nn.DataParallel(model)
model.to(device)

criterion = nn.CrossEntropyLoss(size_average=False)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5, cooldown=5)
Thank you very much!
Update:
Settings:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5, cooldown=5)
model = train_model(model, train_loader, val_loader,
                    criterion, optimizer, scheduler,
                    batch_size, trainmult=8, valmult=10,
                    num_epochs=200, epochs_top=0)
Cleaned training function:
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, batch_size, trainmult=1, valmult=1, num_epochs=None, epochs_top=0):
    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            running_loss = 0.0
            running_acc = 0
            total = 0
            # Iterate over data.
            if phase == "train":
                model.train(True)  # Set model to training mode
                for i in range(trainmult):
                    for data in train_loader:
                        # get the inputs
                        inputs, labels = data
                        inputs, labels = inputs.to(torch.device("cuda")), labels.to(torch.device("cuda"))
                        # zero the parameter gradients
                        optimizer.zero_grad()
                        # forward
                        outputs = model(inputs)  # notinception
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)
                        # backward + optimize only if in training phase
                        loss.backward()
                        optimizer.step()
                        # statistics
                        total += labels.size(0)
                        running_loss += loss.item() * labels.size(0)
                        running_acc += torch.sum(preds == labels)
                train_loss = running_loss / total
                train_acc = running_acc.double() / total
            else:
                model.train(False)  # Set model to evaluate mode
                with torch.no_grad():
                    for i in range(valmult):
                        for data in val_loader:
                            # get the inputs
                            inputs, labels = data
                            inputs, labels = inputs.to(torch.device("cuda")), labels.to(torch.device("cuda"))
                            # zero the parameter gradients
                            optimizer.zero_grad()
                            # forward
                            outputs = model(inputs)
                            _, preds = torch.max(outputs, 1)
                            loss = criterion(outputs, labels.data)
                            # statistics
                            total += labels.size(0)
                            running_loss += loss.item() * labels.size(0)
                            running_acc += torch.sum(preds == labels)
                val_loss = running_loss / total
                val_acc = running_acc.double() / total
                scheduler.step(val_loss)
    return model
It may be because of the type of weight initialization you are using; otherwise this should not happen.
Try using the same initializer in both models.
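For instance, Keras Dense layers default to Glorot-uniform weights with zero biases, while PyTorch's nn.Linear defaults to Kaiming-uniform. A minimal sketch of matching the Keras defaults in the PyTorch head (assuming the last_linear Sequential from the question, before any DataParallel wrapping):
import torch.nn as nn

def init_like_keras(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)  # Keras default: 'glorot_uniform'
        nn.init.zeros_(m.bias)             # Keras default: 'zeros'

model.last_linear.apply(init_like_keras)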
self.features=nn.Sequential(*list(original_model.children())[:-1])
Are you sure that this line re-instantiates your model in exactly the same way? You're using an nn.Sequential instead of the original Xception model's forward function. If anything in that forward function isn't exactly the same as running the modules through an nn.Sequential, it will not reproduce the same performance.
Instead of wrapping it in a Sequential, you could just replace the last layer in place:
my_model = Xception()
# load the pretrained weights before you change the architecture
my_model.load_state_dict(torch.load(path_to_weights))
# grab in_features before overwriting, then replace the original's last_linear with your own
in_features = my_model.last_linear.in_features
my_model.last_linear = nn.Sequential(
    nn.Linear(in_features, 512),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(512, num_classes)
)
Edits below
I am in the process of learning about artificial neural networks using the Keras library, and to make sure I have a good understanding of the basics of neural-network classification, I have been trying to reproduce a neural network written with Keras using only TensorFlow. However, I have run into some problems.
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import confusion_matrix

training_epochs = 100
n_input = 11
n_hidden_1 = 6
n_hidden_2 = 6
n_output = 1

classifier = Sequential()
classifier.add(Dense(output_dim=n_hidden_1, init='uniform', activation='relu', input_dim=n_input))
classifier.add(Dense(output_dim=n_hidden_2, init='uniform', activation='relu'))
classifier.add(Dense(output_dim=n_output, init='uniform', activation='sigmoid'))
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
classifier.fit(X_train, y_train, batch_size=10, nb_epoch=training_epochs)

y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)
cm = confusion_matrix(y_test, y_pred)
print(cm)
So essentially I am using a neural network with two hidden layers of size 6, an input layer of size 11, and an output layer of size 1. My output layer uses the sigmoid function to generate probabilities in order to classify training data into binary categories. I tried to reproduce this with TensorFlow as follows:
training_epochs = 100
n_input = 11
n_hidden_1 = 6
n_hidden_2 = 6
n_output = 1

def neuralNetwork(x, weights):
    layer_1 = tf.matmul(x, weights['h1'])
    layer_1 = tf.nn.relu(layer_1)
    layer_2 = tf.matmul(layer_1, weights['h2'])
    layer_2 = tf.nn.relu(layer_2)
    output_layer = tf.matmul(layer_2, weights['output'])
    return output_layer

weights = {
    'h1': tf.Variable(tf.random_uniform([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_uniform([n_hidden_1, n_hidden_2])),
    'output': tf.Variable(tf.random_uniform([n_hidden_2, n_output]))
}

x = tf.placeholder('float', [None, n_input])   # [?, 11]
y = tf.placeholder('float', [None, n_output])  # [?, 1]

logits = neuralNetwork(x, weights)
prediction = tf.nn.softmax(logits)
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y))
optimizer = tf.train.AdamOptimizer().minimize(cost)

init = tf.global_variables_initializer()
with tf.Session() as session:
    session.run(init)
    for epoch in range(training_epochs):
        loss, accuracy = session.run([optimizer, cost], feed_dict={x: X_train, y: y_train})
        print('Epoch: {} Acc: {}'.format(epoch + 1, accuracy))
    print('Model has completed training.')
However, I keep getting the error:
Cannot feed value of shape (8000,) for Tensor 'Placeholder_1:0', which has shape '(?, 1)'
My input data has 8000 rows and 11 columns, and my output data has 8000 rows and 1 column. To work around this, I tried feeding the data in row by row, but I kept getting more errors. Am I going about this the right way? Any help would be appreciated!
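As in the first question above, the mismatch is between a flat label array of shape (8000,) and the (?, 1) placeholder; a reshape along these lines should clear that particular error (a sketch, assuming y_train and y_test are flat NumPy arrays):
y_train = y_train.reshape(-1, 1)  # (8000,) -> (8000, 1)
y_test = y_test.reshape(-1, 1)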
Edit: So I updated my code following the given suggestions. I am now getting output for accuracy; however, it seems to finish at around 4-5%. Furthermore, the accuracy seems to decrease over time rather than improve. When I increase the number of training epochs to 200, the accuracy dips even lower (to around 2%).
Epoch: 1 Acc: 7.641509056091309
...
...
Epoch: 100 Acc: 4.339457035064697