I'm trying to develop a small network for the aerial cactus identification challenge.
This is a binary classification challenge (0 no cactus, 1 is cactus), but my networks is always outputting the same cost.
I built a simple network, which works when implemented with keras, but I'm trying to use tensorflow training loop for learning purposes, but can't make it work
My network architecture:
2 Conv 64x3 + Maxpooling
2 Conv 128x3 + Maxpooling
Flatten
Dense 1024
Dense 512
Dense 1
Here is my code:
def process_one_batch(x, y):
inputs = Conv2D(64, 3, activation='relu')(x)
inputs = Conv2D(64, 3, activation='relu')(inputs)
inputs = MaxPooling2D(pool_size=2, strides=2)(inputs)
inputs = Conv2D(128, 3, activation='relu')(inputs)
inputs = Conv2D(128, 3, activation='relu')(inputs)
inputs = MaxPooling2D(pool_size=2, strides=2)(inputs)
flat = Flatten()(block2)
dense1 = Dense(1024, activation="relu")(flat)
dense2 = Dense(512, activation="relu")(dense1)
dense2 = Dense(1, activation='sigmoid')(dense2)
res = dense2
return res
NB_EPOCHS = 5
def create_dataset(X, y, batch_size=BATCH_SIZE, nb_epochs=NB_EPOCHS, batch=True):
dataset = tf.data.Dataset.from_tensor_slices((X, y))
dataset = dataset.map(my_process_path)
if batch:
dataset = dataset.batch(batch_size)
dataset = dataset.repeat(nb_epochs)
dataset = dataset.prefetch(buffer_size=2)
iterator = tf.data.make_one_shot_iterator(dataset)
#iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()
y_ = process_one_batch(next_element[0], next_element[1])
return dataset, next_element, y_
train_ds, (train_x, train_y), prediction = create_dataset(X_train.values, y_train.values)
test_ds, (test_x, test_y), test_prediction = create_dataset(X_test.values, y_test.values, batch=True)
cross_entropy = tf.reduce_mean(tf.keras.losses.binary_crossentropy(
train_y, tf.reshape(tf.transpose(prediction), [-1]), from_logits=True
))
optimiser = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cross_entropy)
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
train_steps = int(len(X_train.values) / BATCH_SIZE)
val_steps = int(len(X_test.values) / BATCH_SIZE)
# initialise the variables
sess.run(init_op)
print('Init')
for epoch in range(NB_EPOCHS):
avg_cost = 0
train_acc = 0
for i in range(train_steps):
_, c, ac = sess.run([optimiser, cross_entropy, accuracy_train])
print(c)
avg_cost += 1
train_acc += ac
print('train_acc: ', train_acc/train_steps)
print(train_steps, val_steps)
print("Epoch:", (epoch + 1), "cost =", "{:.3f}".format(avg_cost / train_steps))
print("\nTraining complete!")
I have tried debugging the model input, it's not always the same, but after the 3rd batch cost is stucked at 0.6931472. Just after I run a simple keras model on the dataset and everything works fine, so I don't thinks it's a data related problem
Any idea would be gladly appreciated
Related
I'm trying to implement a Fully Convolutional Neural Network and can successfully test the accuracy of the model on the test set after training. However, I'd like to use the model to make a prediction on a single sample only. Training was in batches. I believe what I'm missing is related to batch size and input shape. Here is the configuration for the network:
def read(file_name):
data = np.loadtxt(file_name, delimiter="\t")
y = data[:, 0]
x = data[:, 1:]
return x, y.astype(int)
train_data, train_labels = read("FordA_TRAIN.tsv")
test_data, test_labels = read("FordA_TEST.tsv")
train_data = train_data.reshape((train_data.shape[0], train_data.shape[1], 1))
test_data = test_data.reshape((test_data.shape[0], test_data.shape[1], 1))
num_classes = len(np.unique(train_labels))
#print(train_data[0])
# Shuffle the data to prepare for validation_split (and prevent overfitting for class order)
idx = np.random.permutation(len(train_data))
train_data = train_data[idx]
train_labels = train_labels[idx]
#Standardize labels to have a value between 0 and 1 rather than -1 and 1.
train_labels[train_labels == -1] = 0
test_labels[test_labels == -1] = 0
def make_model(input_shape):
input_layer = keras.layers.Input(input_shape)
conv1 = keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(input_layer)
conv1 = keras.layers.BatchNormalization()(conv1)
conv1 = keras.layers.ReLU()(conv1)
conv2 = keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(conv1)
conv2 = keras.layers.BatchNormalization()(conv2)
conv2 = keras.layers.ReLU()(conv2)
conv3 = keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(conv2)
conv3 = keras.layers.BatchNormalization()(conv3)
conv3 = keras.layers.ReLU()(conv3)
gap = keras.layers.GlobalAveragePooling1D()(conv3)
output_layer = keras.layers.Dense(num_classes, activation="softmax")(gap)
return keras.models.Model(inputs=input_layer, outputs=output_layer)
model = make_model(input_shape=train_data.shape[1:])
keras.utils.plot_model(model, show_shapes=True)
epochs = 500
batch_size = 32
callbacks = [
keras.callbacks.ModelCheckpoint(
"best_model.h5", save_best_only=True, monitor="val_loss"
),
keras.callbacks.ReduceLROnPlateau(
monitor="val_loss", factor=0.5, patience=20, min_lr=0.0001
),
keras.callbacks.EarlyStopping(monitor="val_loss", mode = 'min', patience=50, verbose=1),
]
model.compile(
optimizer="adam",
loss="sparse_categorical_crossentropy",
metrics=["sparse_categorical_accuracy"],
)
history = model.fit(
train_data,
train_labels,
batch_size=batch_size,
epochs=epochs,
callbacks=callbacks,
validation_split=0.2,
verbose=1,
)
model = keras.models.load_model("best_model.h5")
test_loss, test_acc = model.evaluate(test_data, test_labels)
print("Test accuracy", test_acc)
print("Test loss", test_loss)
The above code can successfully display where the accuracy converged. Now, I'd like to make predictions on single samples. So far I have:
def read(file_name):
data = np.loadtxt(file_name, delimiter="\t")
y = data[:, 0]
x = data[:, 1:]
return x, y.astype(int)
test_data, test_labels = read("FordA_TEST_B.tsv")
test_data = test_data.reshape((test_data.shape[0], test_data.shape[1], 1))
test_labels[test_labels == -1] = 0
print(test_data)
model = keras.models.load_model("forda_original_model.h5")
q = model.predict(test_data[0])
This raises the error: ValueError: Error when checking input: expected input_1 to have 3 dimensions, but got array with shape (500, 1)
How does the input have to be reshaped and what is the rule to go by? Any help is much appreciated!
Copied from a comment:
The model expects a batch dimension. Thus, to predict for a single model, just expand the dimensions to create a single-sized batch by running:
q = model.predict(test_data[0][None,...])
or
q = model.predict(test_data[0][np.newaxis,...])
I'm new to Tensorflow and I'm trying to rebuild a simple network, that I've built in Keras (TF backend), with Tensorflows Python API. It is a simple function approximator (z = sin(x + y)).
I've tried different architectures, optimizers and learning rates, but I'm not getting the new network to train properly. However in my eyes, the networks seem to be identical. Both get the exact same feature vectors and labels:
# making training data
start = 0
end = 2*np.pi
samp = 1000
num_samp = samp**2
step = end / samp
x_train = np.arange(start, end, step)
y_train = np.arange(start, end, step)
data = np.array(np.meshgrid(x_train,y_train)).T.reshape(-1,2)
z_label = np.sin(data[:,0] + data[:,1])
Here is the Keras model:
#start model
model = Sequential()
#stack layers
model.add(Dense(units=128, activation='sigmoid', input_dim=2, name='dense_1'))
model.add(Dense(units=64, activation='sigmoid', input_dim=128, name='dense_2'))
model.add(Dense(units=1, activation='linear', name='output'))
#compile model
model.compile(loss='mean_squared_error',
optimizer='sgd',
metrics=['accuracy'])
checkpointer = ModelCheckpoint(filepath='./weights/weights.h5',
verbose=1, save_best_only=True)
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
model.fit(data, z_label, epochs=20, batch_size=32,
shuffle='true',validation_data=(data_val, z_label_val),
callbacks=[checkpointer, tensorboard])
Here is the new network, built with Tensorflows Python API:
# hyperparameter
n_inputs = 2
n_hidden1 = 128
n_hidden2 = 64
n_outputs = 1
learning_rate = 0.01
# construction phase
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='input')
y = tf.placeholder(tf.float32, shape=(None), name="target")
hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1", activation=tf.nn.sigmoid)
hidden2 = tf.layers.dense(hidden1, n_hidden2, name="hidden2", activation=tf.nn.sigmoid)
logits = tf.layers.dense(hidden2, n_outputs, activation='linear', name='output')
loss = tf.reduce_mean(tf.square(logits - y), name='loss')
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(loss, name='train')
init = tf.global_variables_initializer()
saver = tf.train.Saver()
# --- execution phase ---
n_epochs = 40
batch_size = 32
n_batches = int(num_samp/batch_size)
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
print("Epoch: ", epoch, " Running...")
loss_arr = np.array([])
for iteration in range( n_batches ):
start = iteration * batch_size
end = start + batch_size
sess.run(training_op, feed_dict={X: data[start:end], y: z_label[start:end] })
loss_arr = np.append(loss_arr, loss.eval(feed_dict={X: data[start:end, :], y: z_label[start:end]}))
mean_loss = np.mean(loss_arr)
print("Epoch: ", epoch, " Calculated ==> Loss: ", mean_loss)
While the Keras model train properly with a decreasing loss and proper test results, the new model converges pretty fast and stops learning. Accordingly the results are completely useless.
Am I building/training the the model incorrectly or is Keras doing anything in the background, that I'm not aware of?
Solved this issue. The problem was the shape of the label vector. It was a lying vector with shape (1000000,). While Keras is apparently capable of dealing with different shapes of output and label vectors, Tensorflow initialized the placeholder incorrectly and the loss function
loss = tf.reduce_mean(tf.square(logits - y), name='loss')
did't make sense anymore and thus training failed. Adding
z_label = z_label.reshape(-1,1)
reshaped the label vector to (1000000, 1) and solved it. Alternatively one can specify the shape of the placeholder more precisely
y = tf.placeholder(tf.float32, shape=(None,1), name="target")
One of my black beast in Tensorflow is the question of Shape.
I always stack their.
That time I have a multiclassification problem and I need to use one_hot encoding with nn.softmax_cross_entropy_with_logits.
I have tried many solution on the net but I still always get this error:
Cannot feed value of shape (100, 1) for Tensor 'input/Y:0', which has shape '(?,)'
Here is the essential part of my code :
Here where I set my placeholder and I apply tf.one_hot:
with tf.name_scope('input'):
# [BATCH_SIZE, NUM_FEATURES]
self.X=tf.placeholder(dtype=tf.float32, shape=[None,self.n_input_train], name="X")
# [BATCH_SIZE]
self.Y = tf.placeholder(dtype=tf.int32, shape=[None], name='Y')
self.is_train = tf.placeholder(tf.bool, name="is_train")
# [BATCH_SIZE, NUM_CLASSES]
self.Y_onehot = tf.one_hot(indices=self.Y, depth=self.n_classes, on_value=1, off_value=0, name='Y_onehot')
The code stack here in sess.run showing the error above :
for sample in mini_batches:
batch_x = x_train.iloc[sample, :]
batch_y =train_output.iloc[sample, :]
#batch_y = np.reshape(batch_y, (-1))
feed_dict={self.X: batch_x,self.Y:batch_y, self.is_train:True}
self.train_summary, _, cost,acc=self.sess.run([self.merged, self.train_step, self.loss_, self.accuracy_],feed_dict=feed_dict)
avg_cost += cost *len(sample)/n_samples
print('epoch[{}] step [{}] train -- loss : {}, accuracy : {}'.format(epoch,step, avg_cost, acc))
step += 100
My labels look like something like this (it's a vector of one column only containing the values of the factors that represent my classes) :
0
0 108
1 30
2 30
3 16
4 62
5 126
6 22
7 30
8 48
And here how i declare the last output in my model :
# Output fully connected layer with the output
out_layer = tf.layers.dense(inputs=layer_3, units= self.n_classes, use_bias=True, kernel_initializer=self._init, name= 'out_layer')
And those are the diff shapes :
The shape of logits (?, 64)
The shape of Y (?, 64)
The shape of X (?, 14)
The shape of tain_input (847, 14)
The shape of tain_output (847, 1)
The shape of y_batch (100, 1)
Edit:
Here is the model :
def multilayer_perceptron(self,X):
# Hidden fully connected layer with n_hidden_1 neurons
layer_1 = tf.layers.dense(inputs=X, units= self.n_hidden_1, use_bias=True, kernel_initializer=self._init, name= 'layer_1')
layer_1 = tf.layers.batch_normalization(layer_1,training=self.is_train)
layer_1 = self.activation(layer_1)
# Hidden fully connected layer with n_hidden_2 neurons
layer_2 = tf.layers.dense(inputs=layer_1, units= self.n_hidden_2, use_bias=True, kernel_initializer=self._init, name= 'layer_2')
layer_2 = tf.layers.batch_normalization(layer_2,training=self.is_train)
layer_2 = self.activation(layer_2)
# Hidden fully connected layer with n_hidden_3 neurons
layer_3 = tf.layers.dense(inputs=layer_2, units= self.n_hidden_3, use_bias=True, kernel_initializer=self._init, name= 'layer_3')
layer_3 = tf.layers.batch_normalization(layer_3, training=self.is_train)
layer_3 = self.activation(layer_3)
# Output fully connected layer with the output
out_layer = tf.layers.dense(inputs=layer_3, units= self.n_classes, use_bias=True, kernel_initializer=self._init, name= 'out_layer')
tf.summary.histogram('pre-activations', out_layer)
return layer_1, layer_2, layer_3, out_layer
And here is how I calculate the loss:
def loss(self, X, Y):
_, _, _, self.logits = self.multilayer_perceptron(X)
print("The shape of logits ", self.logits.get_shape())
print("The shape of Y ", self.Y.get_shape())
print("The shape of X ", X.get_shape())
with tf.name_scope('loss'):
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=Y))
tf.summary.scalar('loss', loss)
with tf.name_scope('accuracy'):
with tf.name_scope('correct_prediction'):
correct_prediction = tf.equal(tf.argmax(self.logits, 1), tf.argmax(Y, 1))
with tf.name_scope('accuracy'):
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
tf.summary.scalar('accuracy', accuracy)
self.merged = tf.summary.merge_all()
return loss, accuracy
And here is the where I call the loss function:
def cross_validation(self,batch_size, n_hidden_1 , n_hidden_2, n_hidden_3, learning_rate):
loss = 0
tf.reset_default_graph()
with tf.name_scope('input'):
...
# [BATCH_SIZE]
#self.Y=tf.placeholder(dtype=tf.int64, shape=[None,self.y_train.shape[1]], name="Y")
self.Y = tf.placeholder(dtype=tf.int32, shape=[None], name='Y')
# [BATCH_SIZE, NUM_CLASSES]
...
self.loss_, self.accuracy_ = self.loss(self.X, self.Y_onehot)
self.train_step = self.optimizer(self.learning_rate).minimize(self.loss_)
# Initiate a tensor session
init = tf.global_variables_initializer()
self.sess = tf.Session()
self.sess.run(init)
#train the model
loss = self.train()
self.sess.close()
del self.sess
return loss
How can I fix that ?
Is their any tips to follow to avoid those problem of shapes?
I did fix the issue finally by using flatten(), it transform 2D array to 1D array :
batch_y =train_output.iloc[sample, :]
batch_y = np.array(batch_y).flatten()
I'm converting a basic LSTM many-to-one architecture to predict the next single element in a sequence, written in Keras to Pytorch. NN architecture is the following (whole code can be found here):
model = Sequential()
model.add(LSTM(
512,
input_shape=(network_input.shape[1], network_input.shape[2]),
return_sequences=True
))
model.add(Dropout(0.3))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(512))
model.add(Dense(256))
model.add(Dropout(0.3))
model.add(Dense(n_vocab))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
Running both models with the same data (yes, I've explicitly checked that), both start with a loss value ~ 4, but after 100 epochs or so, Keras already reached a loss ~ 0.02, which gives the desired results.
However, Pytorch model is stuck around ~ 3.4 after 20 epochs. I've tried many things:
Play with LR: It explodes when LR is too high, so this means that at least parameters are being updated.
Different optimizers, SGD, Adam, RMSprop, but same results with all.
Swap between .view[], .squeeze_ and indexing when accessing last sequence element.
Add, remove and modify non-linear activation functions and dropout.
Remove manual initialization for x_0 and h_0.
Here is the code for my model:
class NNP_RNN(nn.Module):
def __init__(self):
super(NNP_RNN, self).__init__()
self.lstm_1 = nn.LSTM(input_size=1, hidden_size=512, batch_first=True)
self.lstm_2 = nn.LSTM(input_size=512, hidden_size=512, batch_first=True)
self.lstm_3 = nn.LSTM(input_size=512, hidden_size=512, batch_first=True)
self.dense_1 = nn.Linear(in_features=512, out_features=256)
self.dense_2 = nn.Linear(in_features=256, out_features=58)
def forward(self, x):
batch_size = x.size(0)
h_0 = NNP_RNN.init_hidden((1, batch_size, 512))
c_0 = NNP_RNN.init_hidden((1, batch_size, 512))
x, _ = self.lstm_1(x, (h_0, c_0))
x = F.dropout(x, 0.3)
x, _ = self.lstm_2(x, (h_0, c_0))
x = F.dropout(x, 0.2)
_, (x, _) = self.lstm_3(x, (h_0, c_0))
x = x.squeeze_(0)
x = self.dense_1(x)
x = F.dropout(x, 0.1)
x = self.dense_2(x)
return x
#staticmethod
def init_hidden(dims):
return torch.zeros(dims, device=device)
And the training process:
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, verbose=True, patience=5)
criterion = nn.CrossEntropyLoss()
for epoch in range(1, epochs + 1):
epoch_loss = 0
epoch_corrects = 0
for features, labels in tqdm(data, ncols=800):
features = features.to(device)
labels = labels.to(device)
optimizer.zero_grad()
batch_size = features.size(0)
output = model(features)
loss = criterion(output, labels)
loss.backward()
optimizer.step()
corrects = torch.argmax(output, dim=1)
corrects = torch.eq(corrects, labels).sum().item()
epoch_corrects += corrects
epoch_loss += loss.clone() * batch_size
epoch_loss /= len(data.dataset)
epoch_corrects /= len(data.dataset)
print(f'Loss epoch #{epoch} = {epoch_loss:.10f}, Accuracy = {epoch_corrects}')
scheduler.step(epoch_loss)
For several days now, I'm trying to replicate my keras training results with pytorch. Whatever I do, the pytorch model will overfit far earlier and stronger to the validation set then in keras. For pytorch I use the same XCeption Code from https://github.com/Cadene/pretrained-models.pytorch.
The dataloading, the augmentation, the validation, the training schedule etc. are equivalent. Am I missing something obvious? There must be a general problem somewhere. I tried thousands of different module constellations, but nothing seems to come even close to the keras training. Can somebody help?
Keras model: val accuracy > 90%
# base model
base_model = applications.Xception(weights='imagenet', include_top=False, input_shape=(img_width, img_height, 3))
# top model
x = base_model.output
x = GlobalMaxPooling2D()(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(4, activation='softmax')(x)
# this is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)
# Compile model
from keras import optimizers
adam = optimizers.Adam(lr=0.0001)
model.compile(loss='categorical_crossentropy',
optimizer=adam, metrics=['accuracy'])
# LROnPlateau etc. with equivalent settings as pytorch
Pytorch model: val accuracy ~81%
from xception import xception
import torch.nn.functional as F
# modified from https://github.com/Cadene/pretrained-models.pytorch
class XCeption(nn.Module):
def __init__(self, num_classes):
super(XCeption, self).__init__()
original_model = xception(pretrained="imagenet")
self.features=nn.Sequential(*list(original_model.children())[:-1])
self.last_linear = nn.Sequential(
nn.Linear(original_model.last_linear.in_features, 512),
nn.ReLU(),
nn.Dropout(p=0.5),
nn.Linear(512, num_classes)
)
def logits(self, features):
x = F.relu(features)
x = F.adaptive_max_pool2d(x, (1, 1))
x = x.view(x.size(0), -1)
x = self.last_linear(x)
return x
def forward(self, input):
x = self.features(input)
x = self.logits(x)
return x
device = torch.device("cuda")
model=XCeption(len(class_names))
if torch.cuda.device_count() > 1:
print("Let's use", torch.cuda.device_count(), "GPUs!")
# dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
model = nn.DataParallel(model)
model.to(device)
criterion = nn.CrossEntropyLoss(size_average=False)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5, cooldown=5)
Thank you very much!
Update:
Settings:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5, cooldown=5)
model = train_model(model, train_loader, val_loader,
criterion, optimizer, scheduler,
batch_size, trainmult=8, valmult=10,
num_epochs=200, epochs_top=0)
Cleaned training function:
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, batch_size, trainmult=1, valmult=1, num_epochs=None, epochs_top=0):
for epoch in range(num_epochs):
for phase in ['train', 'val']:
running_loss = 0.0
running_acc = 0
total = 0
# Iterate over data.
if phase=="train":
model.train(True) # Set model to training mode
for i in range(trainmult):
for data in train_loader:
# get the inputs
inputs, labels = data
inputs, labels = inputs.to(torch.device("cuda")), labels.to(torch.device("cuda"))
# zero the parameter gradients
optimizer.zero_grad()
# forward
outputs = model(inputs) # notinception
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
loss.backward()
optimizer.step()
# statistics
total += labels.size(0)
running_loss += loss.item()*labels.size(0)
running_acc += torch.sum(preds == labels)
train_loss=(running_loss/total)
train_acc=(running_acc.double()/total)
else:
model.train(False) # Set model to evaluate mode
with torch.no_grad():
for i in range(valmult):
for data in val_loader:
# get the inputs
inputs, labels = data
inputs, labels = inputs.to(torch.device("cuda")), labels.to(torch.device("cuda"))
# zero the parameter gradients
optimizer.zero_grad()
# forward
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels.data)
# statistics
total += labels.size(0)
running_loss += loss.item()*labels.size(0)
running_acc += torch.sum(preds == labels)
val_loss=(running_loss/total)
val_acc=(running_acc.double()/total)
scheduler.step(val_loss)
return model
it may be because type of weight initialization you are using
otherwise this should not happen
try with same initializer in both the models
self.features=nn.Sequential(*list(original_model.children())[:-1])
Are you sure that this line re-instantiates your model in exactly the same way? You're using a NN.Sequential instead of the original XCeption model's forward function. If there's anything in that forward function that isn't the exact same as using a nn.Sequential, it will not reproduce the same performance.
Instead of wrapping it in a Sequential, you could just change this
my_model = Xception()
# load weights before you change the architecture
my_model = load_weights(path_to_weights)
# overwrite the original's last_linear with your own
my_model.last_linear = nn.Sequential(
nn.Linear(original_model.last_linear.in_features, 512),
nn.ReLU(),
nn.Dropout(p=0.5),
nn.Linear(512, num_classes)
)