I'm using TensorFlow to train a network to predict the third item in a list of numbers.
During training, the network appears to learn well and to perform well on both the training and test sets. However, when I evaluate its performance myself, it seems to be doing quite poorly.
For example, at the end of training, TensorFlow says the validation loss is 2.1 x 10^(-5), but when I compute it myself I get 0.17. What am I doing wrong?
Here's code that can be run on Google Colab:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
def create_dataset(k=5, n=2, example_amount=200):
    '''Create a dataset of numbers where the goal is to always output the nth number'''
    # UPGRADE: this could be done better with numpy to just generate all the examples at once
    example_amount = 1000
    x = []
    y = []
    ans = [x, y]
    for i in range(example_amount):
        example_x = np.random.rand(k)
        example_y = example_x[n]
        x.append(example_x)
        y.append(example_y)
    return ans

def tensorize(tensor_like) -> tf.Tensor:
    '''Turn stuff into tensors'''
    return tf.convert_to_tensor(tensor_like, dtype=tf.float32)

def split_dataset(dataset, train_split=0.8, random_state=42):
    '''
    Takes in a list (or tuple) where index 0 contains the inputs and index 1 contains the outputs
    outputs x_train, x_test, y_train, y_test, train_indexes, test_indexes all as tf.Tensor
    '''
    indices = np.arange(len(dataset[0]))
    return tuple([tensorize(data) for data in train_test_split(dataset[0], dataset[1], indices, train_size=train_split, random_state=random_state)])
# how many numbers in each example
K = 5
# the index of the solution
N = 2
# how many examples
EXAMPLE_AMOUNT = 20000
# what percentage of the examples are in the training set
TRAIN_SPLIT = 0.5
# how long to train for
epochs = 50
dataset = create_dataset(K, N, EXAMPLE_AMOUNT)
x_train, x_test, y_train, y_test, train_indexes, test_indexes = split_dataset(dataset, train_split=TRAIN_SPLIT)
model_input = tf.keras.layers.Input(shape=(K,), name="input")
model_dense1 = tf.keras.layers.Dense(10, name="dense1")(model_input)
model_dense2 = tf.keras.layers.Dense(10, name="dense2")(model_dense1)
model_output = tf.keras.layers.Dense(1, name="output")(model_dense2)
model = tf.keras.Model(inputs=model_input, outputs=model_output)
model.compile(optimizer=tf.keras.optimizers.Adam(), loss="mse")
history = model.fit(x=x_train, y=y_train, validation_data=(x_test, y_test), epochs=epochs)
# the validation loss as Tensorflow computes it
print(history.history["val_loss"][-1]) # 2.1036579710198566e-05
# the validation loss as I compute it
val_loss = tf.math.reduce_mean(tf.keras.losses.MSE(y_test, model.predict(x_test))).numpy()
print(val_loss) # 0.1655631
What you're missing is the shape of y_test:
y_test.numpy().shape
(500,)  # <-- this is what causes the behaviour
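The mismatch comes from broadcasting: model.predict(x_test) returns shape (500, 1), so tf.keras.losses.MSE broadcasts the (500,) targets against it and effectively averages a (500, 500) matrix of pairwise errors instead of 500 element-wise errors. A minimal NumPy sketch of the same effect (not from the original post):

import numpy as np

y_true = np.arange(4, dtype=np.float32)   # shape (4,), like y_test
y_pred = y_true.reshape(-1, 1)            # shape (4, 1), like model.predict output

# Broadcasting turns the element-wise difference into a (4, 4) matrix,
# so the "MSE" is averaged over every pairwise combination.
print((y_true - y_pred).shape)            # (4, 4)
print(np.mean((y_true - y_pred) ** 2))    # not 0, even though the values match element-wise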
Simply reshape it like:
val_loss = tf.math.reduce_mean(tf.keras.losses.MSE(y_test.numpy().reshape(-1,1), model.predict(x_test))).numpy()
print(val_loss) # 1.1548506e-05
Also:
history.history["val_loss"][-1] # 1.1548506336112041e-05
Or you can flatten() both arrays when computing it:
val_loss = tf.math.reduce_mean(tf.keras.losses.MSE(y_test.numpy().flatten(), model.predict(x_test).flatten())).numpy()
print(val_loss) # 1.1548506e-05
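Another way to sanity-check this (not from the original post) is to let Keras handle the shapes itself via model.evaluate, which should agree with the last value in history.history["val_loss"]:

# model.evaluate applies the same loss and shape handling as fit(),
# so it should match the val_loss reported during training.
print(model.evaluate(x_test, y_test, verbose=0))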
Related
I am training a Keras model to perform some simple categorisation tasks. In my case, the model needs to learn how to make a judgement based on a task cue and a given task stimulus. The task stimulus is a randomly generated array of 5 numbers. I don't need more than one epoch of training, as learning specific stimuli is not the goal, so I set the number of epochs to 1. However, I don't want the model to reach 100% accuracy on the validation dataset, only 80%.
To achieve this, I used a callback to stop training when the training accuracy reached 80%, but I then found that the accuracy on the validation dataset is much better than the training accuracy (see here). Since I only have one epoch, how should I set up the callback so that the model reaches 80% accuracy on the validation dataset? Thanks in advance!
Here is the code:
import numpy as np
import random
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
# random seed
from numpy.random import seed
seed(1234)
random.seed(10)
# set up a simple categorization task
def tasksets(train_num, test_num, task_num):
    x_train, y_train, rules_train = train_sequence(train_num, task_num)
    x_test, y_test, rules_test = train_sequence(test_num, task_num)
    return x_train, x_test, rules_train, y_train, y_test, rules_test

# generating training sequence
def train_sequence(trial_num, task_num):
    x = np.zeros((trial_num, task_num), dtype=np.float64)
    y = np.zeros(trial_num, dtype=np.float64)
    rules = np.zeros(trial_num, dtype=np.float64)
    rulepool = []
    for r in range(task_num):
        rulepool = rulepool + [r]*int(trial_num/task_num)
    random.shuffle(rulepool)
    for i in range(trial_num):
        for t in range(task_num):
            x[i,t] = random.random() # multi-dimensional stimuli
        rule_idx = rulepool.pop(random.randint(0, len(rulepool)-1))
        rules[i] = rule_idx
        if x[i, rule_idx] <= 0.5:
            answer = 0 # no
        elif x[i, rule_idx] > 0.5:
            answer = 1 # yes
        y[i] = answer
    x = np.reshape(x, (trial_num, task_num))
    y = tf.one_hot(y, 2)
    rules = np.reshape(rules, (trial_num))
    rules = tf.one_hot(rules, depth=task_num)
    return x, y, rules
def build_network(task_num, learning_rate: float = 0.001):
    # nsteps = 1
    input_dims = task_num
    inputs_stimuli = Input(shape=(input_dims,), name="stimulus")
    inputs_rule = Input(shape=(input_dims,), name="rule")
    hid1 = Dense(6, activation='relu', name="stimulus_representation")(inputs_stimuli)
    hid2 = Dense(6, activation='relu', name="rule_representation")(inputs_rule) # back to dense
    fuse = keras.layers.concatenate([hid1, hid2]) # combine multiple inputs
    decision = Dense(100, activation="relu", name="decision")(fuse) # back to dense
    output = Dense(2, activation="softmax", name="output")(decision)
    model = Model(inputs=[inputs_stimuli, inputs_rule], outputs=output)
    loss = tf.keras.losses.CategoricalCrossentropy()
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=loss, metrics=["accuracy"])
    return model
# Instantiate a callback object
accuracy_threshold = 0.80

class myCallback(tf.keras.callbacks.Callback):
    def on_train_batch_end(self, batch, logs={}):
        keys = list(logs.keys())
        print("...Training: start of batch {}; got log keys: {}".format(batch, keys))
        if logs.get('accuracy') > accuracy_threshold:
            print("\nReached %2.2f%% accuracy, so stopping training!!" % (accuracy_threshold*100))
            self.model.stop_training = True

callbacks = myCallback()
# the model is trained until it reaches 80% accuracy in the test
check = 0
test_threshold = 1
task_num = 5
batch_size = 8
model = build_network(task_num)
x_train, x_test, rule_train, y_train, y_test, rule_test = tasksets(10000*task_num, 100*task_num, task_num)
results = model.fit([x_train, rule_train], y_train, epochs = 1,
batch_size = batch_size, callbacks=callbacks,
validation_data = ([x_test, rule_test], y_test))
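One thing to note about the callback above: on_train_batch_end only ever sees training-batch metrics, so it cannot react to validation accuracy directly. If the goal is to stop once the validation set scores about 80%, one option, shown here only as a rough sketch (the names x_test, rule_test, y_test are the arrays defined above, and calling evaluate mid-training resets the running training metrics), is to periodically evaluate the validation data inside the batch callback:

class ValAccuracyStop(tf.keras.callbacks.Callback):
    '''Sketch: evaluate the validation set every N batches and stop at a threshold.'''
    def __init__(self, val_data, threshold=0.80, every_n_batches=50):
        super().__init__()
        self.val_data = val_data
        self.threshold = threshold
        self.every_n_batches = every_n_batches

    def on_train_batch_end(self, batch, logs=None):
        if batch % self.every_n_batches != 0:
            return
        # evaluate() returns [loss, accuracy] because the model was compiled with metrics=["accuracy"]
        _, val_acc = self.model.evaluate(*self.val_data, verbose=0)
        if val_acc >= self.threshold:
            print("\nReached {:.2%} validation accuracy, stopping training.".format(val_acc))
            self.model.stop_training = True

val_callback = ValAccuracyStop(val_data=([x_test, rule_test], y_test))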
I am trying to integrate a Keras deep neural network as the classifier within code for sequential backward feature selection in Python. (Originally, I tried to wrap the Keras network with SciKeras so I could use scikit-learn's built-in sequential feature selection, but I kept getting error messages.)
I found this from-scratch implementation of sequential backward feature selection (taken from https://vitalflux.com/sequential-backward-feature-selection-python-example/) and have been trying to plug a Keras model in as the "estimator" in the function, but I keep getting this error: ValueError: Input 0 of layer "sequential_410" is incompatible with the layer: expected shape=(None, 45), found shape=(None, 44)
Here is the code that I have so far for the sequential backward feature selection and the deep neural network:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier, KerasRegressor
# SBS (sequential backward feature selection) from scratch
#=====================================================
from sklearn.metrics import accuracy_score
from itertools import combinations
from sklearn.base import clone
class SequentialBackwardSearch():
    '''
    Instantiate with Estimator and given number of features
    '''
    def __init__(self, estimator, k_features):
        self.estimator = clone(estimator)
        self.k_features = k_features

    '''
    X_train - Training data Pandas dataframe
    X_test - Test data Pandas dataframe
    y_train - Training label Pandas dataframe
    y_test - Test data Pandas dataframe
    '''
    def fit(self, X_train, X_test, y_train, y_test):
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train.values, X_test.values,
                                 y_train.values, y_test.values, self.indices_)
        self.scores_ = [score]
        '''
        Iterate through all the dimensions until k_features is reached
        At the end of loop, dimension count is reduced by 1
        '''
        while dim > k_features:
            scores = []
            subsets = []
            '''
            Iterate through different combinations of features, train the model,
            record the score
            '''
            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train.values, X_test.values, y_train.values, y_test.values, p)
                scores.append(score)
                subsets.append(p)
            #
            # Get the index of best score
            #
            best_score_index = np.argmax(scores)
            #
            # Record the best score
            #
            self.scores_.append(scores[best_score_index])
            #
            # Get the indices of features which gave best score
            #
            self.indices_ = subsets[best_score_index]
            #
            # Record the indices of features for best score
            #
            self.subsets_.append(self.indices_)
            dim -= 1 # Dimension is reduced by 1

    '''
    Transform training, test data set to the data set
    having features which gave best score
    '''
    def transform(self, X):
        return X.values[:, self.indices_]

    '''
    Train models with specific set of features
    indices - indices of features
    '''
    def _calc_score(self, X_train, X_test, y_train, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train.ravel())
        y_pred = self.estimator.predict(X_test[:, indices])
        score = accuracy_score(y_test, y_pred)
        return score
# ===============================================
# Keras deep neural network
def dnn():
    model = keras.Sequential([
        layers.Dense(20, activation='relu', input_shape=(X_train.shape[1])),
        layers.Dropout(0.3),
        layers.Dense(20, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    early_stopping = keras.callbacks.EarlyStopping(
        patience=5,
        min_delta=0.001,
        restore_best_weights=True,
    )
    history = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        batch_size=512,
        callbacks=[early_stopping],
    )
    history_df = pd.DataFrame(history.history)
    print("Minimum Validation Loss: {:0.4f}".format(history_df['val_loss'].min()))
    history_df.loc[:, ['loss', 'val_loss']].plot(title="Cross-entropy")
    history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")
    return model

keras_clf = KerasClassifier(dnn,
                            epochs=5,
                            verbose=False)
keras_clf._estimator_type = "classifier"
And this is the code I have for trying to integrate them together:
k_features = 5
#
# Instantiate SequentialBackwardSearch
#
sbs = SequentialBackwardSearch(keras_clf, k_features)
#
# Fit the data to determine the k_features which give the
# most optimal model performance
#
sbs.fit(X_train, X_test, y_train, y_test)
#
# Transform the training data set to dataset having k_features
# giving most optimal model performance
#
X_train_kfeatures = sbs.transform(X_train)
#
# Transform the test data set to dataset having k_features
#
X_test_kfeatures = sbs.transform(X_test)
sbs.indices_
X_train.columns[[sbs.indices_]] # sbs is an instance of SequentialBackwardSearch class
I am wondering whether this is even possible (integrating a neural network into the existing code for sequential backward feature selection), or whether there's anything I can do to get it to run and output the top 5 features from the training dataset. I have tried to address the error message by altering the input shape of the neural network, but I believe it is correct (45 features). Any help or advice would be welcome!
This should work with SciKeras!
I had to clean up your code and fix some bugs. I first did a sanity check using scikit-learn's MLPClassifier, then I ran it against an MLPClassifier built with Keras. Details may differ for more complex model architectures, but this shows that it does work.
import numpy as np
# SBS (sequential backward feature selection) from scratch
#=====================================================
from sklearn.metrics import accuracy_score
from itertools import combinations
from sklearn.base import clone
class SequentialBackwardSearch:
    '''
    Instantiate with Estimator and given number of features
    '''
    def __init__(self, estimator, k_features):
        self.estimator = clone(estimator)
        self.k_features = k_features

    '''
    X_train - Training data Pandas dataframe
    X_test - Test data Pandas dataframe
    y_train - Training label Pandas dataframe
    y_test - Test data Pandas dataframe
    '''
    def fit(self, X_train, X_test, y_train, y_test):
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, X_test,
                                 y_train, y_test, self.indices_)
        self.scores_ = [score]
        '''
        Iterate through all the dimensions until k_features is reached
        At the end of loop, dimension count is reduced by 1
        '''
        while dim > self.k_features:
            scores = []
            subsets = []
            '''
            Iterate through different combinations of features, train the model,
            record the score
            '''
            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train, X_test, y_train, y_test, p)
                scores.append(score)
                subsets.append(p)
            #
            # Get the index of best score
            #
            best_score_index = np.argmax(scores)
            #
            # Record the best score
            #
            self.scores_.append(scores[best_score_index])
            #
            # Get the indices of features which gave best score
            #
            self.indices_ = subsets[best_score_index]
            #
            # Record the indices of features for best score
            #
            self.subsets_.append(self.indices_)
            dim -= 1 # Dimension is reduced by 1

    '''
    Transform training, test data set to the data set
    having features which gave best score
    '''
    def transform(self, X):
        return X.values[:, self.indices_]

    '''
    Train models with specific set of features
    indices - indices of features
    '''
    def _calc_score(self, X_train, X_test, y_train, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train.ravel())
        y_pred = self.estimator.predict(X_test[:, indices])
        score = accuracy_score(y_test, y_pred)
        return score
# Sklearn MLPClassifier
from sklearn.neural_network import MLPClassifier
estimator = MLPClassifier()
search = SequentialBackwardSearch(estimator, 1)
X = np.random.randint(0, 2, size=(100, 5))
y = X[:, -1]
search.fit(X, X, y, y)
assert list(search.indices_) == [4]
# SciKeras MLPClassifier
# see https://www.adriangb.com/scikeras/stable/notebooks/MLPClassifier_MLPRegressor.html
import tensorflow.keras as keras
from scikeras.wrappers import KerasClassifier
class KerasMLPClassifier(KerasClassifier):
    def __init__(
        self,
        hidden_layer_sizes=(100, ),
        optimizer="adam",
        optimizer__learning_rate=0.001,
        epochs=200,
        verbose=0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_layer_sizes = hidden_layer_sizes
        self.optimizer = optimizer
        self.epochs = epochs
        self.verbose = verbose

    def _keras_build_fn(self, compile_kwargs):
        model = keras.Sequential()
        inp = keras.layers.Input(shape=(self.n_features_in_))
        model.add(inp)
        for hidden_layer_size in self.hidden_layer_sizes:
            layer = keras.layers.Dense(hidden_layer_size, activation="relu")
            model.add(layer)
        if self.target_type_ == "binary":
            n_output_units = 1
            output_activation = "sigmoid"
            loss = "binary_crossentropy"
        elif self.target_type_ == "multiclass":
            n_output_units = self.n_classes_
            output_activation = "softmax"
            loss = "sparse_categorical_crossentropy"
        else:
            raise NotImplementedError(f"Unsupported task type: {self.target_type_}")
        out = keras.layers.Dense(n_output_units, activation=output_activation)
        model.add(out)
        model.compile(loss=loss, optimizer=compile_kwargs["optimizer"])
        return model

estimator2 = KerasMLPClassifier()
search2 = SequentialBackwardSearch(estimator2, 1)
search2.fit(X, X, y, y)
assert list(search2.indices_) == [4]
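The key detail that addresses the original (None, 45) vs (None, 44) error is that _keras_build_fn builds the input layer from self.n_features_in_, which SciKeras sets from the data when fit is called. Because _calc_score refits the wrapped estimator on every feature subset, the underlying Keras model is rebuilt with the correct input width at each elimination step, instead of keeping a hard-coded input shape of 45.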
Notebook version (can't promise this will be around forever): https://colab.research.google.com/drive/1EWxT3GWZsqhftz4f7W5GsXNe_SPtva4H#scrollTo=chU7wLn1BTU1
I have been working with binary sequential inputs and outputs in TensorFlow 2.0, and I've been wondering which approach TensorFlow uses to compute metrics such as recall or accuracy during training in those scenarios.
Each sample to my network consists of 60 timesteps, each with 300 features, so my expected output is a (60, 1) array of 1s and 0s. Suppose I have 2000 validation samples. When evaluating the validation set each epoch, does TensorFlow concatenate all 2000 samples into a single (2000*60=120000, 1) array and then compare that to the concatenated ground-truth labels, or does it evaluate each (60, 1) sample individually and then return the mean of those values? Is there any way to modify this behavior?
By default, TensorFlow/Keras computes the metrics batch-wise for the training data, while it computes the same metrics on ALL of the data passed via the validation_data parameter of fit.
This means that the training metric printed during fitting is the (running) mean of the score over the batches seen so far: Keras evaluates each batch individually and aggregates those values. For the validation data it is different: Keras takes all the validation samples and compares the predictions against the "concatenated" ground-truth labels in one pass. For example, if the per-batch accuracies over an epoch were 0.50, 0.60 and 0.70, the printed training accuracy would be roughly 0.60, even though the model with its end-of-epoch weights might score higher on a full pass over the training set.
To demonstrate this behaviour, here is a dummy example. I provide a custom callback that computes the accuracy score on ALL of the data passed to it at the end of each epoch (for the training set and, optionally, the validation set); this is useful for understanding what TensorFlow does during training.
import numpy as np
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import *
class ACC_custom(tf.keras.callbacks.Callback):
    def __init__(self, train, validation=None):
        super(ACC_custom, self).__init__()
        self.validation = validation
        self.train = train

    def on_epoch_end(self, epoch, logs={}):
        logs['ACC_score_train'] = float('-inf')
        X_train, y_train = self.train[0], self.train[1]
        y_pred = (self.model.predict(X_train).ravel() > 0.5) + 0
        score = accuracy_score(y_train.ravel(), y_pred)
        if (self.validation):
            logs['ACC_score_val'] = float('-inf')
            X_valid, y_valid = self.validation[0], self.validation[1]
            y_val_pred = (self.model.predict(X_valid).ravel() > 0.5) + 0
            val_score = accuracy_score(y_valid.ravel(), y_val_pred)
            logs['ACC_score_train'] = np.round(score, 5)
            logs['ACC_score_val'] = np.round(val_score, 5)
        else:
            logs['ACC_score_train'] = np.round(score, 5)
Create dummy data:
x_train = np.random.uniform(0,1, (1000,60,10))
y_train = np.random.randint(0,2, (1000,60,1))
x_val = np.random.uniform(0,1, (500,60,10))
y_val = np.random.randint(0,2, (500,60,1))
Fit the model:
inp = Input(shape=((60,10)), dtype='float32')
x = Dense(32, activation='relu')(inp)
out = Dense(1, activation='sigmoid')(x)
model = Model(inp, out)
es = EarlyStopping(patience=10, verbose=1, min_delta=0.001,
monitor='ACC_score_val', mode='max', restore_best_weights=True)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(x_train,y_train, epochs=10, verbose=2,
callbacks=[ACC_custom(train=(x_train,y_train),validation=(x_val,y_val)),es],
validation_data=(x_val,y_val))
In the graphs below I compare the accuracies computed by our callback with the accuracies computed by Keras:
import matplotlib.pyplot as plt

plt.plot(history.history['ACC_score_train'], label='accuracy_callback_train')
plt.plot(history.history['accuracy'], label='accuracy_default_train')
plt.legend(); plt.title('train accuracy')

plt.plot(history.history['ACC_score_val'], label='accuracy_callback_valid')
plt.plot(history.history['val_accuracy'], label='accuracy_default_valid')
plt.legend(); plt.title('validation accuracy')
As we can see, the accuracy on the training data (first plot) differs between the default method and our callback; this means the training accuracy is calculated batch-wise.
The validation accuracy (second plot) calculated by our callback and by the default method is the same! This means the score on the validation data is computed in a single pass.
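To answer the "is there any way to modify this behavior" part: the simplest option (a sketch, not from the original post) is to compute the training metric the same way the validation metric is computed, i.e. with one full pass over the training data at the end of each epoch, which is essentially what the callback above does:

# One full pass over the training data with the end-of-epoch weights,
# directly comparable to how val_loss / val_accuracy are computed.
train_loss, train_acc = model.evaluate(x_train, y_train, verbose=0)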
I'm hoping to get clarification on how the shuffle argument in tf.data.Dataset.list_files() works. The documentation states that when shuffle=True, the filenames will be shuffled randomly. I've made model predictions using a tfrecords dataset loaded with tf.data.Dataset.list_files(), and I would have expected the accuracy metric to be the same regardless of the order of the files (i.e. whether shuffle is True or False), but I am seeing otherwise.
Is this expected behavior, or is there something wrong with my code or interpretation? I have reproducible example code below.
Oddly, as long as tf.random.set_random_seed() is set initially (and it seems it doesn't even matter what seed value is used), the prediction results are the same whether shuffle is True or False in list_files().
tensorflow==1.13.1, keras==2.2.4
Thanks for any clarifications!
Edit: re-thinking it through, I'm wondering if Y = [y[0] for _ in range(steps) for y in sess.run(Y)] is a separate and independent call?
# Fit and Save a Dummy Model
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from sklearn import datasets, metrics
seed = 7
np.random.seed(seed)
tf.random.set_random_seed(seed)
dataset = datasets.load_iris()
X = dataset.data
Y = dataset.target
dummy_Y = np_utils.to_categorical(Y)
# 150 rows
print(len(X))
model = Sequential()
model.add(Dense(8, input_dim=4, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, dummy_Y, epochs=10, batch_size=10, verbose=2)
model.save('./iris/iris_model')
predictions = model.predict(X)
predictions = np.argmax(predictions, axis=1)
# returns accuracy = 0.3466666666666667
print(metrics.accuracy_score(y_true=Y, y_pred=predictions))
Split dataset into multiple tfrecords files so we can reload it with list_files() later:
numrows = 15
for i, j in enumerate(range(0, len(X), numrows)):
    with tf.python_io.TFRecordWriter('./iris/iris{}.tfrecord'.format(i)) as writer:
        for x, y in zip(X[j:j+numrows, ], Y[j:j+numrows, ]):
            features = tf.train.Features(feature={
                'X': tf.train.Feature(float_list=tf.train.FloatList(value=x)),
                'Y': tf.train.Feature(int64_list=tf.train.Int64List(value=[y]))
            })
            example = tf.train.Example(features=features)
            writer.write(example.SerializeToString())
At this point, I exit (ipython) and restart again:
import numpy as np
import tensorflow as tf
from keras.models import load_model
from sklearn import metrics
model = load_model('./iris/iris_model')
batch_size = 10
steps = int(150/batch_size)
file_pattern = './iris/iris*.tfrecord'
feature_description = {
    'X': tf.FixedLenFeature([4], tf.float32),
    'Y': tf.FixedLenFeature([1], tf.int64)
}
def _parse_function(example_proto):
    return tf.parse_single_example(example_proto, feature_description)

def load_data(filenames, batch_size):
    raw_dataset = tf.data.TFRecordDataset(filenames)
    dataset = raw_dataset.map(_parse_function)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(2)
    iterator = dataset.make_one_shot_iterator()
    record = iterator.get_next()
    return record['X'], record['Y']

def get_predictions_accuracy(filenames):
    X, Y = load_data(filenames=filenames, batch_size=batch_size)
    predictions = model.predict([X], steps=steps)
    predictions = np.argmax(predictions, axis=1)
    print(len(predictions))
    with tf.Session() as sess:
        Y = [y[0] for _ in range(steps) for y in sess.run(Y)]
    print(metrics.accuracy_score(y_true=Y, y_pred=predictions))
# No shuffle results:
# Returns expected accuracy = 0.3466666666666667
filenames_noshuffle = tf.data.Dataset.list_files(file_pattern=file_pattern, shuffle=False)
get_predictions_accuracy(filenames_noshuffle)
# Shuffle results, no seed value set:
# Returns UNEXPECTED accuracy (non-deterministic value)
filenames_shuffle_noseed = tf.data.Dataset.list_files(file_pattern=file_pattern, shuffle=True)
get_predictions_accuracy(filenames_shuffle_noseed)
# Shuffle results, seed value set:
# Returns expected accuracy = 0.3466666666666667
# It seems like it doesn't even matter what seed value you set, as long as you set it
seed = 1000
tf.random.set_random_seed(seed)
filenames_shuffle_seed = tf.data.Dataset.list_files(file_pattern=file_pattern, shuffle=True)
get_predictions_accuracy(filenames_shuffle_seed)
I'm currently undertaking my first 'real' DL project of (surprise) predicting stock movements. I know the odds of making anything useful are 1000:1, but I'm enjoying it and want to see it through; I've learnt more in my few weeks of attempting this than in the prior 6 months of completing MOOCs.
I'm building an LSTM in Keras to predict one step ahead, and I have attempted the task both as classification (up/down/steady) and now as regression. Both hit a similar roadblock: my validation loss never improves after epoch #1.
I can get the model to overfit such that the training loss approaches zero with MSE (or 100% accuracy if classification), but at no stage does the validation loss decrease. This screams overfitting to my untrained eye, so I added varying amounts of dropout, but all that does is stifle the model's learning/training accuracy with no improvement in validation accuracy.
I have tried changing many hyperparameters (learning rate, optimiser, batch size, lookback window, number of layers, number of units, dropout, number of samples, etc.), and I've also tried subsets of the data and of the features, but I just can't get it to work, so I'm very thankful for any help.
Code below (it's not pretty, I know):
# Imports (adding the numpy, scaler, and Keras pieces used further down)
import numpy as np
import feather
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import ReduceLROnPlateau

# Import saved full dataframe ~ 200 features
df = feather.read_dataframe('df_feathered')
df.set_index('time', inplace=True)
# Difference the dataset to make stationary
df = df.diff(periods=1, axis=0)
# MAKE LARGE SAMPLE FOR TESTING
df_train = df.loc['2017-3-1':'2017-6-30']
df_val = df.loc['2017-7-1':'2017-8-31']
df_test = df.loc['2017-9-1':'2017-9-30']
# Make x_train, x_val sets by dropping target variable
x_train = df_train.drop('close+1', axis=1)
x_val = df_val.drop('close+1', axis=1)
# Scale the training data first then fit the transform to the test set
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_val)
# scaler = MinMaxScaler(feature_range=(0,1))
# x_train = scaler.fit_transform(df_train1)
# x_test = scaler.transform(df_val1)
# Create y_train, y_test, simply target variable for regression
y_train = df_train['close+1']
y_test = df_val['close+1']
# Define Lookback window for LSTM input
sliding_window = 15
# Convert x_train, x_test, y_train, y_test into 3d arrays (samples, timesteps, features) for LSTM input
dataXtrain = []
for i in range(len(x_train)-sliding_window-1):
    a = x_train[i:(i+sliding_window), 0:(x_train.shape[1])]
    dataXtrain.append(a)

dataXtest = []
for i in range(len(x_test)-sliding_window-1):
    a = x_test[i:(i+sliding_window), 0:(x_test.shape[1])]
    dataXtest.append(a)

dataYtrain = []
for i in range(len(y_train)-sliding_window-1):
    dataYtrain.append(y_train[i + sliding_window])

dataYtest = []
for i in range(len(y_test)-sliding_window-1):
    dataYtest.append(y_test[i + sliding_window])
# Make data the divisible by a variety of batch_sizes for training
# Started at 1000 to not include replaced NaN values
dataXtrain = np.array(dataXtrain[1000:172008])
dataYtrain = np.array(dataYtrain[1000:172008])
dataXtest = np.array(dataXtest[1000:83944])
dataYtest = np.array(dataYtest[1000:83944])
# Checking input shapes
print('dataXtrain size is: {}'.format((dataXtrain).shape))
print('dataXtest size is: {}'.format((dataXtest).shape))
print('dataYtrain size is: {}'.format((dataYtrain).shape))
print('dataYtest size is: {}'.format((dataYtest).shape))
### ACTUAL LSTM MODEL
batch_size = 256
timesteps = dataXtrain.shape[1]
features = dataXtrain.shape[2]
# Model set-up, stacked 4 layer stateful LSTM
model = Sequential()
model.add(LSTM(512, return_sequences=True, stateful=True,
batch_input_shape=(batch_size, timesteps, features)))
model.add(LSTM(256,stateful=True, return_sequences=True))
model.add(LSTM(256,stateful=True, return_sequences=True))
model.add(LSTM(128,stateful=True))
model.add(Dense(1, activation='linear'))
model.summary()
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.9, patience=5, min_lr=0.000001, verbose=1)
def coeff_determination(y_true, y_pred):
    from keras import backend as K
    SS_res = K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - SS_res/(SS_tot + K.epsilon())
model.compile(loss='mse',
optimizer='nadam',
metrics=[coeff_determination,'mse','mae','mape'])
history = model.fit(dataXtrain, dataYtrain,validation_data=(dataXtest, dataYtest),
epochs=100,batch_size=batch_size, shuffle=False, verbose=1, callbacks=[reduce_lr])
score = model.evaluate(dataXtest, dataYtest,batch_size=batch_size, verbose=1)
print(score)
predictions = model.predict(dataXtest, batch_size=batch_size)
print(predictions)
import matplotlib.pyplot as plt
%matplotlib inline
#plt.plot(history.history['mean_squared_error'])
#plt.plot(history.history['val_mean_squared_error'])
plt.plot(history.history['coeff_determination'])
plt.plot(history.history['val_coeff_determination'])
#plt.plot(history.history['mean_absolute_error'])
#plt.plot(history.history['mean_absolute_percentage_error'])
#plt.plot(history.history['val_mean_absolute_percentage_error'])
#plt.title("MSE")
plt.ylabel("R2")
plt.xlabel("epoch")
plt.legend(["train", "val"], loc="best")
plt.show()
plt.plot(history.history["loss"][5:])
plt.plot(history.history["val_loss"][5:])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "val"], loc="best")
plt.show()
plt.figure(figsize=(20,8))
plt.plot(dataYtest)
plt.plot(predictions)
plt.title("Prediction")
plt.ylabel("Price")
plt.xlabel("Time")
plt.legend(["Truth", "Prediction"], loc="best")
plt.show()
Maybe you should remember that you are predicting stock returns, which very likely cannot be predicted at all, so a validation loss that never improves is not necessarily overfitting. Instead of adding more dropout, maybe you should think about adding more layers to increase the model's power.
Also try reducing the learning rate a lot (and remove the dropout for now).
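For example (a sketch, not from the original post), replace the string optimizer with an explicit one so you can set a much smaller learning rate than the Nadam default:

from keras.optimizers import Nadam

model.compile(loss='mse',
              # older Keras versions call this argument lr instead of learning_rate
              optimizer=Nadam(learning_rate=1e-4),
              metrics=[coeff_determination, 'mse', 'mae', 'mape'])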
And why do you use shuffle=False in the fit() call?