The following code is a very simple example of using word embedding to predict the labels (see below). The example is taken from here.
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
# define documents
docs = ['Well done!',
'Good work',
'Great effort',
'nice work',
'Excellent!',
'Weak',
'Poor effort!',
'not good',
'poor work',
'Could have done better.']
# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])
# integer encode the documents
vocab_size = 50
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))
Let us say we have structured data like this:
hours_of_revision = [10, 5, 7, 3, 100, 0, 1, 0.5, 4, 0.75]
Here every entry aligns with each row showing nicely that one should really spend more time to revise to achieve good marks (-:
Just wondering, could one incorporate this into the model to use the text and structured data?
Yes, this is possible with the Functional API from Keras. All you need is an additional input for the hours_of_revision that is concatenated with the embeddings from the text data before going to the final classifier.
Scale the additional data first:
# additional data
hours_of_revision = [10, 5, 7, 3, 100, 0, 1, 0.5, 4, 0.75]
import numpy as np
# Scale the data
mean = np.mean(hours_of_revision)
std = np.std(hours_of_revision)
hours_of_revision = (hours_of_revision - mean)/std
Build the model using the Functional API:
# Build model
from keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from keras.models import Model
# Two input layers
integer_input = Input((max_length, ))
revision_input = Input((1,))
# Embedding layer for the words
embedding = Embedding(vocab_size, 8, input_length=max_length)(integer_input)
embedding_flat = Flatten()(embedding)
# Concatenate embedding with revision
combined_data = Concatenate()([embedding_flat, revision_input])
output = Dense(1, activation='sigmoid')(combined_data)
# compile the model - pass a list of input tensors
model = Model(inputs=[integer_input, revision_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# fit the model - pass list of input data
model.fit([padded_docs, hours_of_revision], labels, epochs=50, verbose=0)
For more examples on how to use the Functional API for multi-input/multi-output models, have a look at the Keras docs.
Related
I am trying to learn a language model to predict the last word of a sentence given all the previous words using keras. I would like to embed my inputs using a learned fasttext embedding model.
I managed to preprocess my text data and embed the using fasttext. My training data is comprised of sentences of 40 tokens each. I created 2 np arrays, X and y as inputs, with y what I want to predict.
X is of shape (44317, 39, 300) with 44317 the number of example sentences, 39 the number of tokens in each sentence, and 300 the dimension of the word embedding.
y is of shape (44317, 300) is for each example the embedding of the last token of the sentence.
My code for the keras model goes as follow (inspired by this)
#importing all the needed tensorflow.keras components
model = Sequential()
model.add(InputLayer((None, 300)))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(300, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, batch_size=128, epochs=20)
model.save('model.h5')
However, the accuracy I get while training on this model is extremely low (around 1.5%). I think there is some component of the keras model that I misundertood, as if I don't embed my inputs and add an extra embedding layer instead of the InputLayer I get an accuracy of about 60 percents.
My main doubt is the value of "300" on my second Dense layer, as I read that this should correspond the vocabulary size of my word embedding model (which is 48000), however if I put anything else than 300 I get a dimension error. So I understand that I'm doing something wrong, but I can't find how to fix it.
PS :
I have also tried y = to_categorical(y, num_classes=vocab_size) with vocab_size the vocabulary size of my word embedding, and by changing 300 by this same value in the second Dense, however then it tries to create an array of shape(13295100, 48120) instead of what I expect : (44317, 48120).
If you really want to use the word vectors from Fasttext, you will have to incorporate them into your model using a weight matrix and Embedding layer. The goal of the embedding layer is to map each integer sequence representing a sentence to its corresponding 300-dimensional vector representation:
import gensim.downloader as api
import numpy as np
import tensorflow as tf
def load_doc(filename):
file = open(filename, 'r')
text = file.read()
file.close()
return text
fasttext = api.load("fasttext-wiki-news-subwords-300")
embedding_dim = 300
in_filename = 'data.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(lines)
text_sequences = tokenizer.texts_to_sequences(lines)
text_sequences = tf.keras.preprocessing.sequence.pad_sequences(text_sequences, padding='post')
vocab_size = len(tokenizer.word_index) + 1
text_sequences = np.array(text_sequences)
X, y = text_sequences[:, :-1], text_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)
max_length = X.shape[1]
weight_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
try:
embedding_vector = fasttext[word]
weight_matrix[i] = embedding_vector
except KeyError:
weight_matrix[i] = np.random.uniform(-5, 5, embedding_dim)
sentence_input = tf.keras.layers.Input(shape=(max_length,))
x = tf.keras.layers.Embedding(vocab_size, embedding_dim, weights=[weight_matrix],
input_length=max_length)(sentence_input)
x = tf.keras.layers.LSTM(100, return_sequences=True)(x)
x = tf.keras.layers.LSTM(100)(x)
x = tf.keras.layers.Dense(100, activation='relu')(x)
output = tf.keras.layers.Dense(vocab_size, activation='softmax')(x)
model = tf.keras.Model(sentence_input, output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, batch_size=5, epochs=20)
Note that I am using the dataset and preprocessing steps from the tutorial you linked.
It's very difficult to train RNN models in next sentence prediction task. LSTM/GRU do not have enough resources to extract enough features from text.
There are 2 ways to solve issue:
predict chars instead of word class
use transformer model. For example, Bert is good for features extracting and predicting masked word
I know there are several questions about this here, but I haven't found one which fits exactly my problem.
I'm trying to fit an LSTM with data from Pandas DataFrames but getting confused about the format I have to provide them.
I created a small code snipped which shall show you what I try to do:
import pandas as pd, tensorflow as tf, random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
targets = pd.DataFrame(index=pd.date_range(start='2019-01-01', periods=300, freq='D'))
targets['A'] = [random.random() for _ in range(len(targets))]
targets['B'] = [random.random() for _ in range(len(targets))]
features = pd.DataFrame(index=targets.index)
for i in range(len(features)) :
features[str(i)] = [random.random() for _ in range(len(features))]
model = Sequential()
model.add(LSTM(units=targets.shape[1], input_shape=features.shape))
model.compile(optimizer='adam', loss='mae')
model.fit(features, targets, batch_size=10, epochs=10)
this results to:
ValueError: Input 0 of layer sequential is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [10, 300]
which I expect relates to the dimensions of the features DataFrame provided. I guess that once fixed this the next error would mention the targets DataFrame.
As far as I understand, 'units' parameter of my first layer defines the output dimensionality of this model. The inputs have to have a 3D shape, but I don't know how to create them out of the 2D world of the Data Frames.
I hope you can help me understanding the reshape mechanism in Python and how to use them in combination with Pandas DataFrames. (I'm quite new to Python and came from R)
Thankls in advance
Lets looks at the few popular ways in LSTMs are used.
Many to Many
Example: You have a sentence (composed of words in sequence). Give these sequence of words you would like to predict the Parts of speech (POS) of each word.
So you have n words and you feed each word per timestep to the LSTM. Each LSTM timestep (also called LSTM unwrapping) will produce and output. The word is represented by a a set of features normally word embeddings. So the input to LSTM is of size bath_size X time_steps X features
Keras code:
inputs = keras.Input(shape=(10,3))
lstm = keras.layers.LSTM(8, input_shape = (10, 3), return_sequences = True)(inputs)
outputs = keras.layers.TimeDistributed(keras.layers.Dense(5, activation='softmax'))(lstm)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')
X = np.random.randn(4,10,3)
y = np.random.randint(0,2, size=(4,10,5))
model.fit(X, y, epochs=2)
print (model.predict(X).shape)
Many to One
Example: Again you have a sentence (composed of words in sequence). Give these sequence of words you would like to predict sentiment of the sentence if it is positive or negative.
Keras code
inputs = keras.Input(shape=(10,3))
lstm = keras.layers.LSTM(8, input_shape = (10, 3), return_sequences = False)(inputs)
outputs =keras.layers.Dense(5, activation='softmax')(lstm)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')
X = np.random.randn(4,10,3)
y = np.random.randint(0,2, size=(4,5))
model.fit(X, y, epochs=2)
print (model.predict(X).shape)
Many to multi-headed
Example: You have a sentence (composed of words in sequence). Give these sequence of words you would like to predict sentiment of the sentence as well the author of the sentence.
This is multi-headed model where one head will predict the sentiment and another head will predict the author. Both the heads share the same LSTM backbone.
Keras code
inputs = keras.Input(shape=(10,3))
lstm = keras.layers.LSTM(8, input_shape = (10, 3), return_sequences = False)(inputs)
output_A = keras.layers.Dense(5, activation='softmax')(lstm)
output_B = keras.layers.Dense(5, activation='softmax')(lstm)
model = keras.Model(inputs=inputs, outputs=[output_A, output_B])
model.compile(loss='categorical_crossentropy', optimizer='adam')
X = np.random.randn(4,10,3)
y_A = np.random.randint(0,2, size=(4,5))
y_B = np.random.randint(0,2, size=(4,5))
model.fit(X, [y_A, y_B], epochs=2)
y_hat_A, y_hat_B = model.predict(X)
print (y_hat_A.shape, y_hat_B.shape)
What you are looking for is Many to Multi head model where your predictions for A will be made by one head and another head will make predictions for B
The input data for the LSTM has to be 3D.
If you print the shapes of your DataFrames you get:
targets : (300, 2)
features : (300, 300)
The input data has to be reshaped into (samples, time steps, features). This means that targets and features must have the same shape.
You need to set a number of time steps for your problem, in other words, how many samples will be used to make a prediction.
For example, if you have 300 days and 2 features the time step can be 3. So that three days will be used to make one prediction (you can choose this arbitrarily). Here is the code for reshaping your data (with a few more changes):
import pandas as pd
import numpy as np
import tensorflow as tf
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
data = pd.DataFrame(index=pd.date_range(start='2019-01-01', periods=300, freq='D'))
data['A'] = [random.random() for _ in range(len(data))]
data['B'] = [random.random() for _ in range(len(data))]
# Choose the time_step size.
time_steps = 3
# Use numpy for the 3D array as it is easier to handle.
data = np.array(data)
def make_x_y(ts, data):
"""
Parameters
ts : int
data : numpy array
This function creates two arrays, x and y.
x is the input data and y is the target data.
"""
x, y = [], []
offset = 0
for i in data:
if offset < len(data)-ts:
x.append(data[offset:ts+offset])
y.append(data[ts+offset])
offset += 1
return np.array(x), np.array(y)
x, y = make_x_y(time_steps, data)
print(x.shape, y.shape)
nodes = 100 # This is the width of the network.
out_size = 2 # Number of outputs produced by the network. Same size as features.
model = Sequential()
model.add(LSTM(units=nodes, input_shape=(x.shape[1], x.shape[2])))
model.add(Dense(out_size)) # For the output a Dense (fully connected) layer is used.
model.compile(optimizer='adam', loss='mae')
model.fit(x, y, batch_size=10, epochs=10)
Well, just to finalize this issue I would like to provide one solution I have meanwhile worked on. The class TimeseriesGenerator in tf.keras.... enabled me quite easy to provide the data in the right shape to an LSTM model
from keras.preprocessing.sequence import TimeseriesGenerator
import numpy as np
window_size = 7
batch_size = 8
sampling_rate = 1
train_gen = TimeseriesGenerator(X_train.values, y_train.values,
length=window_size, sampling_rate=sampling_rate,
batch_size=batch_size)
valid_gen = TimeseriesGenerator(X_valid.values, y_valid.values,
length=window_size, sampling_rate=sampling_rate,
batch_size=batch_size)
test_gen = TimeseriesGenerator(X_test.values, y_test.values,
length=window_size, sampling_rate=sampling_rate,
batch_size=batch_size)
There are many other ways on implementing generators e.g. using the more_itertools which provides the function windowed, or making use of tensorflow.Dataset and its function window.
For me the TimeseriesGenerator was sufficient to feed the tests I did.
In case you would like to see an example modeling the DAX based on some stocks I'm sharing a notebook on Github.
I am currently using the skopt (scikit-optimize) package for hyperparameter tuning of a neural network (I am trying to minimize -1* accuracy). It seems to run fine (and successfully prints to the console) for several iterations before it raises Value Error: array must not contain infs or NaNs.
What are some possible causes of this? My data does not contain infs or NaNs and neither do my search parameter ranges. The neural network code is quite long, so for brevity, I will paste the relevant sections:
Imports:
import pandas as pd
import numpy as np
from skopt import gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Categorical, Integer
from tensorflow.python.framework import ops
from sklearn.model_selection import train_test_split
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Dropout, MaxPooling1D, Flatten
from keras import backend as K
Creation of search parameters:
dim_num_filters_L1 = Integer(low=1, high=50, name='num_filters_L1')
#dim_kernel_size_L1 = Integer(low=1, high=70, name='kernel_size_L1')
dim_activation_L1 = Categorical(categories=['relu', 'linear', 'softmax'], name='activation_L1')
dim_num_filters_L2 = Integer(low=1, high=50, name='num_filters_L2')
#dim_kernel_size_L2 = Integer(low=1, high=70, name='kernel_size_L2')
dim_activation_L2 = Categorical(categories=['relu', 'linear', 'softmax'], name='activation_L2')
dim_num_dense_nodes = Integer(low=1, high=28, name='num_dense_nodes')
dim_activation_L3 = Categorical(categories=['relu', 'linear', 'softmax'], name='activation_L3')
dim_dropout_rate = Real(low = 0, high = 0.5, name = 'dropout_rate')
dim_learning_rate = Real(low=1e-4, high=1e-2, name='learning_rate')
dimensions = [dim_num_filters_L1,
#dim_kernel_size_L1,
dim_activation_L1,
dim_num_filters_L2,
#dim_kernel_size_L2,
dim_activation_L2,
dim_num_dense_nodes,
dim_activation_L3,
dim_dropout_rate,
dim_learning_rate,
]
Function that creates all models that will be tested:
def create_model(num_filters_L1, #kernel_size_L1,
activation_L1,
num_filters_L2, #kernel_size_L2,
activation_L2,
num_dense_nodes, activation_L3,
dropout_rate,
learning_rate):
input_shape = (X_train.shape[1], 1)
model = Sequential()
model.add(Conv1D(num_filters_L1, kernel_size = 40, activation = activation_L1, input_shape = input_shape))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(num_filters_L2, kernel_size=20, activation=activation_L2))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(num_dense_nodes, activation = activation_L3))
model.add(Dropout(dropout_rate))
model.add(Dense(y_train.shape[1], activation='linear'))
adam = tensorflow.keras.optimizers.Adam(learning_rate = learning_rate)
model.compile(optimizer=adam, loss='mean_squared_error', metrics=['accuracy'])
return model
Define fitness function:
#use_named_args(dimensions=dimensions)
def fitness(num_filters_L1, #kernel_size_L1,
activation_L1,
num_filters_L2, #kernel_size_L2,
activation_L2,
num_dense_nodes, activation_L3,
dropout_rate,
learning_rate):
model = create_model(num_filters_L1, #kernel_size_L1,
activation_L1,
num_filters_L2, #kernel_size_L2,
activation_L2,
num_dense_nodes, activation_L3,
dropout_rate,
learning_rate)
history_opt = model.fit(x=X_train,
y=y_train,
validation_data=(X_val,y_val),
shuffle=True,
verbose=2,
epochs=10
)
#return the validation accuracy for the last epoch.
accuracy_opt = model.evaluate(X_test,y_test)[1]
# Print the classification accuracy:
print("Experimental Model Accuracy: {0:.2%}".format(accuracy_opt))
# Delete the Keras model with these hyper-parameters from memory:
del model
# Clear the Keras session, otherwise it will keep adding new models to the same TensorFlow graph each time we create model with a different set of hyper-parameters.
K.clear_session()
ops.reset_default_graph()
# the optimizer aims for the lowest score, so return negative accuracy:
return -accuracy # or sum(RMSE)?
Run hyperparameter search:
gp_result = gp_minimize(func=fitness,
dimensions=dimensions)
print("best accuracy was " + str(round(gp_result.fun *-100,2))+"%.")
Your activation function is not converging in a random acquisition function call. I encountered this problem and removed 'relu' function from search space.
Currently I can train a LSTM network using one csv file based on this tutorial: https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/
This code generate sliding windows where the last n_steps of the features are saved to predict the actual target (similar to this: Keras LSTM - feed sequence data with Tensorflow dataset API from the generator):
#%% Import
import pandas as pd
import tensorflow as tf
from tensorflow.python.keras.models import Sequential, model_from_json
from tensorflow.python.keras.layers import LSTM
from tensorflow.python.keras.layers import Dense
# for path
import pathlib
import os
#%% Define functions
# Function to split multivariate input data into samples according to the number of timesteps (n_steps) used for the prediction ("sliding window")
def split_sequences(sequences, n_steps):
X, y = list(), list()
for i in range(len(sequences)):
# find end of this pattern
end_ix = i + n_steps
# check if beyond maximum index of input data
if end_ix > len(sequences):
break
# gather input and output parts of the data in corresponding format (depending on n_steps)
seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
X.append(seq_x)
y.append(seq_y)
#Append: Adds its argument as a single element to the end of a list. The length of the list increases by one.
return array(X), array(y)
# Set source files
csv_train_path = os.path.join(dir_of_file, 'SimulationData', 'SimulationTrainData', 'SimulationTrainData001.csv')
# Load data
df_train = pd.read_csv(csv_train_path, header=0, parse_dates=[0], index_col=0)
#%% Select features and target
features_targets_considered = ['Fz1', 'Fz2', 'Fz3', 'Fz4', 'Fz5', 'Fz_res']
n_features = len(features_targets_considered)-1 # substract the target
features_targets_train = df_train[features_targets_considered]
# "Convert" to array
train_values = features_targets_train.values
# Set number of previous timesteps, which are considered to predict
n_steps = 100
# Convert into input (400x5) and output (1) values
X, y = split_sequences(train_values, n_steps)
X_test, y_test = split_sequences(test_values, n_steps)
#%% Define model
model = Sequential()
model.add(LSTM(200, activation='relu', return_sequences=True, input_shape=(n_steps, n_features)))
model.add(LSTM(200, activation='relu', return_sequences=True))
model.add(LSTM(200, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
#%% Fit model
history = model.fit(X, y, epochs=200, verbose=1)
I now want to expand this example to efficiently train the network with different csv files. In the data folder I have the files 'SimulationTrainData001.csv', 'SimulationTrainData002.csv', ..., 'SimulationTrainData300.csv' (about 14 GB).
To achieve this, I tried to adopt the code of this input pipeline example: https://www.tensorflow.org/guide/data#consuming_sets_of_files, which works to a certain extend. I can show the training files in the folder with this change:
# Set source folders
csv_train_path = os.path.join(dir_of_file, 'SimulationData', 'SimulationTrainData')
csv_train_path = pathlib.Path(csv_train_path)
#%% Show five example files from training folder
list_ds = tf.data.Dataset.list_files(str(csv_train_path/'*'))
for f in list_ds.take(5):
print(f.numpy())
One problem is, that in the example the files are pictures of flowers and not time series values and I do not know at which point I can use the split_sequences(sequences, n_steps) function to create the sliding windows to provide the necessary data format to train the LSTM network.
Also, as far as I know, it would be better for the training process, if the generated windows of the different files would be shuffled. I could use the split_sequences(sequences, n_steps) function on every csv file (to generate X_test , y_test) and join the result in one big variable or file and shuffle the windows, but I do not think this is an efficient way and it also had to be redone if n_steps will be changed.
If somebody could suggest a (established) method or example to preprocess my data, I would be very thankful.
You can use the TimeSeriesGenerator after consuming those sets of files.
Here is the reference link.
As per the documentation:
'''
This class takes in a sequence of data-points gathered at equal intervals, along with time-series parameters such as stride, length of history, etc., to produce batches for training/validation.
'''
Provided examples for both univariate & multiple variate scenario
Univariate Example:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM
import numpy as np
import tensorflow as tf
# define dataset
series = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
# reshape to [10, 1]
n_features = 1
series = series.reshape((len(series), n_features))
# define generator
n_input = 2
generator = TimeseriesGenerator(series, series, length=n_input, batch_size=8)
# create model
model = Sequential()
model.add(LSTM(100, activation='relu', input_shape=(n_input, n_features)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
# fit model
model.fit_generator(generator, steps_per_epoch=1, epochs=500, verbose=1)
#sample prediction
inputs = np.array([9, 10]).reshape((1, n_input, n_features))
result = model.predict(inputs, verbose=0)
print(result)
Multi-variate Example
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM
import numpy as np
import tensorflow as tf
# define dataset
in_seq1 = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
in_seq2 = np.array([15, 25, 35, 45, 55, 65, 75, 85, 95, 105])
# reshape series
in_seq1 = in_seq1.reshape((len(in_seq1), 1))
in_seq2 = in_seq2.reshape((len(in_seq2), 1))
# horizontally stack columns
dataset = np.hstack((in_seq1, in_seq2))
# define generator
n_features = dataset.shape[1]
n_input = 2
generator = TimeseriesGenerator(dataset, dataset, length=n_input, batch_size=8)
# define model
model = Sequential()
model.add(LSTM(100, activation='relu', input_shape=(n_input, n_features)))
model.add(Dense(2))
model.compile(optimizer='adam', loss='mse')
# fit model
model.fit_generator(generator, steps_per_epoch=1, epochs=500, verbose=1)
# make a one step prediction out of sample
inputs = np.array([[90, 95], [100, 105]]).reshape((1, n_input, n_features))
result = model.predict(inputs, verbose=1)
print(result)
Note: All of these were simulated using Google Colaboratory
I am currently trying to build a model to classify whether or not the outcome of a given football match will be above or below 2.5 goals, based on the Home team, Away team & game league, using a tf.keras.Sequential model in TensorFlow 2.0RC.
The problem I am encountering is that my softmax results converge on [0.5,0.5] when using the model.predict method. What makes this odd is that my validation & test accuracy and losses are about 0.94 & 0.12 respectively after 1000 epochs of training, otherwise I would have put this down to an overfitting problem. I am aware that 1000 epochs is extremely likely to overfit, however, I want to understand why my accuracy increases until about 800 epochs in. My loss flattens at about 300 epochs.
I have tried to alter the number of layers, number of units in each layer, the activation functions, optimizers and loss functions, number of epochs and learning rates, but can only seem to increase the losses.
The results still seem to converge toward [0.5,0.5] regardless.
The full code can be viewed at https://github.com/AhmUgEk/tensorflow_football_predictions, but below is an extract showing model composition.
# Create Keras Sequential model:
model = keras.Sequential()
model.add(feature_layer) # Input processing layer.
model.add(Dense(units=32, activation='relu')) # Hidden Layer 1.
model.add(Dropout(rate=0.4))
model.add(BatchNormalization())
model.add(Dense(units=32, activation='relu')) # Hidden Layer 2.
model.add(Dropout(rate=0.4))
model.add(BatchNormalization())
model.add(Dense(units=2, activation='softmax')) # Output layer.
# Compile the model:
model.compile(
optimizer=keras.optimizers.Adam(learning_rate=0.0001),
loss=keras.losses.MeanSquaredLogarithmicError(),
metrics=['accuracy']
)
# Compile the model:
model.compile(
optimizer=keras.optimizers.Adam(learning_rate=0.0001),
loss=keras.losses.MeanSquaredLogarithmicError(),
metrics=['accuracy']
)
# Fit the model to the training dataset and validate against the
validation dataset between epochs:
model.fit(
train_dataset,
validation_data=val_dataset,
epochs=1000,
callbacks=[tensorboard_callback]
)
I would expect to receive a result of [0.282, 0.718] for example for an input of:
model.predict_classes([np.array(['E0'], dtype='object'),
np.array(['Liverpool'], dtype='object'),
np.array(['Newcastle'], dtype='object')])[0]
but as per the above, receive a result of say [0.5, 0.5].
Am I missing something obvious here?
I had made some minor changes in the model. Now, I am not getting exactly [0.5, 0.5].
Result:
[[0.61482537 0.3851746 ]
[0.5121426 0.48785746]
[0.48058605 0.51941395]
[0.48913187 0.51086813]
[0.45480043 0.5451996 ]
[0.48933673 0.5106633 ]
[0.43431875 0.5656812 ]
[0.55314165 0.4468583 ]
[0.5365097 0.4634903 ]
[0.54371756 0.45628244]]
Implementation:
import datetime
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from gpu_limiter import limit_gpu
from pipe_functions import csv_to_df, dataframe_to_dataset
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import BatchNormalization, Dense, DenseFeatures, Dropout, Input
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
import tensorflow.keras.backend as K
from tensorflow.data import Dataset
# Test GPU availability and instantiate memory growth limitation if True:
if tf.test.is_gpu_available():
print('GPU Available\n')
limit_gpu()
else:
print('Running on CPU')
df = csv_to_df("./csv_files")
# Format & organise imported data, making the "Date" column the new index:
df['Date'] = pd.to_datetime(df['Date'])
df = df[['Date', 'Div', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']].dropna().set_index('Date').sort_index()
df['Over_2.5'] = (df['FTHG'] + df['FTAG'] > 2.5).astype(int)
df = df.drop(['FTHG', 'FTAG'], axis=1)
# Split data into training, validation and testing data:
# Note: random_state variable set to ensure reproducibility.
train, test = train_test_split(df, test_size=0.05, random_state=42)
train, val = train_test_split(train, test_size=0.05, random_state=42)
# print(df['Over_2.5'].value_counts()) # Check that data is balanced.
# Create datasets from train, val & test dataframes:
target_col = 'Over_2.5'
batch_size = 32
def df_to_dataset(features: np.ndarray, labels: np.ndarray, shuffle=True, batch_size=8) -> Dataset:
ds = Dataset.from_tensor_slices(({"feature": features}, {"target": labels}))
if shuffle:
ds = ds.shuffle(buffer_size=len(features))
ds = ds.batch(batch_size)
return ds
def get_feature_transform() -> DenseFeatures:
# Format features into feature columns to ensure data is in the correct format for feeding into the model:
feature_cols = []
for column in filter(lambda x: x != target_col, df.columns):
feature_cols.append(tf.feature_column.embedding_column(tf.feature_column.categorical_column_with_vocabulary_list(
key=column, vocabulary_list=df[column].unique()), dimension=5))
return DenseFeatures(feature_cols)
# Transforms all features into dense tensors.
feature_transform = get_feature_transform()
train_features = feature_transform(dict(train)).numpy()
val_features = feature_transform(dict(val)).numpy()
test_features = feature_transform(dict(test)).numpy()
train_dataset = df_to_dataset(train_features, train[target_col].values, shuffle=True, batch_size=batch_size)
val_dataset = df_to_dataset(val_features, val[target_col].values, shuffle=True, batch_size=batch_size) # Shuffle not required to validation data.
test_dataset = df_to_dataset(test_features, test[target_col].values, shuffle=True, batch_size=batch_size) # Shuffle not required to test data.
# Create Keras Functional API:
# Create a feature layer from the feature columns, to be placed at the input layer of the model:
def build_model(input_shape: tuple) -> keras.Model:
input_layer = keras.Input(shape=input_shape, name='feature')
model = Dense(units=1028, activation='relu', kernel_initializer='normal', name='dense0')(input_layer) # Hidden Layer 1.
model = BatchNormalization(name='bc0')(model)
model = Dense(units=1028, activation='relu', kernel_initializer='normal', name='dense1')(model) # Hidden Layer 2.
model = Dropout(rate=0.1)(model)
model = BatchNormalization(name='bc1')(model)
model = Dense(units=100, activation='relu', kernel_initializer='normal', name='dense2')(model) # Hidden Layer 3.
model = Dropout(rate=0.25)(model)
model = BatchNormalization(name='bc2')(model)
model = Dense(units=50, activation='relu', kernel_initializer='normal', name='dense3')(model) # Hidden Layer 4.
model = Dropout(rate=0.4)(model)
model = BatchNormalization(name='bc3')(model)
output_layer = Dense(units=2, activation='softmax', kernel_initializer='normal', name='target')(model) # Output layer.
model = keras.Model(inputs=input_layer, outputs=output_layer, name='better-than-chance')
# Compile the model:
model.compile(
optimizer=keras.optimizers.Adam(learning_rate=0.001),
loss='mse',
metrics=['accuracy']
)
return model
# # Create a TensorBoard log file (time appended) directory for every run of the model:
# directory = ".\\logs\\" + str(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# os.mkdir(directory)
# # Create a TensorBoard callback to log a record of model performance for every 1 epoch:
# tensorboard_callback = TensorBoard(log_dir=directory, histogram_freq=1, write_graph=True, write_images=True)
# Run "tensorboard --logdir .\logs" in anaconda prompt to review & compare logged results.
# Note: Make sure that the correct environment is activated before running.
model = build_model((train_features.shape[1],))
model.summary()
# checkpoint = ModelCheckpoint('model-{epoch:03d}.h5', verbose=1, monitor='val_loss',save_best_only=True, mode='auto')
# Fit the model to the training dataset and validate against the validation dataset between epochs:
model.fit(
train_dataset,
validation_data=val_dataset,
epochs=10)
# callbacks=[checkpoint]
# Saves and reloads model.
# model.save("./model.h5")
# model_from_saved = keras.models.load_model("./model.h5")
# Evaluate model accuracy against test dataset:
# scores, accuracy = model.evaluate(train_dataset)
# print('Accuracy:', accuracy)
##############
## OPTIONAL ##
##############
# DUBUGGING
# inp = model.input # input placeholder
# outputs = [layer.output for layer in model.layers] # all layer outputs
# functors = [K.function([inp], [out]) for out in outputs] # evaluation functions
# # Testing
# layer_outs = [func([test_features]) for func in functors]
# print(layer_outs)
# # # Form a prediction based on inputs:
prediction = model.predict({"feature": test_features[:10]})
print(prediction)
One thing you can do is to try some ensemble Learning methods like
RandomForest
and
XGBoost
and compare the results.
You should try is to add other Key Performance Indicators(KPI)s in
your data and then try to fit the model.