TF 2.0 MLP accuracy always zero - python

I've written a minimal example of a simple neural network that fits a given function (a multilayer perceptron for regression).
During the training process the loss decreases as expected and the model works fine. However, the accuracy remains constant and equal to 0.0 at all times, and I don't understand why. What am I missing here?
I guess there is some technical detail that prevents the accuracy from updating?
The training process and the resulting model can be seen in this link
Thank you very much for any help you can provide! ;)
PS- Here is a minimal example to reproduce this result:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
# Create TRAINING data
noise = 0.1
N=500
Xt = np.random.uniform(-np.pi, np.pi, size=(N,))
Yt = np.sin(Xt) + noise * np.random.uniform(-1,1,size=Xt.shape)
# Create VALIDATION data
Nv = int(0.1*N)
Xv = np.random.uniform(-np.pi, np.pi, size=(Nv,))
Yv = np.sin(Xv) + noise * np.random.uniform(-1,1,size=Xv.shape)
# Create model
model = Sequential()
model.add( Dense(10, activation='tanh',input_shape=(1,)) )
model.add( Dense(5, activation='tanh') )
model.add( Dense(1, activation=None) )
model.compile(optimizer='adam',
loss='mse',
metrics=['accuracy'])
# Fit & evaluate
history = model.fit(Xt, Yt, validation_data=(Xv,Yv),
epochs=100,
verbose=2)
results = model.evaluate(Xv, Yv,verbose=0)
print('\n\nEvaluating model, loss/acc:', results)
## PLOTS
fig = plt.figure()
gs = gridspec.GridSpec(2, 2)
ax1 = plt.subplot(gs[0,0]) # losses
ax2 = plt.subplot(gs[1,0], sharex=ax1) # accuracies
ax3 = plt.subplot(gs[:,1]) # data & model
# Plot learning curve
err = history.history['loss']
val_err = history.history['val_loss']
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
ax1.plot(err,label='loss')
ax1.plot(val_err,label='val_loss')
ax2.plot(acc,label='accuracy')
ax2.plot(val_acc,label='val_accuracy')
ax1.set_ylim(bottom=0)
ax2.set_ylim(bottom=-0.01)
ax1.legend()
ax2.legend()
# Plot test
# Generate "continous" data for pretty test
x = np.linspace(np.min(Xt),np.max(Xt),1000)
y = model.predict(x)
ax3.scatter(Xt, Yt, label='Training')
ax3.scatter(Xv, Yv, c='C2', label='Validation')
ax3.plot(x, y, 'C3-', lw=4, label='Model')
ax3.legend()
fig.tight_layout()
plt.show()

As Swier pointed out in the comments, accuracy is meant for classification.
Nevertheless, I thought that some points should yield the exact target value, which is why I was expecting acc > 0.
Anyway, I mapped the problem to an integer-only problem, and in that scenario the accuracy is different from zero. Obviously not a useful metric, but at least it makes (mathematical) sense.
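For completeness, here is a minimal sketch (not in my original post) of a regression-friendly alternative: report MAE, and/or a tolerance-based "accuracy" that counts predictions within some threshold of the target. The 0.05 threshold below is an arbitrary choice.
import tensorflow as tf

def close_enough(y_true, y_pred):
    # Fraction of predictions within 0.05 of the target value
    return tf.reduce_mean(tf.cast(tf.abs(y_true - y_pred) < 0.05, tf.float32))

model.compile(optimizer='adam',
              loss='mse',
              metrics=['mae', close_enough])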
Thanks!!

Related

Time series prediction with LSTM: How to update my code to predict beyond the dataset

I have created an LSTM predictor model that works very well on the train (8 years) and test (2 years) sets, but I now need to predict beyond the dates in the entire dataset.
I would like the predictions to go beyond the data in the dataset (after the 10 years).
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
from rnn import RNN
import numpy as np
from torch import nn
import torch
from torch.autograd import Variable
ss_X_dep = StandardScaler()
ss_y_dep = StandardScaler()
def rmse(y1, y2):
    return np.sqrt(mean_squared_error(y1, y2))
data = pd.read_csv('data/W042 RAIS/W042-Pz-4-Rais-SEAAl62-70.csv')
Inputs = data.drop('Year', axis=1).drop('Depth', axis=1)
Outputs = data['Depth']
Inputs = Inputs.to_numpy()
Outputs = Outputs.to_numpy().reshape(-1, 1)
# First 08 years of data
X_train_dep = Inputs[:62]
y_train_dep = Outputs[:62]
# Last 02 years of data
X_test_dep = Inputs[62:]
print("X_train_dep shape", X_train_dep.shape)
print("y_train_dep shape", y_train_dep.shape)
print("X_test_dep shape", X_test_dep.shape)
X = np.concatenate([X_train_dep, X_test_dep], axis=0)
# Standardization (Normalisation)
X = ss_X_dep.fit_transform(X)
# First 08 years of data
X_train_dep_std = X[:62]
y_train_dep_std = ss_y_dep.fit_transform(y_train_dep)
# All 10 years of data
X_test_dep_std = X
X_train_dep_std = np.expand_dims(X_train_dep_std, axis=0)
y_train_dep_std = np.expand_dims(y_train_dep_std, axis=0)
X_test_dep_std = np.expand_dims(X_test_dep_std, axis=0)
# Transfer to Pytorch Variable
X_train_dep_std = Variable(torch.from_numpy(X_train_dep_std).float())
y_train_dep_std = Variable(torch.from_numpy(y_train_dep_std).float())
X_test_dep_std = Variable(torch.from_numpy(X_test_dep_std).float())
# Define rnn model
model = RNN(input_size=5, hidden_size=40, num_layers=1, class_size=1, dropout=0.5, rnn_type='lstm')
# Define optimization function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # optimize all RNN parameters
# Define loss function
loss_func = nn.MSELoss()
# Start training
for iter in range(2000 + 1):  # 2001 iterations
    model.train()
    prediction = model(X_train_dep_std)
    # X_train_dep_std.append(prediction)
    # prediction = model(X_train_dep_std)
    loss = loss_func(prediction, y_train_dep_std)
    optimizer.zero_grad()  # clear gradients for this training step
    loss.backward()        # back propagation, compute gradients
    optimizer.step()
    if iter % 100 == 0:
        print("iteration: %s, loss: %s" % (iter, loss.item()))
# Save model
save_filename = 'checkpoints/LSTM_DOUBLE_FC.pth'
torch.save(model, save_filename)
print('Saved as %s' % save_filename)
# Start evaluating model
model.eval()
y_pred_dep_ = model(X_test_dep_std).detach().numpy()
y_pred_dep = ss_y_dep.inverse_transform(y_pred_dep_[0, 0:])
print('The value of Root mean squared error (RMSE) of water table depth is :', rmse(Outputs[0:], y_pred_dep))
print('The value of mean squared error (MSE) of water table depth is :', mean_squared_error(Outputs[0:], y_pred_dep))
print('The value of R-squared (R2) of water table depth is :', r2_score(Outputs[0:], y_pred_dep))
f, ax1 = plt.subplots(1, 1, sharex=True, figsize=(15, 7))
ax1.plot(Outputs[0:], color="blue", linestyle="-", linewidth=2.5, label="Measurements")
ax1.plot(y_pred_dep, color="r", linestyle="--", linewidth=2.5, label="Proposed model")
plt.legend(loc='upper center')
plt.xticks(fontsize=10,fontweight='normal')
plt.yticks(fontsize=10,fontweight='normal')
plt.title('Predictions LSTM model W042-Pz-4-Rais-SEAAl 62-66', fontsize=15)
plt.xlabel('Mois d_apres le 2009-09', fontsize=15)
plt.ylabel('Variation de niveau statique NS (m)', fontsize=15)
plt.xlim(0, 85)
plt.savefig('./plots/lstm_doubl_aGood_Result.png', format='png')
plt.show()
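One way to forecast beyond the last date (a minimal sketch, not code from the post): this model is driven by exogenous inputs, so predictions past the dataset require future values of those inputs. X_future below is a hypothetical array with the same 5 feature columns, coming from forecasts or assumed scenarios.
X_future_std = ss_X_dep.transform(X_future)            # reuse the scaler fitted above
X_all_std = np.concatenate([X, X_future_std], axis=0)  # X is the standardized 10-year block
X_all_std = torch.from_numpy(np.expand_dims(X_all_std, axis=0)).float()

model.eval()
y_all = model(X_all_std).detach().numpy()
y_all = ss_y_dep.inverse_transform(y_all[0])           # trailing rows are the out-of-sample months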

Why are my LSTM predictions so low and shaped like the last window of examples?

I'm writing an LSTM model to predict the next stock prices. The model shows good test results, but when I try to predict values beyond the initial dataset, it produces extremely low values that follow the shape of the last window. What can I do to prevent this?
My model
def create_model():
    model = Sequential()
    model.add(LSTM(units=128, return_sequences=True, input_shape=(x_train.shape[1], 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(units=64, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(units=64))
    model.add(Dense(units=1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model
Function to prepare dataset for fit and testing
def create_dataset(df, window):
    x = []
    y = []
    for i in range(window, df.shape[0]):
        x.append(df[i-window:i, 0])
        y.append(df[i, 0])
    x = np.array(x)
    y = np.array(y)
    return (x, y)
Here I am checking on the whole dataset
dataset_valid = np.array(df)
dataset_valid = scaler.transform(dataset_valid)
dataset_valid = np.reshape(dataset_valid, (dataset_valid.shape[0], dataset_valid.shape[1], 1))
x_dataset_valid, y_dataset_valid = create_dataset(dataset_valid, window)
predict = model.predict(x_dataset_valid)
predict = scaler.inverse_transform(predict)
dataset_valid = scaler.inverse_transform(y_dataset_valid)
plt.figure(figsize=(16, 8))
plt.plot(dataset_valid, color='r', label='Original')
plt.plot(predict, color='b', label='Predicted')
plt.legend()
plt.show()
And here I am trying to predict values after dataset
dataset_valid = np.array(df)
dataset_valid = scaler.transform(dataset_valid)
dataset_valid = create_predict(dataset_valid, window)
predict = model.predict(dataset_valid)
predict = scaler.inverse_transform(predict)
predict = np.append(np.array([0] * window), predict)
dataset_valid_1 = np.array(df[:])
dataset_valid_1 = scaler.transform(dataset_valid_1)
predict_1 = model.predict(dataset_valid_1[-window:])
predict_1 = scaler.inverse_transform(predict_1)
predict = np.append(predict, predict_1)
plt.figure(figsize=(16, 8))
plt.plot(df, color='r', label='Original')
plt.plot(predict, color='b', label='Predicted')
plt.legend()
plt.show()
I really have no idea what I am doing wrong.
I've tried building the input from dataset[-window:], and I've tried predicting only the next value, appending it, and repeating, but nothing works.
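For reference, a minimal sketch of the "predict only the next value, append it, and repeat" loop, kept entirely in the scaled space and inverted only at the end. The variable names (df, scaler, model, window) follow the post, but the loop itself is an assumption, not the original code.
# Recursive (autoregressive) forecast beyond the dataset
history = scaler.transform(np.array(df))[-window:, 0].tolist()  # last `window` scaled values
future_scaled = []
n_steps = 30  # how far beyond the dataset to forecast (arbitrary)

for _ in range(n_steps):
    x = np.array(history[-window:]).reshape(1, window, 1)  # shape the LSTM expects
    next_value = float(model.predict(x, verbose=0)[0, 0])  # one-step-ahead prediction
    history.append(next_value)                             # feed the prediction back in
    future_scaled.append(next_value)

future = scaler.inverse_transform(np.array(future_scaled).reshape(-1, 1))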

Why are Keras Conv1D weights not changed during training?

I initialise my network with only one Convolutional layer (8 filters with length 10).
# Initialize Convolutional Neural Network
cnn = Sequential()
conv = Conv1D(filters=8, kernel_size=10, strides=1, padding="same", input_shape=(train.values.shape[1]-1, 1))
cnn.add(conv)
cnn.add(Activation("relu"))
cnn.add(MaxPooling1D(pool_size=2, strides=2, padding="same"))
cnn.add(Flatten())
cnn.add(Dense(2, activation='softmax'))
cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
cnn.summary()
I take the weights once before and once after the training and plot them with a function I wrote.
w1 = conv.get_weights()[0][:, 0, :]
print(w1[:,0])
plot_weights(w1)
# Fit CNN
y = to_categorical(train.values[:, -1])
X_cnn = np.expand_dims(train.values[:, :-1], axis=2)
start = time.time()
cnn.fit(X_cnn, y, verbose=1, batch_size=20, validation_split=0.2, epochs=20)
end = time.time()
w2 = conv.get_weights()[0][:, 0, :]
print(w2[:,0])
plot_weights(w2)
Function to plot the weights:
def plot_weights(w):
    w_min = w.min()
    w_max = w.max()
    n = w.shape[0]
    fig, axes = plt.subplots(nrows=8, ncols=1)
    for i, ax in enumerate(axes.flat):
        im = ax.imshow(w[:, i].reshape(1, n), vmin=w_min, vmax=w_max, interpolation="nearest",
                       cmap="gray")  # Display weights as an image
        plt.setp(ax.get_yticklabels(), visible=False)  # Hide y tick labels
        ax.tick_params(axis='y', which='both', length=0)  # Set length of y ticks to 0
    fig.colorbar(im, ax=axes.ravel().tolist())
    plt.show(block=False)
    return
The output looks like this:
[weight plot before training]
[weight plot after training]
When I print the first filter before and after training, you can also see that the numbers are exactly the same (not even slightly changed).
>>>[-0.20076838 0.03835052 -0.04454999 -0.20220913 0.24402907 0.03407234
-0.09768075 0.16887552 0.12767741 0.00756356]
>>>[-0.20076838 0.03835052 -0.04454999 -0.20220913 0.24402907 0.03407234
-0.09768075 0.16887552 0.12767741 0.00756356]
What is the reason for this behaviour? Am I doing something wrong? The network is clearly learning something, since I get an accuracy of nearly 100%.
--ga97dil
You may need to access the model that is actually being trained, i.e. cnn, rather than the layer object you used when building it, i.e. conv.
Try cnn.layers[0].get_weights()[0][:, 0, :] instead of conv.get_weights()[0][:, 0, :].
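For example, a quick sanity check along these lines (a sketch that reuses cnn, X_cnn and y from the question):
w_before = cnn.layers[0].get_weights()[0][:, 0, :].copy()  # copy, so later updates cannot alias it
cnn.fit(X_cnn, y, verbose=0, batch_size=20, validation_split=0.2, epochs=1)
w_after = cnn.layers[0].get_weights()[0][:, 0, :]
print(np.allclose(w_before, w_after))  # expect False once the kernel has been updated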

Concatenating a time-series neural net with a feedforward neural net

Consider the following example problem:
# dummy data for a SO question
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
from keras.models import Model
from keras.layers import Input, Conv1D, Dense
from keras.optimizers import Adam, SGD
time = np.array(range(100))
brk = np.array((time>40) & (time < 60)).reshape(100,1)
B = np.array([5, -5]).reshape(1,2)
np.dot(brk, B)
y = np.c_[np.sin(time), np.sin(time)] + np.random.normal(scale = .2, size=(100,2))+ np.dot(brk, B)
plt.clf()
plt.plot(time, y[:,0])
plt.plot(time, y[:,1])
You've got N time series, and they've got one component that follows a common process, and another component that is idiosyncratic to the series itself. Assume for simplicity that you know a priori that the bump is between 40 and 60, and you want to model it simultaneously with the sinusoidal component.
A TCN does a good job on the common component, but it can't get the series-idiosyncratic component:
# time series model
n_filters = 10
filter_width = 3
dilation_rates = [2**i for i in range(7)]
inp = Input(shape=(None, 1))
x = inp
for dilation_rate in dilation_rates:
    x = Conv1D(filters=n_filters,
               kernel_size=filter_width,
               padding='causal',
               activation='relu',
               dilation_rate=dilation_rate)(x)
x = Dense(1)(x)
model = Model(inputs = inp, outputs = x)
model.compile(optimizer = Adam(), loss='mean_squared_error')
model.summary()
X_train = np.transpose(np.c_[time, time]).reshape(2,100,1)
y_train = np.transpose(y).reshape(2,100,1)
history = model.fit(X_train, y_train,
batch_size=2,
epochs=1000,
verbose = 0)
yhat = model.predict(X_train)
plt.clf()
plt.plot(time, y[:,0])
plt.plot(time, y[:,1])
plt.plot(time, yhat[0,:,:])
plt.plot(time, yhat[1,:,:])
On the other hand, a basic linear regression with N outputs (here implemented in Keras) is perfect for the idiosyncratic component:
inp1 = Input((1,))
x1 = inp1
x1 = Dense(2)(x1)
model1 = Model(inputs = inp1, outputs = x1)
model1.compile(optimizer = Adam(), loss='mean_squared_error')
model1.summary()
brk_train = brk
y_train = y
history = model1.fit(brk_train, y_train,
batch_size=100,
epochs=6000, verbose = 0)
yhat1 = model1.predict(brk_train)
plt.clf()
plt.plot(time, y[:,0])
plt.plot(time, y[:,1])
plt.plot(time, yhat1[:,0])
plt.plot(time, yhat1[:,1])
I want to use keras to jointly estimate the time series component and the idiosyncratic component. The major problem is that feed-forward networks (which linear regression is a special case of) take shape batch_size x dims while time series networks take dimension batch_size x time_steps x dims.
Because I want to jointly estimate the idiosyncratic part of the model (the linear regression part) together with the time series part, I'm only ever going to batch-sample whole time-series. Which is why I specified batch_size = time_steps for model 1.
But in the static model, what I'm really doing is modeling my data as time_steps x dims.
I have tried to re-cast the feed-forward model as a time-series model, without success. Here's the non-working approach:
inp3 = Input(shape = (None, 1))
x3 = inp3
x3 = Dense(2)(x3)
model3 = Model(inputs = inp3, outputs = x3)
model3.compile(optimizer = Adam(), loss='mean_squared_error')
model3.summary()
brk_train = brk.reshape(1, 100, 1)
y_train = np.transpose(y).reshape(2,100,1)
history = model3.fit(brk_train, y_train,
batch_size=1,
epochs=1000, verbose = 1)
ValueError: Error when checking target: expected dense_40 to have shape (None, 2) but got array with shape (100, 1)
I am trying to fit the same model as model1, but with a different shape, so that it is compatible with the TCN model -- and importantly so that it will have the same batching structure.
The output should ultimately have the shape (2, 100, 1) in this example. Basically I want the model to do the following algorithm:
1. Ingest X of shape (N, time_steps, dims).
2. Drop the first dimension, because the design matrix is identical for every series, yielding X1 of shape (time_steps, dims).
3. Forward step: np.dot(X1, W), where W has dimension (dims, N), yielding X2 of dimension (time_steps, N).
4. Reshape X2 to (N, time_steps, 1). Then I can add it to the output of the other part of the model.
5. Backward step: since this is just a linear model, the gradient of the output with respect to W is just X1.
How can I implement this? Do I need a custom layer?
I'm building off of ideas in this paper, in case you're curious about the motivation behind all of this.
EDIT: After posting, I noticed that I used only the time variable, rather than the time series itself. A TCN fit with the lagged series fits the idiosyncratic part of the series just fine (in-sample anyway). But my basic question still stands -- I want to merge the two types of networks.
So, I solved my own problem. The answer is to create dummy interactions (and thus a really sparse design matrix) and then reshape the data.
###########################
# interaction model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
from keras.models import Model
from keras.layers import Input, Conv1D, Dense
from keras.optimizers import Adam, SGD
from patsy import dmatrix
def shift5(arr, num, fill_value=np.nan):
    result = np.empty_like(arr)
    if num > 0:
        result[:num] = fill_value
        result[num:] = arr[:-num]
    elif num < 0:
        result[num:] = fill_value
        result[:num] = arr[-num:]
    else:
        result = arr
    return result
time = np.array(range(100))
brk = np.array((time>40) & (time < 60)).reshape(100,1)
B = np.array([5, -5]).reshape(1,2)
np.dot(brk, B)
y = np.c_[np.sin(time), np.sin(time)] + np.random.normal(scale = .2, size=(100,2))+ np.dot(brk, B)
plt.clf()
plt.plot(time, y[:,0])
plt.plot(time, y[:,1])
# define interaction model
inp = Input(shape=(None, 2))
x = inp
x = Dense(1)(x)
model = Model(inputs = inp, outputs = x)
model.compile(optimizer = Adam(), loss='mean_squared_error')
model.summary()
from patsy import dmatrix
df = pd.DataFrame(data = {"fips": np.concatenate((np.zeros(100), np.ones(100))),
"brk": np.concatenate((brk.reshape(100), brk.squeeze()))})
df.brk = df.brk.astype(int)
tm = np.asarray(dmatrix("brk:C(fips)-1", data = df))
brkint = np.concatenate(( \
tm[:100,:].reshape(1,100,2),
tm[100:200,:].reshape(1,100,2)
), axis = 0)
y_train = np.transpose(y).reshape(2,100,1)
history = model.fit(brkint, y_train,
batch_size=2,
epochs=1000,
verbose = 1)
yhat = model.predict(brkint)
plt.clf()
plt.plot(time, y[:,0])
plt.plot(time, y[:,1])
plt.plot(time, yhat[0,:,:])
plt.plot(time, yhat[1,:,:])
The output shape is the same as for the TCN, and can simply be added element-wise.
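For what it's worth, a minimal sketch of what "added element-wise" can look like as a single joint model built with the functional API. This sketch is an assumption rather than code from the answer; it reuses n_filters, filter_width, dilation_rates, X_train, brkint and y_train defined above.
from keras.layers import Input, Conv1D, Dense, Add
from keras.models import Model
from keras.optimizers import Adam

series_inp = Input(shape=(None, 1))       # lagged series for the TCN branch
x = series_inp
for dilation_rate in dilation_rates:
    x = Conv1D(filters=n_filters, kernel_size=filter_width,
               padding='causal', activation='relu',
               dilation_rate=dilation_rate)(x)
tcn_out = Dense(1)(x)

dummy_inp = Input(shape=(None, 2))        # dummy-interaction design matrix (brkint)
lin_out = Dense(1)(dummy_inp)             # the "linear regression" branch

combined = Add()([tcn_out, lin_out])      # element-wise sum, shape (batch, time, 1)
joint = Model(inputs=[series_inp, dummy_inp], outputs=combined)
joint.compile(optimizer=Adam(), loss='mean_squared_error')
# joint.fit([X_train, brkint], y_train, batch_size=2, epochs=1000, verbose=0)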

Why Bother With Recurrent Neural Networks For Structured Data?

I have been developing feedforward neural networks (FNNs) and recurrent neural networks (RNNs) in Keras with structured data of the shape [instances, time, features], and the performance of FNNs and RNNs has been the same (except that RNNs require more computation time).
I have also simulated tabular data (code below) where I expected an RNN to outperform an FNN because the next value in the series is dependent on the previous value in the series; however, both architectures predict correctly.
With NLP data, I have seen RNNs outperform FNNs, but not with tabular data. Generally, when would one expect an RNN to outperform an FNN with tabular data? Specifically, could someone post simulation code with tabular data demonstrating an RNN outperforming an FNN?
Thank you! If my simulation code is not ideal for my question, please adapt it or share a more ideal one!
from keras import models
from keras import layers
from keras.layers import Dense, LSTM
import numpy as np
import matplotlib.pyplot as plt
Two features were simulated over 10 time steps, where the value of the second feature is dependent on the value of both features in the prior time step.
## Simulate data.
np.random.seed(20180825)
X = np.random.randint(50, 70, size = (11000, 1)) / 100
X = np.concatenate((X, X), axis = 1)
for i in range(10):
    X_next = np.random.randint(50, 70, size = (11000, 1)) / 100
    X = np.concatenate((X, X_next, (0.50 * X[:, -1].reshape(len(X), 1))
                        + (0.50 * X[:, -2].reshape(len(X), 1))), axis = 1)
print(X.shape)
## Training and validation data.
split = 10000
Y_train = X[:split, -1:].reshape(split, 1)
Y_valid = X[split:, -1:].reshape(len(X) - split, 1)
X_train = X[:split, :-2]
X_valid = X[split:, :-2]
print(X_train.shape)
print(Y_train.shape)
print(X_valid.shape)
print(Y_valid.shape)
FNN:
## FNN model.
# Define model.
network_fnn = models.Sequential()
network_fnn.add(layers.Dense(64, activation = 'relu', input_shape = (X_train.shape[1],)))
network_fnn.add(Dense(1, activation = None))
# Compile model.
network_fnn.compile(optimizer = 'adam', loss = 'mean_squared_error')
# Fit model.
history_fnn = network_fnn.fit(X_train, Y_train, epochs = 10, batch_size = 32, verbose = False,
validation_data = (X_valid, Y_valid))
plt.scatter(Y_train, network_fnn.predict(X_train), alpha = 0.1)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()
plt.scatter(Y_valid, network_fnn.predict(X_valid), alpha = 0.1)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()
LSTM:
## LSTM model.
X_lstm_train = X_train.reshape(X_train.shape[0], X_train.shape[1] // 2, 2)
X_lstm_valid = X_valid.reshape(X_valid.shape[0], X_valid.shape[1] // 2, 2)
# Define model.
network_lstm = models.Sequential()
network_lstm.add(layers.LSTM(64, activation = 'relu', input_shape = (X_lstm_train.shape[1], 2)))
network_lstm.add(layers.Dense(1, activation = None))
# Compile model.
network_lstm.compile(optimizer = 'adam', loss = 'mean_squared_error')
# Fit model.
history_lstm = network_lstm.fit(X_lstm_train, Y_train, epochs = 10, batch_size = 32, verbose = False,
validation_data = (X_lstm_valid, Y_valid))
plt.scatter(Y_train, network_lstm.predict(X_lstm_train), alpha = 0.1)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()
plt.scatter(Y_valid, network_lstm.predict(X_lstm_valid), alpha = 0.1)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()
In practice, even in NLP you see that RNNs and CNNs are often competitive. Here's a 2017 review paper that shows this in more detail. In theory it might be the case that RNNs can handle the full complexity and sequential nature of language better, but in practice the bigger obstacle is usually properly training the network, and RNNs are finicky.
Another problem that might have a chance of working is the balanced-parenthesis problem (either with just parentheses in the strings, or with parentheses mixed in with other distractor characters). This requires processing the inputs sequentially and tracking some state, and might be easier to learn with an LSTM than with an FFN.
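A minimal sketch (my own assumption, not from the answer) of generating such a balanced-parenthesis dataset: label 1 if a random string of ( and ) is balanced, 0 otherwise.
import numpy as np

def make_example(length, rng):
    chars = rng.choice(list("()"), size=length)
    depth, balanced = 0, True
    for c in chars:
        depth += 1 if c == "(" else -1
        if depth < 0:
            balanced = False
    return chars, int(balanced and depth == 0)

rng = np.random.default_rng(0)
examples = [make_example(20, rng) for _ in range(10000)]
X = np.array([[1.0 if c == "(" else 0.0 for c in s] for s, _ in examples])  # encode ( as 1, ) as 0
y = np.array([label for _, label in examples])
# Note: purely random strings are rarely balanced, so in practice you would also
# generate balanced strings directly to keep the classes balanced.
# X.reshape(-1, 20, 1) can then be fed to an LSTM; X as-is to an FFN.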
Update:
Some data that looks sequential might not actually have to be treated sequentially. For example, even if you provide a sequence of numbers to add, an FFN will do just as well as an RNN because addition is commutative. This could also be true of many health problems where the dominating information is not of a sequential nature. Suppose a patient's smoking habits are measured every year. From a behavioral standpoint the trajectory is important, but if you're predicting whether the patient will develop lung cancer, the prediction will be dominated by just the number of years the patient smoked (maybe restricted to the last 10 years for the FFN).
So you want to make the toy problem more complex and require taking the ordering of the data into account. Maybe some kind of simulated time series where you want to predict whether there was a spike in the data, but where you don't care about absolute values, only about the relative nature of the spike.
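As a concrete illustration of that idea, here is a minimal sketch (an assumption, not from the answer) of a spike-detection dataset where the label depends on a relative jump rather than on absolute values.
import numpy as np

rng = np.random.default_rng(0)
n, steps = 10000, 20
base = rng.uniform(1, 100, size=(n, 1))                    # random absolute scale per series
series = base * (1 + 0.05 * rng.standard_normal((n, steps)))
spike = rng.random(n) < 0.5                                # half of the series get a spike
idx = rng.integers(5, steps, size=n)                       # spike position
series[np.arange(n)[spike], idx[spike]] *= 3               # relative 3x jump at that step
y = spike.astype(int)                                      # target: was there a spike?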
Update 2:
I modified your code to show a case where RNNs perform better. The trick was to use more complex conditional logic that is more naturally modeled by LSTMs than by FFNs. The code is below. For 8 columns, the FFN trains in 1 minute and reaches a validation loss of 6.3. The LSTM takes 3x longer to train, but its final validation loss is 6x lower, at 1.06.
As we increase the number of columns the LSTM has a larger and larger advantage, especially if we add more complicated conditions. For 16 columns the FFN's validation loss is 19 (and you can more clearly see the training curve, as the model isn't able to instantly fit the data). In comparison, the LSTM takes 11 times longer to train but has a validation loss of 0.31, 30 times smaller than the FFN's! You can play around with even larger matrices to see how far this trend extends.
from keras import models
from keras import layers
from keras.layers import Dense, LSTM
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import time
matplotlib.use('Agg')
np.random.seed(20180908)
rows = 20500
cols = 10
# Randomly generate Z
Z = 100*np.random.uniform(0.05, 1.0, size = (rows, cols))
larger = np.max(Z[:, :cols // 2], axis=1).reshape((rows, 1))   # integer division so the slice index is an int
larger2 = np.max(Z[:, cols // 2:], axis=1).reshape((rows, 1))
smaller = np.min((larger, larger2), axis=0)
# Z is now the max of the first half of the array.
Z = np.append(Z, larger, axis=1)
# Z is now the min of the max of each half of the array.
# Z = np.append(Z, smaller, axis=1)
# Combine and shuffle.
#Z = np.concatenate((Z_sum, Z_avg), axis = 0)
np.random.shuffle(Z)
## Training and validation data.
split = 10000
X_train = Z[:split, :-1]
X_valid = Z[split:, :-1]
Y_train = Z[:split, -1:].reshape(split, 1)
Y_valid = Z[split:, -1:].reshape(rows - split, 1)
print(X_train.shape)
print(Y_train.shape)
print(X_valid.shape)
print(Y_valid.shape)
print("Now setting up the FNN")
## FNN model.
tick = time.time()
# Define model.
network_fnn = models.Sequential()
network_fnn.add(layers.Dense(32, activation = 'relu', input_shape = (X_train.shape[1],)))
network_fnn.add(Dense(1, activation = None))
# Compile model.
network_fnn.compile(optimizer = 'adam', loss = 'mean_squared_error')
# Fit model.
history_fnn = network_fnn.fit(X_train, Y_train, epochs = 500, batch_size = 128, verbose = False,
validation_data = (X_valid, Y_valid))
tock = time.time()
print()
print(str('%.2f' % ((tock - tick) / 60)) + ' minutes.')
print("Now evaluating the FNN")
loss_fnn = history_fnn.history['loss']
val_loss_fnn = history_fnn.history['val_loss']
epochs_fnn = range(1, len(loss_fnn) + 1)
print("train loss: ", loss_fnn[-1])
print("validation loss: ", val_loss_fnn[-1])
plt.plot(epochs_fnn, loss_fnn, 'black', label = 'Training Loss')
plt.plot(epochs_fnn, val_loss_fnn, 'red', label = 'Validation Loss')
plt.title('FNN: Training and Validation Loss')
plt.legend()
plt.show()
plt.scatter(Y_train, network_fnn.predict(X_train), alpha = 0.1)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('training points')
plt.show()
plt.scatter(Y_valid, network_fnn.predict(X_valid), alpha = 0.1)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('valid points')
plt.show()
print("LSTM")
## LSTM model.
X_lstm_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_lstm_valid = X_valid.reshape(X_valid.shape[0], X_valid.shape[1], 1)
tick = time.time()
# Define model.
network_lstm = models.Sequential()
network_lstm.add(layers.LSTM(32, activation = 'relu', input_shape = (X_lstm_train.shape[1], 1)))
network_lstm.add(layers.Dense(1, activation = None))
# Compile model.
network_lstm.compile(optimizer = 'adam', loss = 'mean_squared_error')
# Fit model.
history_lstm = network_lstm.fit(X_lstm_train, Y_train, epochs = 500, batch_size = 128, verbose = False,
validation_data = (X_lstm_valid, Y_valid))
tock = time.time()
print()
print(str('%.2f' % ((tock - tick) / 60)) + ' minutes.')
print("now eval")
loss_lstm = history_lstm.history['loss']
val_loss_lstm = history_lstm.history['val_loss']
epochs_lstm = range(1, len(loss_lstm) + 1)
print("train loss: ", loss_lstm[-1])
print("validation loss: ", val_loss_lstm[-1])
plt.plot(epochs_lstm, loss_lstm, 'black', label = 'Training Loss')
plt.plot(epochs_lstm, val_loss_lstm, 'red', label = 'Validation Loss')
plt.title('LSTM: Training and Validation Loss')
plt.legend()
plt.show()
plt.scatter(Y_train, network_lstm.predict(X_lstm_train), alpha = 0.1)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('training')
plt.show()
plt.scatter(Y_valid, network_lstm.predict(X_lstm_valid), alpha = 0.1)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title("validation")
plt.show()
