Data frame inverse reshaping memory - python

So I have two dataframes that I'm using with a GRU model.
I had to reshape them in order to fit the model, but at the end of the code I wanted to plot y_predicted against y_test. It works, but what used to be the x-axis is now the y-axis and vice versa. I thought it was because the data had been reshaped, so I applied .reshape(1,-1), but every time I do I run out of memory (25GB) and I really don't know how to fix this problem.
EDIT:
So I have done everything using sample data and discovered that reshaping y_predicted doesn't take any memory at all, but this line uses all 25GB of memory for some reason: y_test = y_scale.inverse_transform(y_test)
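Side note on the plotting step: for matplotlib it is usually enough to flatten both arrays to 1-D with .ravel() instead of reshaping them, since ravel normally returns a flat view without copying. A minimal sketch, assuming y_predicted and y_test are the (N, 1) arrays produced by inverse_transform in the code below:
plt.plot(y_predicted.ravel(), label='Predicted')
plt.plot(y_test.ravel(), label='Measurements')
plt.legend()
plt.show()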
import pandas as pd
import tensorflow as tf
from keras.layers.core import Dense
from keras.layers.recurrent import GRU
from keras.models import Sequential
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback
# Loading First Dataframe
df = pd.read_csv('IF 10 PERCENT.csv',index_col=None)
df2 = pd.read_csv('2019 10minutes IF 10 PERCENT.csv',index_col=None)
tbc=TensorBoardColab() # Tensorboard
X_train= df[['WindSpeed_mps','AmbTemp_DegC','RotorSpeed_rpm','RotorSpeedAve','NacelleOrientation_Deg','MeasuredYawError','Pitch_Deg','WindSpeed1','WindSpeed2','WindSpeed3','GeneratorTemperature_DegC','GearBoxTemperature_DegC']]
X_train=X_train.values
y_train= df['Power_kW']
y_train=y_train.values
X_test= df2[['WindSpeed_mps','AmbTemp_DegC','RotorSpeed_rpm','RotorSpeedAve','NacelleOrientation_Deg','MeasuredYawError','Pitch_Deg','WindSpeed1','WindSpeed2','WindSpeed3','GeneratorTemperature_DegC','GearBoxTemperature_DegC']]
X_test=X_test.values
y_test= df2['Power_kW']
y_test=y_test.values
# conversion to numpy array
# scaling values for model
x_scale = MinMaxScaler()
y_scale = MinMaxScaler()
X_train= x_scale.fit_transform(X_train)
y_train= y_scale.fit_transform(y_train.reshape(-1,1))
X_test= x_scale.fit_transform(X_test)
y_test= y_scale.fit_transform(y_test.reshape(-1,1))
X_train = X_train.reshape((-1,1,12))
X_test = X_test.reshape((-1,1,12))
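# the arrays are now shaped (samples, timesteps=1, features=12), matching the GRU input_shape=(1,12) below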
# splitting train and test
# creating model using Keras
model = Sequential()
model.add(GRU(units=512, return_sequences=True, input_shape=(1,12)))
model.add(GRU(units=256, return_sequences=True))
model.add(GRU(units=256))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss=['mse'], optimizer='adam',metrics=['mae'])
model.summary()
history=model.fit(X_train, y_train, batch_size=256, epochs=1, validation_data=(X_test,y_test),validation_split=0.1, verbose=1, callbacks=[TensorBoardColabCallback(tbc)])
score = model.evaluate(X_test, y_test)
print('Score: {}'.format(score))
y_predicted = model.predict(X_test)
y_predicted = y_scale.inverse_transform(y_predicted)
y_test = y_scale.inverse_transform(y_test)
plt.plot( y_predicted, label='Predicted')
plt.plot( y_test, label='Measurements')
plt.legend()
plt.show()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
Thank you.

Related

Need to predict into the future, past training and test data, for financial time series. Keras/Tensorflow/LSTM model

Calling the necessary libraries
import math
import plotly.graph_objects as go
import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from datetime import datetime
Getting the time series data that I need
#Interval required: 15 minutes (over the last 5 days)
dataset_train = yf.download(tickers='EURUSD=x', period='5d', interval='15m')
#Print data
dataset_train
Plotting time series data
plt.figure(figsize=(15, 8))
plt.title('EURUSD History')
plt.plot(dataset_train['Close'])
plt.xlabel('Date')
plt.ylabel('Prices ($)')
Setting the training data parameters
close_prices = dataset_train['Close']
values = close_prices.values
training_data_len = math.ceil(len(values)* 0.8)
scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(values.reshape(-1,1))
train_data = scaled_data[0: training_data_len, :]
x_train = []
y_train = []
for i in range(60, len(train_data)):
    x_train.append(train_data[i-60:i, 0])
    y_train.append(train_data[i, 0])
x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
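# x_train is now (samples, 60, 1): 60-step windows of a single feature, the 3-D shape the LSTM expects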
Test data parameters
test_data = scaled_data[training_data_len-60: , : ]
x_test = []
y_test = values[training_data_len:]
for i in range(60, len(test_data)):
    x_test.append(test_data[i-60:i, 0])
x_test = np.array(x_test)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
The LSTM Model
model = keras.Sequential()
model.add(layers.LSTM(100, return_sequences=True, input_shape=(x_train.shape[1], 1)))
model.add(layers.LSTM(100, return_sequences=False))
model.add(layers.Dense(25))
model.add(layers.Dense(1))
model.summary()
Fitting the model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(x_train, y_train, batch_size= 1, epochs=3)
Carrying out the predictions
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
rmse = np.sqrt(np.mean((predictions.ravel() - y_test)**2))  # flatten predictions so the shapes line up before squaring
rmse
Plotting the model outcomes
data = dataset_train.filter(['Close'])
train = data[:training_data_len]
validation = data[training_data_len:]
validation['Predictions'] = predictions
plt.figure(figsize=(16,8))
plt.title('Model')
plt.xlabel('Date')
plt.ylabel('Close Price USD ($)')
plt.plot(train)
plt.plot(validation[['Close', 'Predictions']])
plt.legend(['Train', 'Val', 'Predictions'], loc='lower right')
plt.show()
print(train)
print(validation[['Close', 'Predictions']])
That's the code for the model, training, and test data predictions. I just need to know how to make real-time predictions into the actual future, three 15-minute bars in advance.
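One common approach, sketched below as an assumption rather than a definitive recipe, is recursive forecasting: predict one bar ahead, append that prediction to the 60-step input window, and repeat. The sketch reuses the model and scaler fitted above; n_future is a hypothetical name.
# recursive multi-step forecast: predict one 15-minute bar, slide the window, repeat
n_future = 3
window = scaled_data[-60:, 0].tolist()           # most recent 60 scaled closes
future_scaled = []
for _ in range(n_future):
    x = np.array(window[-60:]).reshape(1, 60, 1)
    next_value = model.predict(x)[0, 0]          # next bar, still in scaled units
    future_scaled.append(next_value)
    window.append(next_value)                    # feed the prediction back in
future_prices = scaler.inverse_transform(np.array(future_scaled).reshape(-1, 1))
print(future_prices)                             # the next three 15-minute closes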

Elegant way to plot average loss of many trains in tensorflow

I am running many training runs so I can smooth out the loss curves. I would like an elegant way to average all the losses from history.history['loss'], but haven't found an easy way to do it. Here's a minimal example:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from matplotlib import pyplot as plt
(x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
x_train = x_train.reshape(60000, 784).astype('float32')/255
y_train = to_categorical(y_train, num_classes=10)
def get_model():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(10, activation='sigmoid',
                                    input_shape=(784,)))
    model.add(tf.keras.layers.Dense(10, activation='softmax'))
    model.compile(loss="categorical_crossentropy", optimizer="sgd",
                  metrics=['accuracy'])
    return model
all_trains = []
for i in range(3):
    model = get_model()
    history = model.fit(x_train, y_train, epochs=2)
    all_trains.append(history)
If I wanted to plot just one example, I would do this:
plt.plot(history.epoch, history.history['loss'])
plt.show()
But instead, I want to average the loss from each train in all_trains and plot them. I can think of many clunky ways to do it but would like to find a clean way.
You could simply do:
import numpy as np
import matplotlib.pyplot as plt
losses = [h.history['loss'] for h in all_trains]
mean_loss = np.mean(losses, axis=0)
std = np.std(losses, axis=0)
plt.errorbar(range(len(mean_loss)), mean_loss, yerr=std, capsize=5, marker='o')
plt.title('Average loss per epoch (± std)')
plt.xlabel('Epoch')
plt.ylabel('Categorical crossentropy')
plt.show()
I also added the standard deviation in this case.
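One caveat: np.mean(losses, axis=0) assumes every run trained for the same number of epochs. If that is not guaranteed, a hedged variant that truncates to the shortest run would be:
min_len = min(len(l) for l in losses)                 # length of the shortest run
losses_arr = np.array([l[:min_len] for l in losses])  # now a rectangular array
mean_loss = losses_arr.mean(axis=0)
std = losses_arr.std(axis=0)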

Confusion Matrix - ValueError: Found input variables with inconsistent numbers of samples

For the sake of reproducibility, the training and validation datasets I am using are shared here.
The validation_dataset.csv is the ground truth of training_dataset.csv.
What I am doing below is feeding the datasets into a simple CNN layer that extracts the useful features of the images and feeds them as 1D into the LSTM network for classification.
from keras.models import Sequential
from keras.layers import Dense, Flatten, Activation
from keras.layers.convolutional import Conv1D
from keras.layers import LSTM
from keras.layers.convolutional import MaxPooling1D
from keras.layers import TimeDistributed
from keras.layers import Dropout
from keras import optimizers
from keras.callbacks import EarlyStopping
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from confusion_matrix import plot_confusion_matrix
import scikitplot as skplt
from numpy import genfromtxt
train_set = genfromtxt('data/train/training_dataset.csv', delimiter=',')
validation_set = genfromtxt('data/validation/validation_dataset.csv', delimiter=',')
train_set = train_set[..., None]
validation_set = validation_set[..., None]
X_train, X_test, y_train, y_test = train_test_split(train_set, validation_set, test_size=0.30, random_state=0)
batch_size=16
epochs=5
# Create the model
model = Sequential()
model.add(Conv1D(filters=5, kernel_size=3, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(50, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(10))
model.add(Dense(1,kernel_initializer='random_normal'))
model.add(Activation('relu'))
adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0)
sgd = optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(optimizer=adam, loss='mean_squared_error', metrics=['mae', 'mape', 'mean_squared_error', 'acc'])
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs)
print(model.summary())
# Evaluate the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))
skplt.metrics.plot_confusion_matrix(y_test, scores, x_tick_rotation=50, title=' ', normalize=True)
Finally, I want to plot the confusion matrix of the model using
skplt.metrics.plot_confusion_matrix(y_test, scores, x_tick_rotation=50, title=' ', normalize=True)
However, it is raising an error ValueError: Found input variables with inconsistent numbers of samples: [5394, 5].
How can we fix this error?
The second argument to skplt.metrics.plot_confusion_matrix must be the predicted labels (see https://scikit-plot.readthedocs.io/en/stable/metrics.html). But you pass scores, which does not contain the predicted labels.
The fix would be to do:
y_pred = model.predict(X_test)
skplt.metrics.plot_confusion_matrix(y_test,
                                    y_pred,
                                    x_tick_rotation=50,
                                    title=' ',
                                    normalize=True)
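One caveat worth adding: the model above ends in a single Dense unit, so model.predict returns continuous values rather than class labels. Depending on how the ground truth is encoded, those predictions may need to be thresholded to discrete labels before a confusion matrix makes sense; a hedged sketch, assuming binary 0/1 targets and a hypothetical 0.5 cutoff:
y_pred = model.predict(X_test)
y_pred_labels = (y_pred > 0.5).astype(int).ravel()   # hypothetical threshold for 0/1 labels
skplt.metrics.plot_confusion_matrix(y_test, y_pred_labels, x_tick_rotation=50, normalize=True)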
I was working on an SVM a few days ago, and when I tried to plot the confusion matrix the following lines of code worked for me.
from sklearn import metrics
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
predicted = model.predict(X_test)  # predicted output
cm = metrics.confusion_matrix(y_test, predicted)
df_cm = pd.DataFrame(cm, range(2), range(2))
sns.set(font_scale=1.4)
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16})
plt.title('CONFUSION MATRIX ', fontdict={'fontsize': 14, 'fontweight': 'bold'})
plt.show()

Tensorflow model accuracy low

So my main goal is to use data from 2018 and try to predict data for 2019. I'm using a GRU model and I have the following code. I have a few issues: I'm not sure if the code is actually correct or if I am missing something, and also, for model.fit, should I use validation_split=0.1 or validation_data=(X_test, y_test), since I'm using a different dataframe for testing?
Regarding the accuracy, it is very small, it doesn't make any sense, and I have no idea why.
import pandas as pd
import tensorflow as tf
from keras.layers.core import Dense
from keras.layers.recurrent import GRU
from keras.models import Sequential
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback
df = pd.read_csv('IF 10 PERCENT.csv',index_col=None)
#Loading Second Dataframe
df2 = pd.read_csv('2019 10minutes IF 10 PERCENT.csv',index_col=None)
tbc=TensorBoardColab() # Tensorboard
X_train= df[['WindSpeed_mps','AmbTemp_DegC','RotorSpeed_rpm','RotorSpeedAve','NacelleOrientation_Deg','MeasuredYawError','Pitch_Deg','WindSpeed1','WindSpeed2','WindSpeed3','GeneratorTemperature_DegC','GearBoxTemperature_DegC']]
X_train=X_train.values
y_train= df['Power_kW']
y_train=y_train.values
X_test= df2[['WindSpeed_mps','AmbTemp_DegC','RotorSpeed_rpm','RotorSpeedAve','NacelleOrientation_Deg','MeasuredYawError','Pitch_Deg','WindSpeed1','WindSpeed2','WindSpeed3','GeneratorTemperature_DegC','GearBoxTemperature_DegC']]
X_test=X_test.values
y_test= df2['Power_kW']
y_test=y_test.values
# conversion to numpy array
# scaling values for model
x_scale = MinMaxScaler()
y_scale = MinMaxScaler()
X_train= x_scale.fit_transform(X_train)
y_train= y_scale.fit_transform(y_train.reshape(-1,1))
X_test=x_scale.fit_transform(X_test)
y_test=y_scale.fit_transform(y_test.reshape(-1,1))
X_train = X_train.reshape((-1,1,12))
X_test = X_test.reshape((-1,1,12))
# splitting train and test
# creating model using Keras
model = Sequential()
model.add(GRU(units=512, return_sequences=True, input_shape=(1,12)))
model.add(GRU(units=256, return_sequences=True))
model.add(GRU(units=256))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss=['mse'], optimizer='adam',metrics=['accuracy'])
model.summary()
#model.fit(X_train, y_train, batch_size=250, epochs=10, validation_split=0.1, verbose=1, callbacks=[TensorBoardColabCallback(tbc)])
model.fit(X_train, y_train, batch_size=250, epochs=10, validation_data=(X_test,y_test), verbose=1, callbacks=[TensorBoardColabCallback(tbc)])
score, acc = model.evaluate(X_test, y_test)
print('Score: {}'.format(score))
print('Accuracy: {}'.format(acc))
y_predicted = model.predict(X_test)
y_predicted = y_scale.inverse_transform(y_predicted)
y_test = y_scale.inverse_transform(y_test)
plt.plot(y_predicted, label='Predicted')
plt.plot(y_test, label='Measurements')
plt.legend()
plt.show()
Thank you
It sounds to me like you are trying to solve a regression problem here. If so, it does not make sense to measure accuracy as a metric, since accuracy measures exact label matching. MSE should be pretty good for regression.
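For example, a minimal sketch of how the compile/evaluate step could look with regression metrics instead of accuracy (only the metrics change, everything else stays as in the question):
model.compile(loss='mse', optimizer='adam', metrics=['mae'])
model.fit(X_train, y_train, batch_size=250, epochs=10, validation_data=(X_test, y_test), verbose=1)
loss, mae = model.evaluate(X_test, y_test)
print('Test MSE: {:.4f}, Test MAE: {:.4f}'.format(loss, mae))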

How can I implement the input of multiple regression in LSTM using keras?

here is my code
def create_dataset(signal_data, look_back=1):
    dataX, dataY = [], []
    for i in range(len(signal_data) - look_back):
        dataX.append(signal_data[i:(i + look_back), 0])
        dataY.append(signal_data[i + look_back, 0])
    return np.array(dataX), np.array(dataY)
df = pd.read_csv('time_series.csv')
signal_data = df.Close.values.astype('float32')
signal_data = signal_data.reshape(len(df), 1)
scaler = MinMaxScaler(feature_range=(0, 1))
signal_data = scaler.fit_transform(signal_data)
train_size = int(len(signal_data) * 0.80)
test_size = len(signal_data) - train_size
# val_size = len(signal_data) - train_size - test_size
train = signal_data[0:train_size]
# val = signal_data[train_size:train_size+val_size]
test = signal_data[train_size+val_size:len(signal_data)]
x_train, y_train = create_dataset(train, look_back)
# x_val, y_val = create_dataset(val, look_back)
x_test, y_test = create_dataset(test, look_back)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
# x_val = np.reshape(x_val, (x_val.shape[0], x_val.shape[1], 1))
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
Now I want to add df.Open, df.High, df.Low, and df.Volume as inputs.
How can I implement this?
Should I just add them to the signal data? I'm wondering how to add the data so that I can train on multiple features in the signal data.
I don't know where or how to implement it. I need your help.
Your valuable opinions and thoughts will be very much appreciated.
I made several modifications to your code. This should work. In summary:
I fixed the lines of code where you were hard-coding the selection of variable 0. Now the target variable sits in the last position and the other features come before it.
I fixed the reshapes: some of them were not needed, and the others were adjusted to keep all the dimensions.
I fixed the model input shape; now you have 5 variables instead of 1.
My general recommendations:
I would not use MinMaxScaler; it is risky because a single outlier can distort your whole distribution. Instead, use StandardScaler. More info here: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
I would scale the data afterwards, once train_x, test_x and their respective y counterparts are built. As written, you are computing the scaling statistics on both the train and test sets, i.e. on future information. That is different from what you will face in a real situation, where you have to scale new data with past statistics, so it is better to build a test set that is as close to reality as possible (a short sketch of this is given right after these recommendations).
How do you know that your model is big enough to model your data? I would get rid of the dropouts and run the model to see if it can overfit the data. If the model can overfit the training data, it means the model is big enough and you can start regularising it to improve generalisation. More info in this book: https://www.deeplearning.ai/machine-learning-yearning/
For the model metrics you chose accuracy, which is a classification metric. I would use one that matches the type of problem (regression), for example mean absolute error.
I hope I managed to help you :D
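For the scaling recommendation above, a minimal self-contained sketch of fitting the scaler on the training split only (the *_raw array names are hypothetical placeholders for the unscaled slices of signal_data):
import numpy as np
from sklearn.preprocessing import StandardScaler
train_raw = np.random.rand(800, 5)   # placeholder for the unscaled training slice
val_raw = np.random.rand(100, 5)     # placeholder for the unscaled validation slice
test_raw = np.random.rand(100, 5)    # placeholder for the unscaled test slice
scaler = StandardScaler()
train = scaler.fit_transform(train_raw)   # statistics come from the training split only
val = scaler.transform(val_raw)           # validation/test reuse those statistics
test = scaler.transform(test_raw)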
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Conv2D, Reshape, TimeDistributed, Flatten, Conv1D,ConvLSTM2D, MaxPooling1D
from keras.layers.core import Dense, Activation, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import tensorflow as tf
import matplotlib.pyplot as plt
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
def create_dataset(signal_data, look_back=1):
    dataX, dataY = [], []
    for i in range(len(signal_data) - look_back):
        dataX.append(signal_data[i:(i + look_back), :])
        dataY.append(signal_data[i + look_back, -1])
    return np.array(dataX), np.array(dataY)
look_back = 20
df = pd.read_csv('kospi.csv')
signal_data = df[["Open", "Low", "High", "Volume", "Close"]].values.astype('float32')
scaler = MinMaxScaler(feature_range=(0, 1))
signal_data = scaler.fit_transform(signal_data)
train_size = int(len(signal_data) * 0.80)
test_size = len(signal_data) - train_size - int(len(signal_data) * 0.05)
val_size = len(signal_data) - train_size - test_size
train = signal_data[0:train_size]
val = signal_data[train_size:train_size+val_size]
test = signal_data[train_size+val_size:len(signal_data)]
x_train, y_train = create_dataset(train, look_back)
x_val, y_val = create_dataset(val, look_back)
x_test, y_test = create_dataset(test, look_back)
model = Sequential()
model.add(LSTM(128, input_shape=(None, 5),return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(128, input_shape=(None, 5)))
model.add(Dropout(0.3))
model.add(Dense(128))
model.add(Dropout(0.3))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
model.summary()
hist = model.fit(x_train, y_train, epochs=20, batch_size=32, verbose=2, validation_data=(x_val, y_val))
trainScore = model.evaluate(x_train, y_train, verbose=0)
model.reset_states()
print('Train Score: ', trainScore)
valScore = model.evaluate(x_val, y_val, verbose=0)
model.reset_states()
print('Validation Score: ', valScore)
testScore = model.evaluate(x_test, y_test, verbose=0)
model.reset_states()
print('Test Score: ', testScore)
p = model.predict(x_test)
print(mean_squared_error(y_test, p))
import matplotlib.pyplot as pplt
pplt.plot(y_test)
pplt.plot(p)
pplt.legend(['testY', 'p'], loc='upper right')
pplt.show()
