Hello. I'm trying to use a synthetic-data package for a time series GAN. At first I assumed that feeding in integers would produce integer output, but it didn't: the output is decimal numbers. I'm using ydata-synthetic (https://github.com/ydataai/ydata-synthetic).
Here is my code for generating the data; please help.
#Importing the required libs for the exercise
#Importing the required libs for the exercise
from os import path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ydata_synthetic.synthesizers import ModelParameters
from ydata_synthetic.preprocessing.timeseries import processed_stock
from ydata_synthetic.synthesizers.timeseries import TimeGAN
import torch
arr_data = np.random.randint(0,600000,(100,1))
#Specific to TimeGANs
#stock_data
seq_len=20
n_seq = 1 #number of columns
hidden_dim=24
gamma=1
noise_dim = 32
dim = 128
batch_size = len(arr_data) - seq_len
log_step = 100
learning_rate = 5e-4
gan_args = ModelParameters(batch_size=batch_size,
lr=learning_rate,
noise_dim=noise_dim,
layers_dim=dim)
lst_temp = []
for i in range(0, len(arr_data) - seq_len):
    _x = arr_data[i:i + seq_len]  # sliding window of seq_len consecutive values
    lst_temp.append(_x)
tens_rand_data = torch.tensor(lst_temp)
lst_rand_data = tens_rand_data.numpy()
synth = TimeGAN(model_parameters=gan_args, hidden_dim=24, seq_len=seq_len, n_seq=n_seq, gamma=1)
synth.train(lst_rand_data, train_steps=10)
synth_data = synth.sample(len(lst_rand_data))
print(synth_data.shape)
cols = ['Car price']
for j, col in enumerate(cols):
    df = pd.DataFrame({'Real': lst_rand_data[-1][:, j], 'Synthetic': synth_data[-1][:, j]})
    df.plot(title="Car price", secondary_y='Synthetic', style=['-', '--'])  # secondary_y must name the 'Synthetic' column
    print(df)
Your input should be processed with a MinMaxScaler before being fed into TimeGAN, and you will always receive decimal output between 0 and 1 due to the sigmoid activation on the last layer of its generator.
You can change your code in 2 ways:
1. Change your input from integers to decimals in the range [0, 1]:
arr_data = np.random.randint(0,600000,(100,1))
into
arr_data = np.random.uniform(0,1,(100,1))
This way your dummy input doesn't need to be scaled, since it's already in [0, 1].
2. Use MinMaxScaler to scale your data:
from sklearn.preprocessing import MinMaxScaler
arr_data = np.random.randint(0,600000,(100,1))
scaler = MinMaxScaler(feature_range = (0,1))
scaled_data = scaler.fit_transform(arr_data)
...
Please note that you will always receive decimal output in [0, 1] when using TimeGAN. If you want to map the synthetic data back to integers, consider using the scaler's inverse transform.
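For example, a minimal sketch of the round trip (this assumes the scaler fitted above and the synth_data samples from your snippet; the rounding step is only needed if you want integers back):
from sklearn.preprocessing import MinMaxScaler
import numpy as np
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(arr_data)             # fit on the original integer data
# ... train TimeGAN on scaled_data and sample synth_data ...
synth_window = synth_data[-1]                            # one synthetic window, shape (seq_len, n_seq)
synth_original = scaler.inverse_transform(synth_window)  # back to the original value range
synth_int = np.rint(synth_original).astype(int)          # round to integers if desired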
I found some code on SO which seems to work quite well.
This code, directly below, produces the plot, also below.
from sklearn import datasets
from sklearn import cluster
import plotly
plotly.offline.init_notebook_mode()
iris = datasets.load_iris()
kmeans = cluster.KMeans(n_clusters=5, random_state=42).fit(iris.data[:,0:1])
data = [plotly.graph_objs.Scatter(x=iris.data[:,0],
y=iris.data[:,1],
mode='markers',
marker=dict(color=kmeans.labels_)
)]
plotly.offline.iplot(data)
Now, I make a simple substitution in the code, to point to my own data, like this.
from sklearn import datasets
from sklearn import cluster
import plotly
plotly.offline.init_notebook_mode()
x = df[['Spend']]
y = df[['Revenue']]
kmeans = cluster.KMeans(n_clusters=5, random_state=42).fit(x,y)
data = [plotly.graph_objs.Scatter(x=df[['Spend']],
y=df[['Revenue']],
mode='markers',
marker=dict(color=kmeans.labels_))]
plotly.offline.iplot(data)
That gives me this plot.
Here is my data frame.
# Import pandas library
import pandas as pd
# initialize list of lists
data = [[110,'CHASE CENTER',53901,8904,44997,4], [541,'METS STADIUM',57999,4921,53078,1], [538,'DEN BRONCOS',91015,9945,81070,1], [640,'LAMBEAU WI',76214,5773,70441,3], [619,'SAL AIRPORT',93000,8278,84722,5]]
# Create the pandas DataFrame
df = pd.DataFrame(data, columns = ['Location', 'Location_Description', 'Revenue','Spend','Profit_Or_Loss','cluster_number'])
# print dataframe.
df
I must be missing something silly, but I don't see what it is.
You have a problem with the dimension:
# In the iris dataset
>>> iris.data[:,0].shape
(150,)
# Your data
>>> x.shape
(5, 1)
# You need to flatten your array
x.values.flatten().shape
(5,)
For example:
from sklearn import datasets
from sklearn import cluster
import plotly
plotly.offline.init_notebook_mode()
x = df[['Spend']]
y = df[['Revenue']]
x_flat = x.values.flatten()
y_flat = y.values.flatten()
kmeans = cluster.KMeans(n_clusters=5, random_state=42).fit(x)
data = [plotly.graph_objs.Scatter(x=x_flat,
y=y_flat,
mode='markers',
marker=dict(color=kmeans.labels_))]
plotly.offline.iplot(data)
On the other hand, cluster.KMeans.fit accepts a single array X of shape (n_samples, n_features), not two arrays as you are passing (the second argument y is accepted only for API consistency and is ignored). You're going to have to combine them into something of shape (n_samples, n_features):
import numpy as np
X = np.zeros((x_flat.shape[0], 2))
X[:, 0] = x_flat
X[:, 1] = y_flat
# X.shape -> (5, 2)
kmeans = cluster.KMeans(n_clusters=5, random_state=42).fit(X)
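Equivalently, np.column_stack builds the same (n_samples, n_features) matrix in one call; continuing from the block above:
X = np.column_stack([x_flat, y_flat])   # shape (5, 2), identical to the zeros-and-assign version
kmeans = cluster.KMeans(n_clusters=5, random_state=42).fit(X)
One caveat with this particular data frame: it has only 5 rows, so with n_clusters=5 every point ends up in its own cluster; you would need more data (or fewer clusters) for the clustering to be meaningful.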
I want to make a forward forecast for a monthly time series of air pollution data, e.g. an estimate of the air pollution index 3-6 months ahead. I tried scikit-learn models, and fitting the data to a model works fine. But what I want is a forward-period estimate: what the pollution index will be 6 months from now. In my current attempt I can train the model with scikit-learn, but I don't know how to do the forward forecasting in Python. What should I do? Can anyone suggest a possible workaround?
my attempt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error  # note: accuracy_score is for classification and fails on continuous targets
from sklearn.linear_model import BayesianRidge
url = "https://gist.githubusercontent.com/jerry-shad/36912907ba8660e11cd27be0d3e30639/raw/424f0891dc46d96cd5f867f3d2697777ac984f68/pollution.csv"
df = pd.read_csv(url, parse_dates=['date'])
df.drop(columns=['Unnamed: 0'], inplace=True)
resultsDict={}
predictionsDict={}
split_date = '2017-12-01'
df_training = df.loc[df['date'] <= split_date]
df_test = df.loc[df['date'] > split_date]
df_tr = df_training.drop(['pollution_index', 'date'], axis=1)  # keep only the feature columns
df_te = df_test.drop(['pollution_index', 'date'], axis=1)
scaler = StandardScaler()
scaler.fit(df_tr)
X_train = scaler.transform(df_tr)
y_train = df_training['pollution_index']
X_test = scaler.transform(df_te)
y_test = df_test['pollution_index']
X_train_df = pd.DataFrame(X_train,columns=df_tr.columns)
X_test_df = pd.DataFrame(X_test,columns=df_te.columns)
reg = BayesianRidge()
reg.fit(X_train, y_train)
yhat = reg.predict(X_test)
resultsDict['BayesianRidge'] = mean_squared_error(df_test['pollution_index'], yhat)
Update 2
Here is my attempt using an ARMA model:
from tqdm import tqdm
from statsmodels.tsa.arima_model import ARMA  # legacy API, removed in newer statsmodels releases
index = len(df_training)
yhat = list()
for t in tqdm(range(len(df_test['pollution_index']))):
    temp_train = df[:len(df_training) + t]
    model = ARMA(temp_train['pollution_index'], order=(1, 1))
    model_fit = model.fit(disp=False)
    predictions = model_fit.predict(start=len(temp_train), end=len(temp_train), dynamic=False)
    yhat = yhat + [predictions]
yhat = pd.concat(yhat)
resultsDict['ARMA'] = evaluate(df_test['pollution_index'], yhat.values)  # evaluate() is an error-metric helper defined elsewhere in my code
But this doesn't help me make a forward forecast of the series. What I want is the estimated pollution_index 3-6 months ahead. Can anyone suggest a possible workaround, or how to overcome the limitations of my current attempt?
Update: goal
To clarify: I am not asking which model or approach works best. What I am trying to figure out is how to make a reliable forward forecast for a given time series (the pollution index), and how to correct my current attempt if it is not ready for forward-period estimation.
Update: desired output
Here is a sketch of the forecasting plot that I want to produce:
In order to obtain your desired output, I think you need a model that can return the standard deviation of the predicted value, so I adopt Gaussian process regression. From the code you provided in your post, I don't see how this is a time series forecasting task, so in my solution below I treat it as an ordinary regression task.
First, prepare the data
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
url = "https://gist.githubusercontent.com/jerry-shad/36912907ba8660e11cd27be0d3e30639/raw/424f0891dc46d96cd5f867f3d2697777ac984f68/pollution.csv"
df = pd.read_csv(url,parse_dates=['date'])
df.drop(columns=['Unnamed: 0'],axis=1,inplace=True)
# sort the dataframe by date and reset the index
df = df.sort_values(by='date').reset_index(drop=True)
# after sorting the dataframe, split the dataframe
split_date ='2017-12-01'
df_training = df.loc[(df.date <= split_date).values]
df_test = df.loc[(df.date > split_date).values]
# drop the date column
df_training.drop(columns=['date'],axis=1,inplace=True)
df_test.drop(columns=['date'],axis=1,inplace=True)
y_train = df_training['pollution_index']
y_test = df_test['pollution_index']
df_training = df_training.drop(['pollution_index'], axis=1)  # drop() returns a copy, so assign it back
df_test = df_test.drop(['pollution_index'], axis=1)
scaler = StandardScaler()
scaler.fit(df_training)
X_train = scaler.transform(df_training)
X_test = scaler.transform(df_test)
X_train_df = pd.DataFrame(X_train,columns=df_training.columns)
X_test_df = pd.DataFrame(X_test,columns=df_test.columns)
with the dataframes prepared above, you can train a GaussianProcessRegressor and make predictions by
gpr = GaussianProcessRegressor(normalize_y=True).fit(X_train_df,y_train)
pred,std = gpr.predict(X_test_df,return_std=True)
in which std is an array of standard deviations in the predicted values. Then, you can plot the data by
import numpy as np
from matplotlib import pyplot as plt
fig,ax = plt.subplots(figsize=(12,8))
plot_start = 225
# plot the training data
ax.plot(y_train.index[plot_start:],y_train.values[plot_start:],'navy',marker='o',label='observed')
# plot the test data
ax.plot(y_test.index,y_test.values,'navy',marker='o')
ax.plot(y_test.index,pred,'darkgreen',marker='o',label='pred')
sigma = std  # gpr.predict already returns standard deviations, so no square root is needed
ax.fill(np.concatenate([y_test.index, y_test.index[::-1]]),
        np.concatenate([pred - 1.96*sigma, (pred + 1.96*sigma)[::-1]]),
        alpha=.5, fc='silver', ec='tomato', label='95% confidence interval')
ax.legend(loc='upper left',prop={'size':16})
the output plot looks like
UPDATE
I assumed pollution_index is something that can be predicted from 'dew', 'temp', 'press', 'wnd_spd' and 'rain'. If you want one-step-ahead forecasting, here is what you can do:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_model import ARIMA  # legacy API, removed in newer statsmodels releases
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
url = "https://gist.githubusercontent.com/jerry-shad/36912907ba8660e11cd27be0d3e30639/raw/424f0891dc46d96cd5f867f3d2697777ac984f68/pollution.csv"
df = pd.read_csv(url,parse_dates=['date'])
df.drop(columns=['Unnamed: 0'],axis=1,inplace=True)
# sort the dataframe by date and reset the index
df = df.sort_values(by='date').reset_index(drop=True)
# after sorting the dataframe, split the dataframe
split_date ='2017-12-01'
df_training = df.loc[(df.date <= split_date).values]
df_test = df.loc[(df.date > split_date).values]
# extract the relevant info
train_date,train_polltidx = df_training['date'].values,df_training['pollution_index'].values
test_date,test_polltidx = df_test['date'].values,df_test['pollution_index'].values
# train an ARIMA model
model = ARIMA(train_polltidx,order=(1,1,1))
model_fit = model.fit(disp=0)
# you can predict as many steps as you want; here I only predict len(test_date) steps
forecast,stderr,conf = model_fit.forecast(len(test_date))
# plot the result
fig,ax = plt.subplots(figsize=(12,8))
plot_start = 225
# plot the training data
plt.plot(train_date[plot_start:],train_polltidx[plot_start:],'navy',marker='o',label='observed')
# plot the test data
plt.plot(test_date,test_polltidx,'navy',marker='o')
plt.plot(test_date,forecast,'darkgreen',marker='o',label='pred')
# ax.errorbar(np.arange(len(pred)),pred,std,fmt='r')
plt.fill(np.concatenate([test_date,test_date[::-1]]),
np.concatenate((conf[:,0],conf[:,1][::-1])),
alpha=.5,fc='silver',ec='tomato',label='95% confidence interval')
plt.legend(loc='upper left',prop={'size':16})
ax = plt.gca()
ax.set_xlim([df_training['date'].values[plot_start],df_test['date'].values[-1]])
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.gcf().autofmt_xdate()
plt.show()
The output figure is
Clearly, the prediction is very bad, because I haven't done any preprocessing to the training data.
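As a hedged sketch of one possible preprocessing step (standardizing the series before fitting and inverting the scaling on the forecast; this is one option among many, and the variable names follow the block above):
from sklearn.preprocessing import StandardScaler
# standardize the training series; fit the scaler on training data only
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_polltidx[:, None]).ravel()
# fit the same ARIMA model on the standardized series
model = ARIMA(train_scaled, order=(1, 1, 1))
model_fit = model.fit(disp=0)
forecast_scaled, stderr, conf = model_fit.forecast(len(test_date))
# map the forecast back to the original scale
forecast = scaler.inverse_transform(forecast_scaled[:, None]).ravel()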
UPDATE 2
Since I'm not familiar with ARIMA, I implement one-step forecasting using GaussianProcessRegressor with the help of this wonderful post.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.preprocessing import StandardScaler
url = "https://gist.githubusercontent.com/jerry-shad/36912907ba8660e11cd27be0d3e30639/raw/424f0891dc46d96cd5f867f3d2697777ac984f68/pollution.csv"
df = pd.read_csv(url,parse_dates=['date'])
df.drop(columns=['Unnamed: 0'],axis=1,inplace=True)
# sort the dataframe by date and reset the index
df = df.sort_values(by='date').reset_index(drop=True)
# after sorting the dataframe, split the dataframe
split_date ='2017-12-01'
df_training = df.loc[(df.date <= split_date).values]
df_test = df.loc[(df.date > split_date).values]
# extract the relevant info
train_date,train_polltidx = df_training['date'].values,df_training['pollution_index'].values[:,None]
test_date,test_polltidx = df_test['date'].values,df_test['pollution_index'].values[:,None]
# preprocessing
scalar = StandardScaler()
scalar.fit(train_polltidx)
train_polltidx = scalar.transform(train_polltidx)
test_polltidx = scalar.transform(test_polltidx)
def series_to_supervised(data, n_in, n_out):
    """Frame a series as a supervised problem: n_in lag columns, n_out lead columns."""
    df = pd.DataFrame(data)
    cols = list()
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))      # lagged inputs t-n_in ... t-1
    for i in range(0, n_out):
        cols.append(df.shift(-i))     # outputs t ... t+n_out-1
    agg = pd.concat(cols, axis=1)
    agg.dropna(inplace=True)          # drop rows with incomplete windows
    return agg.values
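# quick sanity check of series_to_supervised on a toy array (illustrative only):
#   series_to_supervised(np.arange(6).reshape(-1, 1), 1, 1)
#   -> [[0., 1.], [1., 2.], [2., 3.], [3., 4.], [4., 5.]]
# each row is [value at t-1, value at t]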
months_look_back = 1
# train
pollt_series = series_to_supervised(train_polltidx,months_look_back,1)
x_train,y_train = pollt_series[:,:months_look_back],pollt_series[:,-1]
# test
pollt_series = series_to_supervised(test_polltidx,months_look_back,1)
x_test,y_test = pollt_series[:,:months_look_back],pollt_series[:,-1]
print("The first %i months in the test set won't be predicted." % months_look_back)
def walk_forward_validation(x_train, y_train, x_test, y_test):
    predictions = []
    history_x = x_train.tolist()
    history_y = y_train.tolist()
    for rep, target in zip(x_test, y_test):
        # retrain the model on everything seen so far
        gpr = GaussianProcessRegressor(alpha=1e-4, normalize_y=False).fit(history_x, history_y)
        pred, std = gpr.predict([rep], return_std=True)
        predictions.append([pred, std])
        # add the true observation to the history before the next step
        history_x.append(rep)
        history_y.append(target)
    return predictions
predictions = walk_forward_validation(x_train,y_train,x_test,y_test)
pred_test,pred_std = zip(*predictions)
# map everything back to the original scale
pred_test = scalar.inverse_transform(pred_test)
# a standard deviation is only multiplied by the scale factor; adding the mean back would be wrong
pred_std = np.asarray(pred_std) * scalar.scale_
train_polltidx = scalar.inverse_transform(train_polltidx)
test_polltidx = scalar.inverse_transform(test_polltidx)
# plot the result
fig,ax = plt.subplots(figsize=(12,8))
plot_start = 100
# plot the training data
plt.plot(train_date[plot_start:],train_polltidx[plot_start:],'navy',marker='o',label='observed')
# plot the test data
plt.plot(test_date[months_look_back:],test_polltidx[months_look_back:],'navy',marker='o')
plt.plot(test_date[months_look_back:],pred_test,'darkgreen',marker='o',label='pred')
sigma = pred_std  # predict already returns standard deviations; no square root needed
ax.fill(np.concatenate([test_date[months_look_back:], test_date[months_look_back:][::-1]]),
        np.concatenate([pred_test - 1.96*sigma, (pred_test + 1.96*sigma)[::-1]]),
        alpha=.5, fc='silver', ec='tomato', label='95% confidence interval')
plt.legend(loc='upper left',prop={'size':16})
ax = plt.gca()
ax.set_xlim([df_training['date'].values[plot_start],df_test['date'].values[-1]])
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.gcf().autofmt_xdate()
plt.show()
The idea of this script is to cast the time series forecasting task as a supervised regression task. plot_start is a parameter that controls the index from which we start plotting; clearly plot_start cannot be greater than the length of the training data. The output figure of the script is
as you can see, the first month in the test dataset is not predicted, because we need to look back one month to make a prediction.
In order to make predictions further into unseen territory, based on this post on the CV site, you can feed the predicted value from the last step back in as an observation. Here is how you can do it:
unseen_dates = pd.date_range(test_date[-1], periods=180, freq='D').values
months_to_predict = 1  # predict one step ahead per iteration
all_data = series_to_supervised(df['pollution_index'].values, months_look_back, months_to_predict)
def predict_unseen(unseen_dates, all_data, days_look_back):
    predictions = []
    history_x = all_data[:, :days_look_back].tolist()
    history_y = all_data[:, -1].tolist()
    inds = np.arange(unseen_dates.shape[0])
    for ind in inds:
        # retrain on everything seen (and predicted) so far
        gpr = GaussianProcessRegressor(alpha=1e-2, normalize_y=False).fit(history_x, history_y)
        # one sample with days_look_back features, hence shape (1, days_look_back)
        rep = np.array(history_y[-days_look_back:]).reshape(1, days_look_back)
        pred, std = gpr.predict(rep, return_std=True)
        predictions.append([pred, std])
        # feed the prediction back in as if it were an observation
        history_x.append(history_y[-days_look_back:])
        history_y.append(float(pred))
    return predictions
predictions = predict_unseen(unseen_dates,all_data,days_look_back=1)
pred_test,pred_std = zip(*predictions)
fig,ax = plt.subplots(figsize=(12,8))
plot_start = 100
# plot the test data
plt.plot(unseen_dates,pred_test,'navy',marker='o')
pred_test = np.ravel(pred_test)
sigma = np.ravel(pred_std)  # predict already returns standard deviations; no square root needed
ax.fill(np.concatenate([unseen_dates, unseen_dates[::-1]]),
        np.concatenate([pred_test - 1.96*sigma, (pred_test + 1.96*sigma)[::-1]]),
alpha=.5,fc='silver',ec='tomato',label='95% confidence interval')
plt.legend(loc='upper left',prop={'size':16})
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.gcf().autofmt_xdate()
plt.show()
One very important thing to note: The timestep of the real data is a month, using such data to make predictions about days may not be correct.
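If you want the horizon to match the monthly resolution of the data, one small adjustment (a sketch; 'MS' is pandas' month-start frequency) is to generate monthly dates instead:
unseen_dates = pd.date_range(test_date[-1], periods=6, freq='MS').values  # six monthly steps ahead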
The model you have built links what you are trying to model, 'pollution_index', to some input variables, in your case ['dew', 'temp', 'press', 'wnd_spd', 'rain']. So to predict pollution_index into the future using your model, at a high level you need to estimate what these variables will be over the next 3-6 months, and then run your model on them. Practically, you need to come up with something that looks like X_test but holds your projections of these variables for the future, and then call:
yhat = reg.predict(X_test)
... to produce the model estimate of where the pollution_index will be. Hope this makes sense. This gives you a "mechanical" ability to use your model for prediction.
For example, following up on your main example where reg is BayesianRidge() that you fit, we would do the following:
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
# Here we load your predictions for input variables
# I stubbed it with some random data
df_predict_data = StringIO(
"""
date,dew,temp,press,wnd_spd,rain
2021-01-01,59,28,16,0.78,98.7
2021-02-01,68,32,18,0.79,46.1
2021-03-01,75,34,20,0.81,91.5
2021-04-01,63,31,16,0.83,19.1
2021-05-01,74,38,19,0.83,21.8
2021-06-01,65,32,17,0.85,35.4
""")
df_predict = pd.read_csv(df_predict_data, index_col = 'date')
# scale it using the same scaler you used in training
X_predict = scaler.transform(df_predict)
# predict pollution_index
y_predict = reg.predict(X_predict)
# plot it
plt.plot(df_predict.index, y_predict, '.-')
So we get this:
Whether the linear regression you built is a good model for such a prediction is a completely different question. As @Sergey Bushmanov mentioned, there is a vast literature on forecasting and on which models are best for which task, and this thread is probably not the right place to debate that aspect of your question.
I wrote a Python script that uses scikit-learn to fit Gaussian processes to some data.
IN SHORT: the problem I am facing is that while the Gaussian processes seem to learn the training dataset very well, the predictions for the testing dataset are off, and it seems to me there is a normalization problem behind this.
IN DETAIL: my training dataset is a set of 1500 time series. Each time series has 50 time components. The mapping learnt by the Gaussian processes is between a set of three coordinates x, y, z (which represent the parameters of my model) and one time series. In other words, there is a 1:1 mapping between x, y, z and one time series, and the GPs learn this mapping. The idea is that, by giving the trained GPs new coordinates, they should be able to give me the predicted time series associated with those coordinates.
Here is my code:
from __future__ import division
import numpy as np
from matplotlib import pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
coordinates_training = np.loadtxt(...) # read coordinates training x, y, z from file
coordinates_testing = np.loadtxt(..) # read coordinates testing x, y, z from file
# z-score of the coordinates for the training and testing data.
# Note I am using the mean and std of the training dataset ALSO to normalize the testing dataset
mean_coords_training = np.zeros(3)
std_coords_training = np.zeros(3)
for i in range(3):
    mean_coords_training[i] = coordinates_training[:, i].mean()
    std_coords_training[i] = coordinates_training[:, i].std()
    coordinates_training[:, i] = (coordinates_training[:, i] - mean_coords_training[i])/std_coords_training[i]
    coordinates_testing[:, i] = (coordinates_testing[:, i] - mean_coords_training[i])/std_coords_training[i]
time_series_training = np.loadtxt(...)# reading time series of training data from file
number_of_time_components = np.shape(time_series_training)[1] # 50 time components per series
# z_score of the time series
mean_time_series_training = np.zeros(number_of_time_components)
std_time_series_training = np.zeros(number_of_time_components)
for i in range(number_of_time_components):
    mean_time_series_training[i] = time_series_training[:, i].mean()
    std_time_series_training[i] = time_series_training[:, i].std()
    time_series_training[:, i] = (time_series_training[:, i] - mean_time_series_training[i])/std_time_series_training[i]
time_series_testing = np.loadtxt(...)# reading test data from file
# the number of time components is the same for training and testing dataset
# z-score of testing data, again using mean and std of training data
for i in range(number_of_time_components):
    time_series_testing[:, i] = (time_series_testing[:, i] - mean_time_series_training[i])/std_time_series_training[i]
# GPs
pred_time_series_training = np.zeros((np.shape(time_series_training)))
pred_time_series_testing = np.zeros((np.shape(time_series_testing)))
# Instantiate a Gaussian Process model
kernel = 1.0 * Matern(nu=1.5)
gp = GaussianProcessRegressor(kernel=kernel)
for i in range(number_of_time_components):
    print("time component", i)
    # Fit to data using Maximum Likelihood Estimation of the parameters
    gp.fit(coordinates_training, time_series_training[:, i])
    # Make the prediction (ask for the standard deviation as well)
    y_pred_train, sigma_train = gp.predict(coordinates_training, return_std=True)
    y_pred_test, sigma_test = gp.predict(coordinates_testing, return_std=True)
    # undo the z-scoring of the time series
    pred_time_series_training[:, i] = y_pred_train*std_time_series_training[i] + mean_time_series_training[i]
    pred_time_series_testing[:, i] = y_pred_test*std_time_series_training[i] + mean_time_series_training[i]
# plot training
fig, ax = plt.subplots(5, figsize=(10, 20))
for i in range(5):
    ax[i].plot(time_series_training[100*i], color='blue', label='Original training')
    ax[i].plot(pred_time_series_training[100*i], color='black', label='GP predicted - training')
# plot testing
fig, ax = plt.subplots(5, figsize=(10, 20))
for i in range(5):
    ax[i].plot(time_series_testing[100*i], color='blue', label='Original testing')
    ax[i].plot(pred_time_series_testing[100*i], color='black', label='GP predicted - testing')
Here are examples of the performance on the training data.
Here are examples of the performance on the testing data.
First, you should use scikit-learn's preprocessing tools to treat your data:
from sklearn.preprocessing import StandardScaler
There are other useful preprocessing tools, but this specific one normalizes the data.
Second, you should normalize the training set and the test set with the same parameters. The model fits the "geometry" of the data to define its parameters; if you evaluate it on data in a different scale, it is like using the wrong system of units.
scale = StandardScaler()
training_set = scale.fit_transform(data_train)
test_set = scale.transform(data_test)
This applies the same transformation to both sets.
Finally, you need to normalize the features, not the target; that is, normalize the X entries, not the Y output. Normalization helps the model find the answer faster by changing the topology of the objective function during optimization; the output does not affect this.
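Applied to your script, a minimal sketch could look like this; it replaces the manual per-column z-scoring of the coordinates, and the random arrays are only stand-ins for the arrays you load from file:
import numpy as np
from sklearn.preprocessing import StandardScaler
# stand-ins for the coordinate arrays read from file in the question
coordinates_training = np.random.rand(1500, 3)
coordinates_testing = np.random.rand(200, 3)
scale = StandardScaler()
coordinates_training = scale.fit_transform(coordinates_training)  # fit on the training set only
coordinates_testing = scale.transform(coordinates_testing)        # reuse the training mean/std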
I hope this answers your question.
Hi,
I have a dataset from different cohorts and I want to bicluster it with the sklearn function SpectralBiclustering.
As you can see in the documentation, this approach uses a kind of normalization to calculate the SVD.
Is it necessary to normalize the data before biclustering, e.g. with StandardScaler (zero mean and unit standard deviation), given that the function itself already applies a kind of normalization?
Is that enough, or do I have to normalize beforehand, e.g. when the data come from different distributions?
I get different results with and without StandardScaler, and I cannot find information in the original paper on whether it is necessary or not.
Below are the code and an example of my dataset. This is real data, so I do not know the ground truth. At the end I calculate the consensus score to compare the two biclusterings. Unfortunately the clusters are not the same.
I also tried it with artificial data (see the example in the last link), and there the results are the same, but not with the real data.
So how do I know which approach is the right one?
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import SpectralBiclustering  # sklearn.cluster.bicluster was removed in newer scikit-learn
from sklearn.metrics import consensus_score
from sklearn.preprocessing import StandardScaler
n_clusters = (4, 4)
data_org = pd.read_csv('raw_data_biclustering.csv', sep=',', index_col=0)
# scale data & transform to dataframe
data_scaled = StandardScaler().fit_transform(data_org)
data_scaled = pd.DataFrame(data_scaled, columns=data_org.columns, index=data_org.index)
# plot original clusters
plt.imshow(data_scaled, aspect='auto', vmin=-3, vmax=5)
plt.title("Original dataset")
plt.show()
data_type = ['none_scaled', 'scaled']
data_all = [data_org, data_scaled]
models_all = []
for name, data in zip(data_type, data_all):
    # spectral biclustering on the dataset
    model = SpectralBiclustering(n_clusters=n_clusters, method='bistochastic',
                                 svd_method='randomized', n_jobs=-1,
                                 random_state=0)
    model.fit(data)
    # order the rows and columns by their cluster labels
    newOrder_row = [list(r) for r in zip(model.row_labels_, data.index)]
    newOrder_row.sort(key=lambda k: (k[0], k[1]), reverse=False)
    order_row = [i[1] for i in newOrder_row]
    newOrder_col = [list(c) for c in zip(model.column_labels_, [int(x) for x in data.keys()])]
    newOrder_col.sort(key=lambda k: (k[0], k[1]), reverse=False)
    order_col = [i[1] for i in newOrder_col]
    # reorder the data matrix
    X_plot = data_scaled.copy()
    X_plot = X_plot.reindex(order_row)            # rows
    X_plot = X_plot[[str(x) for x in order_col]]  # columns
    # use clustermap without clustering
    cm = sns.clustermap(X_plot, method=None, metric=None, cmap='viridis',
                        row_cluster=False, row_colors=None,
                        col_cluster=False, col_colors=None,
                        yticklabels=1, xticklabels=1,
                        standard_scale=None, z_score=None, robust=False,
                        vmin=-3, vmax=5)
    ax = cm.ax_heatmap
    # set labelsize smaller
    cm_ax = plt.gcf().axes[-2]
    cm_ax.tick_params(labelsize=5.5)
    # plot lines for the different clusters
    hor_lines = [sum(item) for item in model.biclusters_[0]]
    hor_lines = list(np.cumsum(hor_lines[::n_clusters[1]]))
    ver_lines = [sum(item) for item in model.biclusters_[1]]
    ver_lines = list(np.cumsum(ver_lines[:n_clusters[0]]))
    for pp in range(len(hor_lines) - 1):
        cm.ax_heatmap.hlines(hor_lines[pp], 0, X_plot.shape[1], colors='r')
    for pp in range(len(ver_lines) - 1):
        cm.ax_heatmap.vlines(ver_lines[pp], 0, X_plot.shape[0], colors='r')
    # title
    title = name + ' - ' + str(n_clusters[1]) + '-' + str(n_clusters[0])
    plt.title(title)
    cm.savefig(title, dpi=300)
    plt.show()
    # save models
    models_all.append(model)
# compare models
score = consensus_score(models_all[0].biclusters_, models_all[1].biclusters_)
print("consensus score between: {:.1f}".format(score))