ARIMA model in Python

I am using ARIMA to do forecasting in Python. The following is my code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
HSBC = pd.read_csv('HSBC.csv', index_col='Date', parse_dates=True)
HSBC2 = HSBC['Close']
result = seasonal_decompose(HSBC2, model='multiplicative', period=1)
from pmdarima import auto_arima
import warnings
warnings.filterwarnings("ignore")
stepwise_fit = auto_arima(HSBC2, start_p=1, start_q=1,
                          max_p=3, max_q=3, m=12,
                          start_P=0, seasonal=True,
                          d=None, D=1, trace=True,
                          error_action='ignore',
                          suppress_warnings=True,
                          stepwise=True)
train = HSBC2[0:173]
test = HSBC2[173:248]
model = SARIMAX(train, order=(0, 1, 1), seasonal_order=(0, 1, 1, 12))
result = model.fit()
start = len(train)
end = len(train) + len(test) - 1
predictions = result.predict(start, end,
                             typ='levels').rename("Predictions")
predictions.plot(legend=True)
test.plot(legend=True)
I am confused as to why the x-axis of the prediction plot shows numbers, when it should show dates like the test plot does.

If I am not wrong, this is because you have not specified the frequency of your index. Try this:
HSBC.index = pd.date_range(freq='d', start=HSBC.index[0], periods=len(HSBC))
Beware that you should only use freq='d' if your index is daily spaced.
EDIT:
So, the answer was just to change the start and end parameters of the predict method, e.g.:
start = test.index[0]
end = test.index[-1]
prediction = result.predict(start, end,
                            typ = 'levels').rename("Predictions")
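Putting the two fixes together, a minimal sketch (assuming the 'Date' column in HSBC.csv is daily spaced; use a different freq otherwise) would be:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
# assumed file layout: a 'Date' column and a 'Close' column, daily spaced
HSBC = pd.read_csv('HSBC.csv', index_col='Date', parse_dates=True)
HSBC.index = pd.date_range(freq='d', start=HSBC.index[0], periods=len(HSBC))
close = HSBC['Close']
train, test = close[:173], close[173:248]
model = SARIMAX(train, order=(0, 1, 1), seasonal_order=(0, 1, 1, 12))
result = model.fit()
# passing dates instead of integer positions keeps the DatetimeIndex
# on the returned predictions, so the plot's x-axis is labelled with dates
prediction = result.predict(start=test.index[0], end=test.index[-1]).rename("Predictions")
prediction.plot(legend=True)
test.plot(legend=True)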

Related

for loop having issue with holt-winters exponential smoothing

When I run my individual models with different training and test data, the models work fine. I wanted to run a for loop and now I am getting an error, and I am not sure why.
I have created several time splits to check how the model is performing with different data breakdowns.
# dataframe operations - pandas
import pandas as pd
# plotting data - matplotlib
from matplotlib import pyplot as plt
# time series - statsmodels
# Seasonality decomposition
from statsmodels.tsa.seasonal import seasonal_decompose
# holt winters
# single exponential smoothing
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
# double and triple exponential smoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from numpy import sqrt
from sklearn.metrics import mean_squared_error
df = pd.read_csv('/content/hw-cv-imputed.csv', index_col='date', parse_dates=True)
df.index.freq = 'W-FRI'
# finding shape of the dataframe
print(df.shape)
# having a look at the data
print(df.head())
# plotting the original data
df[['visits']].plot(title='visit Data')
# Splitting according to the above description
train1, test1 = df.iloc[:52, 0], df.iloc[52:62, 0]
train2, test2 = df.iloc[:56, 0], df.iloc[56:66, 0]
train3, test3 = df.iloc[:60, 0], df.iloc[60:70, 0]
train4, test4 = df.iloc[:65, 0], df.iloc[65:75, 0]
train5, test5 = df.iloc[:69, 0], df.iloc[69:79, 0]
train6, test6 = df.iloc[:73, 0], df.iloc[73:83, 0]
train7, test7 = df.iloc[:78, 0], df.iloc[78:88, 0]
train8, test8 = df.iloc[:82, 0], df.iloc[82:90, 0]
total_model_parameters = pd.DataFrame(columns=['Total', 'Parameters'])
# Split into train and test set
#train_df = train1
#test_df = test1
from sklearn.model_selection import ParameterGrid
for train_df, test_df in [('train1','test1'),('train2','test2'),('train3','test3'),('train4','test4'),('train5','test5'),('train6','test6'),('train7','test7')]:
    params_grid = {'trend': ('mul','add'),
                   'seasonal': ('mul','add'),
                   'seasonal_periods': [10, 12]}
    grid = ParameterGrid(params_grid)
    cnt = 0
    for p in grid:
        cnt = cnt + 1
    print('Total Possible Models', cnt)
    model_parameters = pd.DataFrame(columns=['Total', 'Parameters'])
    for p in grid:
        test = pd.DataFrame()
        print(p)
        fitted_model = ExponentialSmoothing(train_df, trend=p['trend'], seasonal=p['seasonal'], seasonal_periods=p['seasonal_periods']).fit()
        test_predictions = fitted_model.forecast(10)
        df_new = pd.concat((test_df, test_predictions.rename('predicted_visits'), (((test_df - test_predictions) / test_df) * 100).rename('error')), axis=1)
        def accuracy(row):
            if abs(row['error']) < 20:
                return 1
            return 0
        df_new['accuracy'] = df_new.apply(lambda row: accuracy(row), axis=1)
        Total = df_new['accuracy'].sum()
        print('Accuracy------------------------------------', Total)
        model_parameters = model_parameters.append({'Total': Total, 'Parameters': p}, ignore_index=True)
    parameters = model_parameters.sort_values(by=['Total'], ascending=False)
    parameters = parameters.reset_index(drop=True)
    parameters.head(9)
    Parameters_1 = pd.DataFrame(parameters)
    Parameters_1
    parameters['Parameters'][0]
    total_model_parameters = total_model_parameters.append(parameters)
total_model_parameters
The error is for the line
fitted_model = ExponentialSmoothing(train_df, trend=p['trend'], seasonal=p['seasonal'], seasonal_periods=p['seasonal_periods']).fit()
ValueError: unrecognized data structures: <class 'str'> / <class 'NoneType'>
Can someone help, please? :)
p.s. The data is as follows
date visits
1/22/2021 7352070
1/29/2021 7063725
2/5/2021 9385950
2/12/2021 7851435
2/19/2021 9509640
2/26/2021 9919170
3/5/2021 9682125
3/12/2021 9597075
3/19/2021 8189835
3/26/2021 7487385
4/2/2021 8863965
4/9/2021 8856165
4/16/2021 8619345
4/23/2021 4499670
4/30/2021 3642705
5/7/2021 3105690
5/14/2021 3096330
5/21/2021 3240360
5/28/2021 5152410
6/4/2021 6471915
6/11/2021 4401030
6/18/2021 3197775
6/25/2021 2606340
7/2/2021 3248460
7/9/2021 4996425
7/16/2021 7775085
7/23/2021 9690795
7/30/2021 10041555
8/6/2021 11849055
8/13/2021 14598750
8/20/2021 15339390
8/27/2021 20118720
9/3/2021 12731115
9/10/2021 17456475
9/17/2021 20393850
9/24/2021 20537895
10/1/2021 20800935
10/8/2021 25035450
10/15/2021 22872450
10/22/2021 22790130
10/29/2021 22036965
11/5/2021 26988975
11/12/2021 29194530
11/19/2021 26106000
11/26/2021 29928660
12/3/2021 29254335
12/10/2021 32165430
12/17/2021 27303570
12/24/2021 21453585
12/31/2021 21568815
1/7/2022 21286680
1/14/2022 25589715
1/21/2022 21890130
1/28/2022 20881515
2/4/2022 24185835
2/11/2022 24160590
2/18/2022 20253360
2/25/2022 20450910
3/4/2022 26542320
3/11/2022 25540335
3/18/2022 29602380
3/25/2022 32258340
4/1/2022 24953640
4/8/2022 22872165
4/15/2022 25784490
4/22/2022 25168356
4/29/2022 25405687
5/6/2022 24693295
5/13/2022 26374944
5/20/2022 26192271
5/27/2022 26868125
6/3/2022 27948287
6/10/2022 28320595
6/17/2022 28153788
6/24/2022 27470327
7/1/2022 30520950
7/8/2022 28635750
7/15/2022 26269140
7/22/2022 24236250
7/29/2022 20541675
8/5/2022 21190020
8/12/2022 22389675
8/19/2022 24496455
8/26/2022 27555645
9/2/2022 26324760
9/9/2022 32937450
9/16/2022 36577425
9/23/2022 33522000
9/30/2022 30759780
10/7/2022 30615870
The problem is that you have quoted your variable names with ', so the line
for train_df, test_df in [('train1','test1'),...]
shouldn't have the 's.
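That is, keeping your named splits, the loop should iterate over the Series objects themselves rather than strings of their names:
# iterate over the actual train/test Series, not strings of their names
for train_df, test_df in [(train1, test1), (train2, test2), (train3, test3),
                          (train4, test4), (train5, test5), (train6, test6),
                          (train7, test7)]:
    ...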
You can do away with that line entirely if you're happy to put your pairs of training and test data into a list of tuples, like this:
import pandas as pd
from sklearn.model_selection import ParameterGrid
from statsmodels.tsa.holtwinters import ExponentialSmoothing
df = pd.read_csv("hw-cv-imputed.csv", index_col="date", parse_dates=True)
df.index.freq = "W-FRI"
# finding shape of the dataframe
print(df.shape)
# having a look at the data
print(df.head())
# plotting the original data
df[["visits"]].plot(title="visit Data")
# Splitting according to the above description
train_and_test = []
train_and_test.append((df.iloc[:52, 0], df.iloc[52:62, 0]))
train_and_test.append((df.iloc[:56, 0], df.iloc[56:66, 0]))
train_and_test.append((df.iloc[:60, 0], df.iloc[60:70, 0]))
train_and_test.append((df.iloc[:65, 0], df.iloc[65:75, 0]))
train_and_test.append((df.iloc[:69, 0], df.iloc[69:79, 0]))
train_and_test.append((df.iloc[:73, 0], df.iloc[73:83, 0]))
train_and_test.append((df.iloc[:78, 0], df.iloc[78:88, 0]))
train_and_test.append((df.iloc[:82, 0], df.iloc[82:90, 0]))
total_model_parameters = pd.DataFrame(columns=["Total", "Parameters"])
for train_df, test_df in train_and_test:
    params_grid = {
        "trend": ("mul", "add"),
        "seasonal": ("mul", "add"),
        "seasonal_periods": [10, 12],
    }
    grid = ParameterGrid(params_grid)
    cnt = 0
    for p in grid:
        cnt = cnt + 1
    print("Total Possible Models", cnt)
    model_parameters = pd.DataFrame(columns=["Total", "Parameters"])
    for p in grid:
        ...

using tf.keras.layers.Embedding for categorical variables in regression problem

Using the iris dataset as a hypothetical hello world example:
import pandas as pd
from sklearn import datasets
iris = datasets.load_iris()
df = pd.DataFrame(iris['data'], columns = iris['feature_names'])
df['iris_class'] = pd.Series(iris['target'], name = 'target_values')
df['iris_class_name'] = df['iris_class'].replace([0,1,2], ['iris-' + species for species in iris['target_names'].tolist()])
df.columns = df.columns.str.replace("[() ]", "")
print(df.head())
Let us say I want to use tf.keras.layers.Embedding instead of one-hot/dummy encoding as part of an ANN for regression, e.g.:
iris_class_name + sepalwidthcm + petallengthcm -> sepallengthcm
where sepallengthcm is the dependent variable. I came across this:
city_lookup = tf.keras.layers.StringLookup(vocabulary = city_vocabulary, mask_token = None);
city_embedding = tf.keras.Sequential([
    city_lookup,
    tf.keras.layers.Embedding(len(city_vocabulary) + 1, embedding_dimension)
], "city_embedding")
city = features["city"]
city_embedding_output = city_embedding(city)
but am not sure how to exactly use it in my use case. Any pointers very much welcome. Thanks!
You can map iris_class_name to n-dimensional vector representations and then concatenate with the other continuous features:
import pandas as pd
from sklearn import datasets
import numpy as np
import tensorflow as tf
iris = datasets.load_iris()
df = pd.DataFrame(iris['data'], columns = iris['feature_names'])
df['iris_class'] = pd.Series(iris['target'], name = 'target_values')
df['iris_class_name'] = df['iris_class'].replace([0,1,2], ['iris-' + species for species in iris['target_names'].tolist()])
df.columns = df.columns.str.replace("[() ]", "")
vocab = df['iris_class_name'].unique()
embedding_dimension = 10
lookup = tf.keras.layers.StringLookup(vocabulary = vocab, mask_token = None)
embedding = tf.keras.Sequential([
    lookup,
    tf.keras.layers.Embedding(len(vocab) + 1, embedding_dimension)
])
names = df['iris_class_name'].to_numpy()
embedding_output = embedding(names)
features = np.concatenate((embedding_output, df[['sepalwidthcm', 'petallengthcm']].to_numpy()), axis=-1)
print(features.shape)
(150, 12)
Since you have 3 unique iris class names, you could also simply create an integer-to-vector dictionary manually, but it is up to you.
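For illustration, a minimal sketch of that manual alternative, where each class name maps to a fixed (here randomly initialised, not learned) 10-dimensional vector:
import numpy as np
# df and the continuous columns come from the answer code above
rng = np.random.default_rng(0)
class_to_vector = {name: rng.normal(size=10)
                   for name in ['iris-setosa', 'iris-versicolor', 'iris-virginica']}
name_vectors = np.stack([class_to_vector[name] for name in df['iris_class_name']])
features = np.concatenate((name_vectors, df[['sepalwidthcm', 'petallengthcm']].to_numpy()), axis=-1)
print(features.shape)  # (150, 12)
Unlike the Embedding layer, these vectors are not updated during training, which is usually why the learned embedding is preferred.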

VIF function returns all 'inf' values

I'm handling a multicollinearity problem with the variance_inflation_factor() function.
But after running the function, I found that it returned all the scores as infinite values.
Here's my code:
from rdkit import Chem
import pandas as pd
import numpy as np
from numpy import array
data = pd.read_csv('Descriptors_raw.csv')
class_ = pd.read_csv('class_file.csv')
class_tot = pd.read_csv('class_total.csv')
mols_A1 = Chem.SDMolSupplier('finaldata_A1.sdf')
mols_A2 = Chem.SDMolSupplier('finaldata_A2.sdf')
mols_B = Chem.SDMolSupplier('finaldata_B.sdf')
mols_C = Chem.SDMolSupplier('finaldata_C.sdf')
mols = []
mols.extend(mols_A1)
mols.extend(mols_A2)
mols.extend(mols_B)
mols.extend(mols_C)
mols_df = pd.DataFrame(mols)
mols = pd.concat([mols_df, class_tot, data], axis=1)
mols = mols.dropna(axis=0, thresh=1400)
mols.groupby('target_name_quarter').mean()
fill_mean_func = lambda g: g.fillna(g.mean())
mols = mols.groupby('target_name_quarter').apply(fill_mean_func)
molfiles = mols.loc[:, :'target_quarter']
descriptors = mols.loc[:, 'nAcid':'Zagreb']
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
fitted = scaler.fit(descriptors)
descriptors_scaled = scaler.transform(descriptors)
descriptors_scaled = pd.DataFrame(descriptors_scaled, columns=descriptors.columns, index = list(descriptors.index.values))
from sklearn.feature_selection import VarianceThreshold
def variance_threshold_selector(data, threshold):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]
descriptors_del_lowvar = variance_threshold_selector(descriptors_scaled, 0.01)
mols = pd.concat([molfiles, descriptors_del_lowvar.loc[:, 'nAcid':'Zagreb']], axis=1)
mols.loc[:, 'nAcid':'Zagreb'].corr()
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline
sns.pairplot(mols[['apol', 'nAtom', 'nHeavyAtom', 'nH', 'nAcid']])
vif = pd.DataFrame()
des = mols.loc[:, 'nAcid':'Zagreb']
vif["VIF factor"] = [variance_inflation_factor(des.values, i) for i in range(des.shape[1])]
vif["features"] = des.columns
print(vif)
I used MinMaxScaler() when eliminating low-variance features so as to put all the variables in the same range.
print(vif) returns a dataframe with all infinite values and I cannot figure out why.
Thank you in advance :)
This shows a perfect correlation between two independent variables. In the case of perfect correlation we get R² = 1, which leads to 1/(1 - R²) = infinity. To solve this problem we need to drop one of the variables that is causing the perfect multicollinearity from the dataset.
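A minimal sketch of that pruning step (assuming des is the descriptor DataFrame from the question, and using the common rule-of-thumb VIF threshold of 10) might look like this:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
def drop_high_vif(df, threshold=10.0):
    # repeatedly drop the column with the highest VIF until every VIF is below the threshold
    df = df.copy()
    while df.shape[1] > 1:
        vifs = pd.Series([variance_inflation_factor(df.values, i) for i in range(df.shape[1])],
                         index=df.columns)
        if vifs.max() < threshold:
            break
        df = df.drop(columns=vifs.idxmax())
    return df
# des_pruned = drop_high_vif(des)   # then recompute the VIF table on des_pruned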

Leave one out cross validation Support vector machine

We were given some code for a support vector machine where we are supposed to implement leave-one-out cross validation. If I understand it correctly, leave-one-out will create as many test sets as there are samples, which means that for a big data set the process will be costly and will most likely take quite a long time to produce results.
I have tried to implement leave-one-out on the given SVM code with only one iteration and 773 data points in total. I expected it to take some time, but two hours later the code is still running without any result, which makes me believe that it might be stuck in a loop or something...
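For context, LeaveOneOut really does produce one split per sample, so 773 data points mean 773 separate SVM fits:
import numpy as np
from sklearn.model_selection import LeaveOneOut
X = np.arange(10).reshape(5, 2)   # 5 toy samples
loo = LeaveOneOut()
print(loo.get_n_splits(X))        # 5 -> one model fit per sample
for train_idx, test_idx in loo.split(X):
    print(train_idx, test_idx)    # each sample is the test set exactly once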
Is there any suggestion as to what might be wrong? I'm not getting any error code either.
The entire code is as follows, with the leave-one-out part in the last function at the bottom (executed in a Jupyter notebook on an online Binder):
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gseapy as gp
from gseapy.plot import gseaplot
import qvalue
from ipywidgets import interact, interact_manual
from ipywidgets import IntSlider, FloatSlider, Dropdown, Text
import sklearn as skl
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import LeaveOneOut
from sklearn import svm

interact_enrich = interact_manual.options(manual_name="Enrichment analysis")
interact_plot = interact_manual.options(manual_name="Plot")
interact_calc = interact_manual.options(manual_name="Calculate tests")
interact_gen = interact_manual.options(manual_name="Initialize data")
interact_SVM = interact_manual.options(manual_name="Train SVM")

clinical_data = pd.read_csv('../data/brca_clin.tsv.gz', sep='\t', index_col=2)
clinical_data = clinical_data.iloc[4:, 1:]
expression_data = pd.read_csv('../data/brca.tsv.gz', sep='\t', index_col=1)
expression_data = expression_data.iloc[:, 2:].T

def split_data(clinical_df, expression_df, separator, cond1, cond2):
    try:
        group1 = clinical_df[separator] == cond1
        index1 = clinical_df[group1].index
        group2 = clinical_df[separator] == cond2
        index2 = clinical_df[group2].index
    except:
        print('Clinical condition wrong')
    expression1 = expression_df.loc[index1].dropna()
    expression2 = expression_df.loc[index2].dropna()
    expression = pd.concat([expression1, expression2])
    X = expression.values
    y = np.append(np.repeat(0, len(expression1)), np.repeat(1, len(expression2)))
    display(pd.DataFrame([len(index1), len(index2)], columns=['Number of points'], index=['Group 1', 'Group 2']))
    return X, y

def plot_pca_variance(X, scale=False, ncomp=1):
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    pca = PCA()
    pca.fit(X)
    plt.rcParams["figure.figsize"] = (20, 10)
    sns.set(style='darkgrid', context='talk')
    plt.plot(np.arange(1, len(pca.explained_variance_ratio_) + 1), np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative explained variance')
    plt.vlines(ncomp, 0, plt.gca().get_ylim()[1], color='r', linestyles='dashed')
    h = np.cumsum(pca.explained_variance_ratio_)[ncomp - 1]
    plt.hlines(h, 0, plt.gca().get_xlim()[1], color='r', linestyles='dashed')
    plt.title(str(ncomp) + ' components, ' + str(round(h, 3)) + ' variance explained')
    plt.show()

def reduce_data(X, n, scale=True):
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    pca = PCA(n_components=n)
    Xr = pca.fit_transform(X)
    return Xr

def interact_split_data(Criteria, Group_1, Group_2):
    global BRCA_X, BRCA_y
    BRCA_X, BRCA_y = split_data(clinical_data, expression_data, Criteria, Group_1, Group_2)

def interact_SVM_1(Rescale, Max_iterations):
    max_iter = int(Max_iterations)
    loo = LeaveOneOut()
    ac_matrix_train, ac_matrix_test = np.array([]), np.array([])
    for train_id, test_id in loo.split(BRCA_X, BRCA_y):
        X_train, X_test, y_train, y_test = BRCA_X[train_id, :], BRCA_X[test_id, :], BRCA_y[train_id], BRCA_y[test_id]
        clf = svm.LinearSVC(C=0.1, max_iter=100000).fit(X_train, y_train)  # Train an SVM
        y_train_pred = clf.predict(X_train)
        ac_matrix_train = confusion_matrix(y_train, y_train_pred)
        y_test_pred = clf.predict(X_test)
        ac_matrix_test = confusion_matrix(y_test, y_test_pred)
    display(pd.DataFrame(np.concatenate((ac_matrix_train, ac_matrix_test), axis=1), columns=["predicted G1 (training)", "predicted G2 (training)", "predicted G1 (test)", "predicted G2 (test)"], index=["actual G1", "actual G2"]))

interact_gen(interact_split_data, Criteria=Text('PR status by ihc'), Group_1=Text('Positive'), Group_2=Text('Negative'))
interact_SVM(interact_SVM_1, Rescale=False, Max_iterations=Text('1'))

Stumped by this error: TypeError: 'PCA' object is not callable

This error eludes me, because after running type(PCAdata), it returns <class 'numpy.ndarray'>. After reading about similar "Module object is not callable" errors, it seems to concern not importing the object itself from within the module, such as "from PCA import PCA". However, I'm already importing PCA from sklearn.decomposition.
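For reference, this TypeError is what you get when a PCA instance is called like a function instead of calling one of its methods, e.g.:
from sklearn.decomposition import PCA
import numpy as np
pca = PCA(n_components=2)
X = np.random.rand(10, 5)
X_reduced = pca.fit_transform(X)   # correct: call the method
# pca(X)                           # raises TypeError: 'PCA' object is not callable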
Here is my data: https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
import pandas as pd
import numpy as np
#Load CSV
filename = 'data.csv'
data = pd.read_csv(filename)
df = pd.DataFrame(data)
df=df.dropna(axis=1,how='all')
array = df.values
X = array[:,2:32]
Y = array[:, 1]
#Normalize Data
def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result
df_normalized = normalize(df[ df.columns[2:32]])
pca = PCA(n_components = 16)
pca.fit_transform(df_normalized)
PCAdf = pd.DataFrame(pca.components_, columns = df_normalized.columns, index = ['PC-1','PC-2','PC-3','PC-4','PC-5','PC-6','PC-7','PC-8','PC-9','PC-10','PC-11','PC-12','PC-13','PC-14','PC-15','PC-16'])
PCAarray = PCAdf.values
#Convert all of the "M" class labels as 1, and "B" Labels as 0
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
df_v_y_encoded = encoder.transform(df_v_y)
#Train again, this time using features from principal component analysis.
classifierPCAfeatures = svm.SVC(gamma = "auto", C = 1, kernel = "rbf", decision_function_shape='ovo')
classifierPCAfeatures = classifierPCAfeatures.fit(PCAdf, encoded_Y)
print(classifierPCAfeatures.score(df_v_x, df_v_y_encoded))
