For loop issue with Holt-Winters exponential smoothing - Python

When I run my individual models with different training and test data, the model works fine. I wanted to run a for loop over the splits, and now I am getting an error; I am not sure why.
I have created several time splits to check how the model is performing with different data breakdowns.
# dataframe opertations - pandas
import pandas as pd
# plotting data - matplotlib
from matplotlib import pyplot as plt
# time series - statsmodels
# Seasonality decomposition
from statsmodels.tsa.seasonal import seasonal_decompose
# holt winters
# single exponential smoothing
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
# double and triple exponential smoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from numpy import sqrt
from sklearn.metrics import mean_squared_error
df = pd.read_csv('/content/hw-cv-imputed.csv',index_col='date', parse_dates=True)
df.index.freq = 'W-FRI'
# finding shape of the dataframe
print(df.shape)
# having a look at the data
print(df.head())
# plotting the original data
df[['visits']].plot(title='visit Data')
#Splitting according to the above description
train1, test1 = df.iloc[:52, 0], df.iloc[52:62, 0]
train2, test2 = df.iloc[:56, 0], df.iloc[56:66, 0]
train3, test3 = df.iloc[:60, 0], df.iloc[60:70, 0]
train4, test4 = df.iloc[:65, 0], df.iloc[65:75, 0]
train5, test5 = df.iloc[:69, 0], df.iloc[69:79, 0]
train6, test6 = df.iloc[:73, 0], df.iloc[73:83, 0]
train7, test7 = df.iloc[:78, 0], df.iloc[78:88, 0]
train8, test8 = df.iloc[:82, 0], df.iloc[82:90, 0]
total_model_parameters = pd.DataFrame(columns = ['Total','Parameters'])
# Split into train and test set
#train_df = train1
#test_df = test1
from sklearn.model_selection import ParameterGrid
for train_df ,test_df in [('train1','test1'),('train2','test2'),('train3','test3'),('train4','test4'),('train5','test5'),('train6','test6'),('train7','test7')]:
    params_grid = {'trend':('mul','add'),
                   'seasonal':('mul','add'),
                   'seasonal_periods': [10,12]}
    grid = ParameterGrid(params_grid)
    cnt = 0
    for p in grid:
        cnt = cnt+1
    print('Total Possible Models',cnt)
    model_parameters = pd.DataFrame(columns = ['Total','Parameters'])
    for p in grid:
        test = pd.DataFrame()
        print(p)
        fitted_model = ExponentialSmoothing(train_df,trend=p['trend'],seasonal=p['seasonal'],seasonal_periods=p['seasonal_periods']).fit()
        test_predictions = fitted_model.forecast(10)
        df_new = pd.concat((test_df,test_predictions.rename('predicted_visits'),(((test_df-test_predictions)/test_df)*100).rename('error')),axis=1)
        def accuracy(row):
            if abs(row['error']) < 20:
                return 1
            return 0
        df_new['accuracy'] = df_new.apply(lambda row: accuracy(row), axis=1)
        Total = df_new['accuracy'].sum()
        print('Accuracy------------------------------------',Total)
        model_parameters = model_parameters.append({'Total':Total,'Parameters':p},ignore_index=True)
    parameters = model_parameters.sort_values(by=['Total'],ascending=False)
    parameters = parameters.reset_index(drop=True)
    parameters.head(9)
    Parameters_1 = pd.DataFrame(parameters)
    Parameters_1
    parameters['Parameters'][0]
    total_model_parameters = total_model_parameters.append(parameters)
total_model_parameters
The error is raised at the line
fitted_model = ExponentialSmoothing(train_df,trend=p['trend'],seasonal=p['seasonal'],seasonal_periods=p['seasonal_periods']).fit()
and reads:
ValueError: unrecognized data structures: <class 'str'> / <class 'NoneType'>
Can someone help, please? :)
p.s. The data is as follows
date visits
1/22/2021 7352070
1/29/2021 7063725
2/5/2021 9385950
2/12/2021 7851435
2/19/2021 9509640
2/26/2021 9919170
3/5/2021 9682125
3/12/2021 9597075
3/19/2021 8189835
3/26/2021 7487385
4/2/2021 8863965
4/9/2021 8856165
4/16/2021 8619345
4/23/2021 4499670
4/30/2021 3642705
5/7/2021 3105690
5/14/2021 3096330
5/21/2021 3240360
5/28/2021 5152410
6/4/2021 6471915
6/11/2021 4401030
6/18/2021 3197775
6/25/2021 2606340
7/2/2021 3248460
7/9/2021 4996425
7/16/2021 7775085
7/23/2021 9690795
7/30/2021 10041555
8/6/2021 11849055
8/13/2021 14598750
8/20/2021 15339390
8/27/2021 20118720
9/3/2021 12731115
9/10/2021 17456475
9/17/2021 20393850
9/24/2021 20537895
10/1/2021 20800935
10/8/2021 25035450
10/15/2021 22872450
10/22/2021 22790130
10/29/2021 22036965
11/5/2021 26988975
11/12/2021 29194530
11/19/2021 26106000
11/26/2021 29928660
12/3/2021 29254335
12/10/2021 32165430
12/17/2021 27303570
12/24/2021 21453585
12/31/2021 21568815
1/7/2022 21286680
1/14/2022 25589715
1/21/2022 21890130
1/28/2022 20881515
2/4/2022 24185835
2/11/2022 24160590
2/18/2022 20253360
2/25/2022 20450910
3/4/2022 26542320
3/11/2022 25540335
3/18/2022 29602380
3/25/2022 32258340
4/1/2022 24953640
4/8/2022 22872165
4/15/2022 25784490
4/22/2022 25168356
4/29/2022 25405687
5/6/2022 24693295
5/13/2022 26374944
5/20/2022 26192271
5/27/2022 26868125
6/3/2022 27948287
6/10/2022 28320595
6/17/2022 28153788
6/24/2022 27470327
7/1/2022 30520950
7/8/2022 28635750
7/15/2022 26269140
7/22/2022 24236250
7/29/2022 20541675
8/5/2022 21190020
8/12/2022 22389675
8/19/2022 24496455
8/26/2022 27555645
9/2/2022 26324760
9/9/2022 32937450
9/16/2022 36577425
9/23/2022 33522000
9/30/2022 30759780
10/7/2022 30615870

The problem is that you have quoted your variable names, so the line
for train_df ,test_df in [('train1','test1'),...]
loops over plain strings rather than your training and test data; it shouldn't have the 's.
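For example, dropping the quotes so the loop sees the objects themselves (a minimal sketch reusing the splits defined above):
for train_df, test_df in [(train1, test1), (train2, test2), (train3, test3),
                          (train4, test4), (train5, test5), (train6, test6),
                          (train7, test7)]:
    ...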
Alternatively, you can do away with that line altogether if you're happy to put your pairs of training and test data into a list of tuples, like this:
import pandas as pd
from sklearn.model_selection import ParameterGrid
from statsmodels.tsa.holtwinters import ExponentialSmoothing
df = pd.read_csv("hw-cv-imputed.csv", index_col="date", parse_dates=True)
df.index.freq = "W-FRI"
# finding shape of the dataframe
print(df.shape)
# having a look at the data
print(df.head())
# plotting the original data
df[["visits"]].plot(title="visit Data")
# Splitting according to the above description
train_and_test = []
train_and_test.append((df.iloc[:52, 0], df.iloc[52:62, 0]))
train_and_test.append((df.iloc[:56, 0], df.iloc[56:66, 0]))
train_and_test.append((df.iloc[:60, 0], df.iloc[60:70, 0]))
train_and_test.append((df.iloc[:65, 0], df.iloc[65:75, 0]))
train_and_test.append((df.iloc[:69, 0], df.iloc[69:79, 0]))
train_and_test.append((df.iloc[:73, 0], df.iloc[73:83, 0]))
train_and_test.append((df.iloc[:78, 0], df.iloc[78:88, 0]))
train_and_test.append((df.iloc[:82, 0], df.iloc[82:90, 0]))
total_model_parameters = pd.DataFrame(columns=["Total", "Parameters"])
for train_df, test_df in train_and_test:
    params_grid = {
        "trend": ("mul", "add"),
        "seasonal": ("mul", "add"),
        "seasonal_periods": [10, 12],
    }
    grid = ParameterGrid(params_grid)
    cnt = 0
    for p in grid:
        cnt = cnt + 1
    print("Total Possible Models", cnt)
    model_parameters = pd.DataFrame(columns=["Total", "Parameters"])
    for p in grid:
        ...
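From here the body of the inner loop can mirror the question's code, since train_df and test_df are now the actual Series; a minimal sketch of the remaining steps (using pd.concat in place of the now-deprecated DataFrame.append):
        fitted_model = ExponentialSmoothing(train_df, trend=p["trend"], seasonal=p["seasonal"],
                                            seasonal_periods=p["seasonal_periods"]).fit()
        test_predictions = fitted_model.forecast(10)
        error = ((test_df - test_predictions) / test_df) * 100
        df_new = pd.concat((test_df, test_predictions.rename("predicted_visits"),
                            error.rename("error")), axis=1)
        df_new["accuracy"] = (df_new["error"].abs() < 20).astype(int)
        Total = df_new["accuracy"].sum()
        print("Accuracy", Total)
        model_parameters = pd.concat([model_parameters,
                                      pd.DataFrame([{"Total": Total, "Parameters": p}])],
                                     ignore_index=True)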

Related

python Neurolab - How Can I get parameters of rectangle?

Could anyone explain to me how to use this library:
http://code.google.com/p/neurolab/
to create a Neural Network that follows these rules:
A Neurolab neural network must calculate the parameters of a rectangle.
Input:
The lengths of two sides
Output:
Area, perimeter, square or not.
I understand that I need a feed-forward multilayer perceptron (newff).
Here is my non-working code:
import numpy as np
import matplotlib.pyplot as plt
import neurolab as nl
input_data = np.array([[0.16,0.16 ,1],
[0.1,0.07 ,0],
[0.19,0.1 ,0],
[0.63,0.09 ,0],
[0.04,0.04 ,1],
[0.07,0.03, 0],
[0.05,0.05, 1],
[0.05,0.09,0 ],
[0.08,0.05,0],
[0.03,0.03,1],
])
data = input_data[:, 0:2] # data (network inputs)
labels = input_data[:, 3:] # labels (network outputs)
dim1_min, dim1_max = data[:,0].min(), data[:,0].max() # min and max of the first network input
dim2_min, dim2_max = data[:,1].min(), data[:,1].max() # min and max of the second network input
dim3_min, dim3_max = data[:,2].min(), data[:,2].max() # min and max of the third network input
dim1 = [dim1_min, dim1_max]
dim2 = [dim2_min, dim2_max]
dim3 = [dim3_min, dim3_max]
neural_net = nl.net.newff([[dim1, dim2,dim3]], [3, 2, 4,3])
neural_net.trainf = nl.train.train_gd
error = neural_net.train(data, labels, epochs = 1000, show = 100, goal = 0.01)
print('Enter area, perimeter, square or not:')
ilgis = input()
plotis = input()
kvadratas = input()
nekvadratas = input()
data_test = [[ilgis, plotis, kvadratas]]
for item in data_test:
    atsakymas = neural_net.sim([item])[0]  # the network's answer, returned as an array
    print(item, '-->', atsakymas)
    if atsakymas[0.1] and atsakymas[0.1]:
        print("kvadratas")
    elif atsakymas[0.6] and atsakymas[0.5]:
        print("nekvadratas")
    elif atsakymas[0.7] and atsakymas[0.9]:
        print("nekvadratas")

ARIMA model in Python

I am using ARIMA to do forecasting in Python; the following is my code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
HSBC = pd.read_csv('HSBC.csv', index_col = 'Date', parse_dates = True)
HSBC2 = HSBC['Close']
result = seasonal_decompose(HSBC2, model='multiplicative', period = 1)
from pmdarima import auto_arima
import warnings
warnings.filterwarnings("ignore")
stepwise_fit = auto_arima(HSBC2, start_p = 1, start_q = 1,
max_p = 3, max_q = 3, m = 12,
start_P = 0, seasonal = True,
d = None, D = 1, trace = True,
error_action ='ignore',
suppress_warnings = True,
stepwise = True)
train = HSBC2[0:173]
test = HSBC2[173:248]
from statsmodels.tsa.statespace.sarimax import SARIMAX  # needed for SARIMAX below
model = SARIMAX(train, order = (0, 1, 1), seasonal_order =(0,1,1,12))
result = model.fit()
start = len(train)
end = len(train) + len(test) - 1
prediction = result.predict(start, end,
                            typ = 'levels').rename("Predictions")
prediction.plot(legend = True)
test.plot(legend = True)
I am confused about why the x-axis of the prediction plot becomes numbers, when it is supposed to show dates like the test plot.
If I am not wrong, this is because you have not specified the frequency of your index. Try this:
HSBC.index = pd.date_range(freq='d', start=HSBC.index[0], periods=len(HSBC))
Beware that you should only use freq='d' if your index is daily spaced.
EDIT:
So, the answer was just changing the start and end parameters of the predict method, e.g.:
start = test['Date'].iloc[0]
end = test['Date'].iloc[-1]
prediction = result.predict(start, end,
                            typ = 'levels').rename("Predictions")
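For completeness, a minimal sketch of the same idea using the test index directly (this assumes test has kept its DatetimeIndex and mirrors the predict call above); with date labels for start and end, predict returns a date-indexed Series, so both curves share a date x-axis:
start = test.index[0]
end = test.index[-1]
prediction = result.predict(start, end, typ = 'levels').rename("Predictions")
ax = test.plot(legend = True)
prediction.plot(ax = ax, legend = True)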

How to loop through multiple sklearn classification models?

I'm trying to figure out how to feed my data set into several scikit classification models.
When I run the code I get the following error:
Traceback (most recent call last):
File "<ipython-input-515-9a3302837c99>", line 3, in <module>
X, y = dataset
ValueError: too many values to unpack (expected 2)
Here is my code.
X = np.asarray([np.asarray(df['LRMScore']),np.asarray(df['Spread'])]).T
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
np.random.seed(0)
colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)
clustering_names = [
'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
'SpectralClustering', 'Ward', 'AgglomerativeClustering',
'DBSCAN', 'Birch']
plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
hspace=.01)
plot_num = 1
datasets = [X]
for i_dataset, dataset in enumerate(datasets):
    X, y = dataset
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)
    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    # create clustering estimators
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=2)
    ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=2,
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=.2)
    affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                       preference=-200)
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average", affinity="cityblock", n_clusters=2,
        connectivity=connectivity)
    birch = cluster.Birch(n_clusters=2)
    clustering_algorithms = [
        two_means, affinity_propagation, ms, spectral, ward, average_linkage,
        dbscan, birch]
    for name, algorithm in zip(clustering_names, clustering_algorithms):
        # predict cluster memberships
        t0 = time.time()
        algorithm.fit(X)
        t1 = time.time()
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(np.int)
        else:
            y_pred = algorithm.predict(X)
        # plot
        plt.subplot(4, len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)
        plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
        if hasattr(algorithm, 'cluster_centers_'):
            centers = algorithm.cluster_centers_
            center_colors = colors[:len(centers)]
            plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
        plt.xlim(-2, 2)
        plt.ylim(-2, 2)
        plt.xticks(())
        plt.yticks(())
        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes, size=15,
                 horizontalalignment='right')
        plot_num += 1
plt.show()
My X variable consists of two columns of a dataframe, and it looks like this.
array([[ 8. , 0.06],
[ 8. , 0.06],
[ 8. , 0.06],
...,
[10. , 0.01],
[ 8. , 0.03],
[ 9.75, 0.06]])
These datasets consist of two arrays: X and Y.
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None
My dataset consists of one array. That's the problem. I guess my setup has to be done slightly differently, but I'm not sure how that would look.
I got the code from the link below.
https://scikit-learn.org/0.18/auto_examples/cluster/plot_cluster_comparison.html
Since your X array has two columns you need to transpose it in order to use value unpacking:
x, y = dataset.T
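For instance (a tiny sketch with made-up values in the same shape as the array above):
import numpy as np
dataset = np.array([[8.0, 0.06], [10.0, 0.01], [9.75, 0.06]])  # shape (3, 2)
x, y = dataset.T  # the transpose has shape (2, 3), so it unpacks into the two columns
print(x)  # [ 8.   10.    9.75]
print(y)  # [0.06 0.01 0.06]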
That did it! Thanks parsa. Here is my final working solution.
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
np.random.seed(0)
pd.set_option('display.max_columns', 500)
df = pd.read_csv('C:\\your_path_here\\test.csv')
print('done!')
df = df[:10000]
df = df.fillna(0)
df = df.dropna()
X = df[['RatingScore',
'Par',
'Term',
'TimeToMaturity',
'LRMScore',
'Coupon',
'Price']]
#select your target variable
y = df[['Spread']]
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)
clustering_names = [
'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
'SpectralClustering', 'Ward', 'AgglomerativeClustering',
'DBSCAN', 'Birch']
plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
hspace=.01)
plot_num = 1
# blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)  # leftover from the example; n_samples is not defined and blobs is unused
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)
# estimate bandwidth for mean shift
bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
# create clustering estimators
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=2)
ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward',
connectivity=connectivity)
spectral = cluster.SpectralClustering(n_clusters=2,
eigen_solver='arpack',
affinity="nearest_neighbors")
dbscan = cluster.DBSCAN(eps=.2)
affinity_propagation = cluster.AffinityPropagation(damping=.9,
preference=-200)
average_linkage = cluster.AgglomerativeClustering(
linkage="average", affinity="cityblock", n_clusters=2,
connectivity=connectivity)
birch = cluster.Birch(n_clusters=2)
clustering_algorithms = [
two_means, affinity_propagation, ms, spectral, ward, average_linkage,
dbscan, birch]
for name, algorithm in zip(clustering_names, clustering_algorithms):
    # predict cluster memberships
    t0 = time.time()
    algorithm.fit(X)
    t1 = time.time()
    if hasattr(algorithm, 'labels_'):
        y_pred = algorithm.labels_.astype(np.int)
    else:
        y_pred = algorithm.predict(X)
    # plot
    plt.subplot(4, len(clustering_algorithms), plot_num)
    if i_dataset == 0:
        plt.title(name, size=18)
    plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
    if hasattr(algorithm, 'cluster_centers_'):
        centers = algorithm.cluster_centers_
        center_colors = colors[:len(centers)]
        plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
    plt.xlim(-2, 2)
    plt.ylim(-2, 2)
    plt.xticks(())
    plt.yticks(())
    plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
             transform=plt.gca().transAxes, size=15,
             horizontalalignment='right')
    plot_num += 1
plt.show()

IndexError when ploting sklearn manifold TSNE

I am trying to run a t-SNE but Python shows me this error:
IndexError: only integers, slices (:), ellipsis (...), numpy.newaxis (None) and integer or boolean arrays are valid indices
The data is provided by this link.
Here's the code:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
#Step 1 - Download the data
dataframe_all = pd.read_csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv')
num_rows = dataframe_all.shape[0]
#Step 2 - Clean the data
#count the number of missing elements (NaN) in each column
counter_nan = dataframe_all.isnull().sum()
counter_without_nan = counter_nan[counter_nan==0]
#remove the columns with missing elements
dataframe_all = dataframe_all[counter_without_nan.keys()]
#remove the first 7 columns which contain no discriminative information
dataframe_all = dataframe_all.ix[:,7:]
#Step 3: Create feature vectors
x = dataframe_all.ix[:,:-1].values
standard_scalar = StandardScaler()
x_std = standard_scalar.fit_transform(x)
# t distributed stochastic neighbour embedding (t-SNE) visualization
tsne = TSNE(n_components=2, random_state = 0)
x_test_2d = tsne.fit_transform(x_std)
#scatter plot the sample points among 5 classes
markers=('s','d','o','^','v')
color_map = {0:'red', 1:'blue', 2:'lightgreen', 3:'purple', 4:'cyan'}
plt.figure()
for idx, cl in enumerate(np.unique(x_test_2d)):
    plt.scatter(x=x_test_2d[cl, 0], y=x_test_2d[cl, 1], c=color_map[idx], marker=markers[idx], label=cl)
plt.show()
What do I have to change in order to make this work?
The error is due to the following line:
plt.scatter(x_test_2d[cl, 0], x_test_2d[cl, 1], c=color_map[idx], marker=markers[idx])
Here, cl can take (and does take) non-integer values (from np.unique(x_test_2d)), and this raises the error; e.g. the last value that cl takes is 99.46295, so x_test_2d[cl, 0] translates into x_test_2d[99.46295, 0].
Define a variable y that holds the class labels, then use:
# variable holding the classes
y = dataframe_all.classe.values
y = np.array([ord(i) for i in y])
#scatter plot the sample points among 5 classes
plt.figure()
plt.scatter(x_test_2d[:, 0], x_test_2d[:, 1], c = y)
plt.show()
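As a quick illustration of the ord conversion (the class labels here are letters, as the "convert letters to numbers" comment in the full code below notes), each letter is mapped to its integer code point, which matplotlib can use directly as a color value:
print([ord(c) for c in ['A', 'B', 'E']])  # [65, 66, 69] - one integer per class letter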
FULL CODE:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
#Step 1 - Download the data
dataframe_all = pd.read_csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv')
num_rows = dataframe_all.shape[0]
#Step 2 - Clean the data
#count the number of missing elements (NaN) in each column
counter_nan = dataframe_all.isnull().sum()
counter_without_nan = counter_nan[counter_nan==0]
#remove the columns with missing elements
dataframe_all = dataframe_all[counter_without_nan.keys()]
#remove the first 7 columns which contain no discriminative information
dataframe_all = dataframe_all.ix[:,7:]
#Step 3: Create feature vectors
x = dataframe_all.ix[:,:-1].values
standard_scalar = StandardScaler()
x_std = standard_scalar.fit_transform(x)
# t distributed stochastic neighbour embedding (t-SNE) visualization
tsne = TSNE(n_components=2, random_state = 0)
x_test_2d = tsne.fit_transform(x_std)
# variable holding the classes
y = dataframe_all.classe.values # you need this for the colors
y = np.array([ord(i) for i in y]) # convert letters to numbers
#scatter plot the sample points among 5 classes
plt.figure()
plt.scatter(x_test_2d[:, 0], x_test_2d[:, 1], c = y)
plt.show()

Getting the variance of each column in pandas

I want to calculate the variance of features saved in a Train and a Test file, as follows:
col1 Feature0 Feature1 Feature2 Feature3 Feature4 Feature5 Feature6 Feature7 Feature8 Feature9
col2 26658 40253.5 3.22115e+09 0.0277727 5.95939 266.56 734.248 307.364 0.000566779 0.000520574
col3 2658 4053.5 3.25e+09 0.0277 5.95939 266.56 734.248 307.364 0.000566779 0.000520574
....
for that I've wrote the following :
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from matplotlib import pyplot as plt
# Reading csv file
training_file = 'Training.csv'
testing_file = 'Test.csv'
Training_Frame = pd.read_csv(training_file)
Testing_Frame = pd.read_csv(testing_file)
Training_Frame.shape
# Now we have the feature values saved we start
# with the standardisation of those values
stdsc = preprocessing.MinMaxScaler()
np_scaled_train = stdsc.fit_transform(Training_Frame.iloc[:,:-2])
sel = VarianceThreshold(threshold=(.2 * (1 - .2)))
sel.fit_transform(np_scaled_train)
pd_scaled_train = pd.DataFrame(data=np_scaled_train)
pd_scaled_train.to_csv('variance_result.csv',header=False, index=False)
This obviously doesn't work; the result in variance_result.csv is just the normalized training matrix.
So my question is: how can I get the indices of the columns (features) that have a variance below 20%?
Thanks in advance!
Update
I've solved the variance issue this way :
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.feature_selection import VarianceThreshold
# Reading csv file
training_file = 'Training.csv'
testing_file = 'Test.csv'
Training_Frame = pd.read_csv(training_file)
Testing_Frame = pd.read_csv(testing_file)
Training_Frame.shape
# Now we have the feature values saved we start
# with the standardisation of those values
stdsc = preprocessing.MinMaxScaler()
np_scaled_train = stdsc.fit_transform(Training_Frame.iloc[:,:-2])
pd_scaled_train = pd.DataFrame(data=np_scaled_train)
variance =pd_scaled_train.apply(np.var,axis=0)
pd_scaled_train.to_csv('variance_result.csv',header=False, index=False)
temp_df = pd.DataFrame(variance.values,Training_Frame.columns.values[:-2])
temp_df.T.to_csv('Training_features_variance.csv',index=False)
But I still don't know how to get the indices of features with a variance bigger than, say, 0.2 from variance other than by running a loop!
Just set the threshold to 0.0 and then use the variances_ attribute of the VarianceThreshold object to get the variances of all your features, then you can identify which of them have lower variance.
from sklearn.feature_selection import VarianceThreshold
X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
selector = VarianceThreshold()
selector.fit_transform(X)
selector.variances_
#Output: array([ 0. , 0.22222222, 2.88888889, 0. ])
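For instance (a small follow-on sketch using the selector above), the indices of the columns whose variance falls below a chosen cutoff can then be pulled out with NumPy:
import numpy as np
low_variance_idx = np.where(selector.variances_ < 0.2)[0]
print(low_variance_idx)  # [0 3] for the toy X above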
