Neurolab - How can I get the parameters of a rectangle?

Could anyone explain to me how to use this library:
http://code.google.com/p/neurolab/
to create a neural network that follows these rules:
The Neurolab network must calculate the parameters of a rectangle.
Input:
the lengths of the two sides
Output:
area, perimeter, square or not.
I understand that I need a feed-forward multilayer perceptron (newff).
Here is my non-working code:
import numpy as np
import neurolab as nl

# Training data: [side_a, side_b, is_square]
input_data = np.array([
    [0.16, 0.16, 1],
    [0.10, 0.07, 0],
    [0.19, 0.10, 0],
    [0.63, 0.09, 0],
    [0.04, 0.04, 1],
    [0.07, 0.03, 0],
    [0.05, 0.05, 1],
    [0.05, 0.09, 0],
    [0.08, 0.05, 0],
    [0.03, 0.03, 1],
])
data = input_data[:, 0:2]    # network inputs (the two side lengths)
labels = input_data[:, 2:]   # network outputs (square or not)
dim1_min, dim1_max = data[:, 0].min(), data[:, 0].max()  # range of the first input
dim2_min, dim2_max = data[:, 1].min(), data[:, 1].max()  # range of the second input
dim1 = [dim1_min, dim1_max]
dim2 = [dim2_min, dim2_max]
# Two inputs, one hidden layer of 3 neurons, one output neuron
neural_net = nl.net.newff([dim1, dim2], [3, 1])
neural_net.trainf = nl.train.train_gd
error = neural_net.train(data, labels, epochs=1000, show=100, goal=0.01)
print('Enter the two side lengths:')
ilgis = float(input())   # length
plotis = float(input())  # width
data_test = [[ilgis, plotis]]
for item in data_test:
    atsakymas = neural_net.sim([item])[0]  # the network's answer, as an array
    print(item, '-->', atsakymas)
    if atsakymas[0] > 0.5:  # threshold the single output to classify
        print("square")
    else:
        print("not a square")

Related

For loop issue with Holt-Winters exponential smoothing

When I run my individual models with different training and test data, everything works fine. I wanted to run them in a for loop, and now I am getting an error I don't understand.
I have created several time splits to check how the model performs with different data breakdowns.
# dataframe operations - pandas
import pandas as pd
# plotting data - matplotlib
from matplotlib import pyplot as plt
# time series - statsmodels
# seasonality decomposition
from statsmodels.tsa.seasonal import seasonal_decompose
# holt winters
# single exponential smoothing
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
# double and triple exponential smoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from numpy import sqrt
from sklearn.metrics import mean_squared_error
df = pd.read_csv('/content/hw-cv-imputed.csv',index_col='date', parse_dates=True)
df.index.freq = 'W-FRI'
# finding shape of the dataframe
print(df.shape)
# having a look at the data
print(df.head())
# plotting the original data
df[['visits']].plot(title='visit Data')
#Splitting according to the above description
train1, test1 = df.iloc[:52, 0], df.iloc[52:62, 0]
train2, test2 = df.iloc[:56, 0], df.iloc[56:66, 0]
train3, test3 = df.iloc[:60, 0], df.iloc[60:70, 0]
train4, test4 = df.iloc[:65, 0], df.iloc[65:75, 0]
train5, test5 = df.iloc[:69, 0], df.iloc[69:79, 0]
train6, test6 = df.iloc[:73, 0], df.iloc[73:83, 0]
train7, test7 = df.iloc[:78, 0], df.iloc[78:88, 0]
train8, test8 = df.iloc[:82, 0], df.iloc[82:90, 0]
total_model_parameters = pd.DataFrame(columns = ['Total','Parameters'])
# Split into train and test set
#train_df = train1
#test_df = test1
from sklearn.model_selection import ParameterGrid
for train_df, test_df in [('train1','test1'),('train2','test2'),('train3','test3'),('train4','test4'),('train5','test5'),('train6','test6'),('train7','test7')]:
    params_grid = {'trend': ('mul', 'add'),
                   'seasonal': ('mul', 'add'),
                   'seasonal_periods': [10, 12]}
    grid = ParameterGrid(params_grid)
    cnt = 0
    for p in grid:
        cnt = cnt + 1
    print('Total Possible Models', cnt)
    model_parameters = pd.DataFrame(columns=['Total', 'Parameters'])
    for p in grid:
        test = pd.DataFrame()
        print(p)
        fitted_model = ExponentialSmoothing(train_df, trend=p['trend'], seasonal=p['seasonal'], seasonal_periods=p['seasonal_periods']).fit()
        test_predictions = fitted_model.forecast(10)
        df_new = pd.concat((test_df,
                            test_predictions.rename('predicted_visits'),
                            (((test_df - test_predictions) / test_df) * 100).rename('error')), axis=1)
        def accuracy(row):
            if abs(row['error']) < 20:
                return 1
            return 0
        df_new['accuracy'] = df_new.apply(lambda row: accuracy(row), axis=1)
        Total = df_new['accuracy'].sum()
        print('Accuracy------------------------------------', Total)
        model_parameters = model_parameters.append({'Total': Total, 'Parameters': p}, ignore_index=True)
    parameters = model_parameters.sort_values(by=['Total'], ascending=False)
    parameters = parameters.reset_index(drop=True)
    parameters.head(9)
    Parameters_1 = pd.DataFrame(parameters)
    Parameters_1
    parameters['Parameters'][0]
    total_model_parameters = total_model_parameters.append(parameters)
    total_model_parameters
The error is raised on this line:
fitted_model = ExponentialSmoothing(train_df,trend=p['trend'],seasonal=p['seasonal'],seasonal_periods=p['seasonal_periods']).fit()
ValueError: unrecognized data structures: <class 'str'> / <class 'NoneType'>
Can someone help, please? :)
p.s. The data is as follows
date visits
1/22/2021 7352070
1/29/2021 7063725
2/5/2021 9385950
2/12/2021 7851435
2/19/2021 9509640
2/26/2021 9919170
3/5/2021 9682125
3/12/2021 9597075
3/19/2021 8189835
3/26/2021 7487385
4/2/2021 8863965
4/9/2021 8856165
4/16/2021 8619345
4/23/2021 4499670
4/30/2021 3642705
5/7/2021 3105690
5/14/2021 3096330
5/21/2021 3240360
5/28/2021 5152410
6/4/2021 6471915
6/11/2021 4401030
6/18/2021 3197775
6/25/2021 2606340
7/2/2021 3248460
7/9/2021 4996425
7/16/2021 7775085
7/23/2021 9690795
7/30/2021 10041555
8/6/2021 11849055
8/13/2021 14598750
8/20/2021 15339390
8/27/2021 20118720
9/3/2021 12731115
9/10/2021 17456475
9/17/2021 20393850
9/24/2021 20537895
10/1/2021 20800935
10/8/2021 25035450
10/15/2021 22872450
10/22/2021 22790130
10/29/2021 22036965
11/5/2021 26988975
11/12/2021 29194530
11/19/2021 26106000
11/26/2021 29928660
12/3/2021 29254335
12/10/2021 32165430
12/17/2021 27303570
12/24/2021 21453585
12/31/2021 21568815
1/7/2022 21286680
1/14/2022 25589715
1/21/2022 21890130
1/28/2022 20881515
2/4/2022 24185835
2/11/2022 24160590
2/18/2022 20253360
2/25/2022 20450910
3/4/2022 26542320
3/11/2022 25540335
3/18/2022 29602380
3/25/2022 32258340
4/1/2022 24953640
4/8/2022 22872165
4/15/2022 25784490
4/22/2022 25168356
4/29/2022 25405687
5/6/2022 24693295
5/13/2022 26374944
5/20/2022 26192271
5/27/2022 26868125
6/3/2022 27948287
6/10/2022 28320595
6/17/2022 28153788
6/24/2022 27470327
7/1/2022 30520950
7/8/2022 28635750
7/15/2022 26269140
7/22/2022 24236250
7/29/2022 20541675
8/5/2022 21190020
8/12/2022 22389675
8/19/2022 24496455
8/26/2022 27555645
9/2/2022 26324760
9/9/2022 32937450
9/16/2022 36577425
9/23/2022 33522000
9/30/2022 30759780
10/7/2022 30615870
The problem is that you have quoted your variable names, so the loop iterates over strings instead of DataFrames; the line
for train_df, test_df in [('train1','test1'),...]
shouldn't have the quotes. That is exactly why ExponentialSmoothing complains about <class 'str'>.
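To see the type mismatch directly (a tiny, hypothetical illustration):

for train_df, test_df in [('train1', 'test1')]:
    print(type(train_df))  # <class 'str'> - the string 'train1', not the DataFrame train1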
You can do away with that line entirely if you're happy to put your pairs of training and test data into a list of tuples, like this:
import pandas as pd
from sklearn.model_selection import ParameterGrid
from statsmodels.tsa.holtwinters import ExponentialSmoothing
df = pd.read_csv("hw-cv-imputed.csv", index_col="date", parse_dates=True)
df.index.freq = "W-FRI"
# finding shape of the dataframe
print(df.shape)
# having a look at the data
print(df.head())
# plotting the original data
df[["visits"]].plot(title="visit Data")
# Splitting according to the above description
train_and_test = []
train_and_test.append((df.iloc[:52, 0], df.iloc[52:62, 0]))
train_and_test.append((df.iloc[:56, 0], df.iloc[56:66, 0]))
train_and_test.append((df.iloc[:60, 0], df.iloc[60:70, 0]))
train_and_test.append((df.iloc[:65, 0], df.iloc[65:75, 0]))
train_and_test.append((df.iloc[:69, 0], df.iloc[69:79, 0]))
train_and_test.append((df.iloc[:73, 0], df.iloc[73:83, 0]))
train_and_test.append((df.iloc[:78, 0], df.iloc[78:88, 0]))
train_and_test.append((df.iloc[:82, 0], df.iloc[82:90, 0]))
total_model_parameters = pd.DataFrame(columns=["Total", "Parameters"])
for train_df, test_df in train_and_test:
    params_grid = {
        "trend": ("mul", "add"),
        "seasonal": ("mul", "add"),
        "seasonal_periods": [10, 12],
    }
    grid = ParameterGrid(params_grid)
    cnt = 0
    for p in grid:
        cnt = cnt + 1
    print("Total Possible Models", cnt)
    model_parameters = pd.DataFrame(columns=["Total", "Parameters"])
    for p in grid:
        ...
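One caveat worth adding, beyond the original answer: DataFrame.append was deprecated and has been removed in pandas 2.0, so the model_parameters.append(...) bookkeeping from the question will fail on current pandas. A sketch of the same accumulation done by collecting plain dicts and building the DataFrame once, reusing the names from the loop above:

# Sketch only - assumes train_df, test_df and grid from the loop above.
rows = []
for p in grid:
    fitted_model = ExponentialSmoothing(
        train_df,
        trend=p["trend"],
        seasonal=p["seasonal"],
        seasonal_periods=p["seasonal_periods"],
    ).fit()
    test_predictions = fitted_model.forecast(10)
    error = ((test_df - test_predictions) / test_df) * 100
    total = int((error.abs() < 20).sum())  # forecasts within 20% of actuals
    rows.append({"Total": total, "Parameters": p})

model_parameters = pd.DataFrame(rows)  # built once, no DataFrame.append
parameters = model_parameters.sort_values(by=["Total"], ascending=False).reset_index(drop=True)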

Why do my PCA and sklearn's PCA get different results?

I tried to use the PCA implementation from "Machine Learning in Action", but I found that its results are not the same as those from sklearn's PCA. I don't quite understand what is going on.
Below is my code:
import numpy as np
from sklearn.decomposition import PCA

x = np.array([
    [1, 2, 3, 4, 5, 0],
    [0.6, 0.7, 0.8, 0.9, 0.10, 0],
    [110, 120, 130, 140, 150, 0]
])

def my_pca(data, dim):
    remove_mean = data - data.mean(axis=0)
    cov_data = np.cov(remove_mean, rowvar=0)
    eig_val, eig_vec = np.linalg.eig(np.mat(cov_data))
    sorted_eig_val = np.argsort(eig_val)
    eig_index = sorted_eig_val[:-(dim+1):-1]
    transfer = eig_vec[:, eig_index]
    low_dim = remove_mean * transfer
    return np.array(low_dim, dtype=float)

pca = PCA(n_components=3)
pca.fit(x)
new_x = pca.transform(x)
print("sklearn")
print(new_x)
new_x = my_pca(x, 3)
print("my")
print(new_x)
Output:
sklearn
[[-9.32494230e+01 1.46120285e+00 2.37676120e-15]
[-9.89004904e+01 -1.43283197e+00 2.98143675e-14]
[ 1.92149913e+02 -2.83708789e-02 2.81307176e-15]]
my
[[ 9.32494230e+01 -1.46120285e+00 7.39333927e-14]
[ 9.89004904e+01 1.43283197e+00 -7.01760428e-14]
[-1.92149913e+02 2.83708789e-02 1.84375626e-14]]
The issue relates to your function, in particular the part where you calculate your eigenvalues and eigenvectors:
eig_val, eig_vec = np.linalg.eig(np.mat(cov_data))
It appears that scikit-learn uses "eigh" instead of "eig", so if you change the code snippet from np.linalg.eig to np.linalg.eigh, you should get the same results. Note also that eigenvectors are only defined up to a sign: your two outputs above already differ only by an overall sign (plus numerical noise in the third component, since with 3 samples the covariance has rank 2), which is harmless.
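A quick self-contained check of this, with the caveat (my addition) that signs are arbitrary, so the comparison is on absolute values:

import numpy as np
from sklearn.decomposition import PCA

x = np.array([
    [1, 2, 3, 4, 5, 0],
    [0.6, 0.7, 0.8, 0.9, 0.10, 0],
    [110, 120, 130, 140, 150, 0]
])

def my_pca_eigh(data, dim):
    remove_mean = data - data.mean(axis=0)
    cov_data = np.cov(remove_mean, rowvar=0)
    eig_val, eig_vec = np.linalg.eigh(cov_data)      # eigh instead of eig
    eig_index = np.argsort(eig_val)[:-(dim + 1):-1]  # largest eigenvalues first
    return remove_mean @ eig_vec[:, eig_index]

sk = PCA(n_components=3).fit_transform(x)
mine = my_pca_eigh(x, 3)
# The first two components should agree up to sign; the third is numerical
# noise because the covariance of 3 samples has rank 2.
print(np.allclose(np.abs(sk[:, :2]), np.abs(mine[:, :2]), atol=1e-6))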

DBSCAN not making sense for small numbers of points

I am playing around with a DBSCAN example to see if it will work for me. In my case, I have clusters of a few points (3-5) close together, with a fairly long distance between clusters. I have tried to replicate the situation in the following code. I figured that with a low epsilon and low min_samples this should work, but instead it tells me that it only sees 1 group (and 20 noise points?). Am I using this incorrectly, or is DBSCAN not good for this type of problem? I went with DBSCAN instead of k-means because I don't know beforehand exactly how many clusters there will be (1-5).
from sklearn.datasets import make_blobs
from sklearn.cluster import DBSCAN
import numpy as np
import matplotlib.pyplot as plt
# Configuration options
num_samples_total = 20
cluster_centers = [(3,3), (7,7),(7,3),(3,7),(5,5)]
num_classes = len(cluster_centers)
#epsilon = 1.0
epsilon = 1e-5
#min_samples = 13
min_samples = 2
# Generate data
X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05)
np.save('./clusters.npy', X)
X = np.load('./clusters.npy')
# Compute DBSCAN
db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X)
labels = db.labels_
no_clusters = len(np.unique(labels))
no_noise = np.sum(np.array(labels) == -1, axis=0)
print('Estimated no. of clusters: %d' % no_clusters)
print('Estimated no. of noise points: %d' % no_noise)
# Generate scatter plot for training data
colors = list(map(lambda x: '#3b4cc0' if x == 1 else '#b40426', labels)) #only set for 2 colors
plt.scatter(X[:,0], X[:,1], c=colors, marker="o", picker=True)
plt.title('Two clusters with data')
plt.xlabel('Axis X[0]')
plt.ylabel('Axis X[1]')
plt.show()
I ended up going with k-means and a modified elbow method:
print(__doc__)
# Author: Phil Roth <mr.phil.roth@gmail.com>
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Configuration options
num_samples_total = 20
cluster_centers = [(3, 3), (7, 7), (7, 3), (3, 7), (5, 5)]
num_classes = len(cluster_centers)

# Generate data
X, y = make_blobs(n_samples=num_samples_total, centers=cluster_centers,
                  n_features=num_classes, center_box=(0, 1), cluster_std=0.05)
random_state = 170
#y_pred = KMeans(n_clusters=5, random_state=random_state).fit_predict(X)
#plt.scatter(X[:, 0], X[:, 1], c=y_pred)
#kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
# Maybe I don't have to look for an elbow, just go until inertia drops below 1.
# If I do go too far, it just means that the same shape will be shown twice.
clusterIdx = 0
inertia = 100
while inertia > 1:
    clusterIdx = clusterIdx + 1
    kmeans = KMeans(n_clusters=clusterIdx, random_state=0).fit(X)
    inertia = kmeans.inertia_
    print(inertia)
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_)
print(clusterIdx)
plt.show()
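For what it's worth, DBSCAN itself handles this layout fine once eps is on the scale of the within-cluster spacing rather than 1e-5. A minimal sketch (same blob setup as the question; the eps value is my guess):

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
import numpy as np

# Same five tight blobs as in the question.
X, _ = make_blobs(n_samples=20, centers=[(3, 3), (7, 7), (7, 3), (3, 7), (5, 5)],
                  cluster_std=0.05, random_state=0)
# eps comparable to within-cluster distances (~0.1), far below the ~2.8 gap
# between cluster centers, so each blob becomes one cluster.
db = DBSCAN(eps=0.5, min_samples=2).fit(X)
labels = db.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # -1 marks noise
print('clusters:', n_clusters, 'noise points:', int(np.sum(labels == -1)))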

How to loop through multiple sklearn classification models?

I'm trying to figure out how to feed my data set into several scikit classification models.
When I run the code I get the following error:
Traceback (most recent call last):
File "<ipython-input-515-9a3302837c99>", line 3, in <module>
X, y = dataset
ValueError: too many values to unpack (expected 2)
Here is my code.
X = np.asarray([np.asarray(df['LRMScore']),np.asarray(df['Spread'])]).T
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
np.random.seed(0)
colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)
clustering_names = [
    'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
    'SpectralClustering', 'Ward', 'AgglomerativeClustering',
    'DBSCAN', 'Birch']
plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)
plot_num = 1
datasets = [X]
for i_dataset, dataset in enumerate(datasets):
    X, y = dataset
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)
    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    # create clustering estimators
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=2)
    ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=2,
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=.2)
    affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                       preference=-200)
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average", affinity="cityblock", n_clusters=2,
        connectivity=connectivity)
    birch = cluster.Birch(n_clusters=2)
    clustering_algorithms = [
        two_means, affinity_propagation, ms, spectral, ward, average_linkage,
        dbscan, birch]
    for name, algorithm in zip(clustering_names, clustering_algorithms):
        # predict cluster memberships
        t0 = time.time()
        algorithm.fit(X)
        t1 = time.time()
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(np.int)
        else:
            y_pred = algorithm.predict(X)
        # plot
        plt.subplot(4, len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)
        plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
        if hasattr(algorithm, 'cluster_centers_'):
            centers = algorithm.cluster_centers_
            center_colors = colors[:len(centers)]
            plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
        plt.xlim(-2, 2)
        plt.ylim(-2, 2)
        plt.xticks(())
        plt.yticks(())
        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes, size=15,
                 horizontalalignment='right')
        plot_num += 1
plt.show()
My X variable consists of two columns of a dataframe, and it looks like this.
array([[ 8. , 0.06],
[ 8. , 0.06],
[ 8. , 0.06],
...,
[10. , 0.01],
[ 8. , 0.03],
[ 9.75, 0.06]])
These datasets consist of two arrays: X and Y.
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
                                      noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None
My dataset consists of one array. That's the problem. I guess my setup has to be done slightly differently, but I'm not sure how that would look.
I got the code from the link below.
https://scikit-learn.org/0.18/auto_examples/cluster/plot_cluster_comparison.html
Since your X array has two columns, you need to transpose it in order to use value unpacking:
x, y = dataset.T
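A quick illustration of why the transpose is needed (array values taken from the question):

import numpy as np

X = np.array([[8.0, 0.06],
              [10.0, 0.01],
              [9.75, 0.06]])
# Unpacking iterates over rows; X.T has two rows, one per original column.
x, y = X.T
print(x)  # [ 8.   10.    9.75]
print(y)  # [0.06 0.01 0.06]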
That did it! Thanks parsa. Here is my final working solution.
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

np.random.seed(0)
pd.set_option('display.max_columns', 500)
df = pd.read_csv('C:\\your_path_here\\test.csv')
print('done!')
df = df[:10000]
df = df.fillna(0)
df = df.dropna()
X = df[['RatingScore',
        'Par',
        'Term',
        'TimeToMaturity',
        'LRMScore',
        'Coupon',
        'Price']]
# select your target variable
y = df[['Spread']]
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)
clustering_names = [
    'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
    'SpectralClustering', 'Ward', 'AgglomerativeClustering',
    'DBSCAN', 'Birch']
plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)
plot_num = 1

# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)
# estimate bandwidth for mean shift
bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
# create clustering estimators
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=2)
ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward',
                                       connectivity=connectivity)
spectral = cluster.SpectralClustering(n_clusters=2,
                                      eigen_solver='arpack',
                                      affinity="nearest_neighbors")
dbscan = cluster.DBSCAN(eps=.2)
affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                   preference=-200)
average_linkage = cluster.AgglomerativeClustering(
    linkage="average", affinity="cityblock", n_clusters=2,
    connectivity=connectivity)
birch = cluster.Birch(n_clusters=2)
clustering_algorithms = [
    two_means, affinity_propagation, ms, spectral, ward, average_linkage,
    dbscan, birch]
for name, algorithm in zip(clustering_names, clustering_algorithms):
    # predict cluster memberships
    t0 = time.time()
    algorithm.fit(X)
    t1 = time.time()
    if hasattr(algorithm, 'labels_'):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)
    # plot (only one dataset here, so the title guard from the example is gone)
    plt.subplot(4, len(clustering_algorithms), plot_num)
    plt.title(name, size=18)
    plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
    if hasattr(algorithm, 'cluster_centers_'):
        centers = algorithm.cluster_centers_
        center_colors = colors[:len(centers)]
        plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
    plt.xlim(-2, 2)
    plt.ylim(-2, 2)
    plt.xticks(())
    plt.yticks(())
    plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
             transform=plt.gca().transAxes, size=15,
             horizontalalignment='right')
    plot_num += 1
plt.show()

Canonical Discriminant Function in Python sklearn

I am learning about Linear Discriminant Analysis and am using the scikit-learn module. I am confused by the "coef_" attribute of the LinearDiscriminantAnalysis class. As far as I understand, these are the discriminant function coefficients (sklearn calls them weight vectors). Since there should be (n_classes - 1) discriminant functions, I would expect the coef_ attribute to be an array of shape (n_components, n_features), but instead it prints an (n_classes, n_features) array. Below is an example using the Iris dataset from sklearn. Since there are 3 classes and 2 components, I would expect print(lda.coef_) to give me a 2x4 array instead of a 3x4 array...
Maybe I'm misinterpreting what the weight vectors are; perhaps they are the coefficients of the classification function?
And how do I get the coefficients for each variable in each discriminant/canonical function?
Code here:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np

iris = datasets.load_iris()
X = iris.data
y = iris.target
target_names = iris.target_names

lda = LinearDiscriminantAnalysis(n_components=2, store_covariance=True)
X_r = lda.fit(X, y).transform(X)

plt.figure()
colors = ['navy', 'turquoise', 'darkorange']  # added: undefined in the original
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], alpha=.8, color=color,
                label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.xlabel('Function 1 (%.2f%%)' % (lda.explained_variance_ratio_[0] * 100))
plt.ylabel('Function 2 (%.2f%%)' % (lda.explained_variance_ratio_[1] * 100))
plt.title('LDA of IRIS dataset')

print(lda.coef_)
# output -> [[  6.24621637  12.24610757 -16.83743427 -21.13723331]
#            [ -1.51666857  -4.36791652   4.64982565   3.18640594]
#            [ -4.72954779  -7.87819105  12.18760862  17.95082737]]
You can calculate the coefficients with the following code:
import numpy as np
import pandas as pd

def LDA_coefficients(X, lda):
    nb_col = X.shape[1]
    matrix = np.zeros((nb_col + 1, nb_col), dtype=int)
    Z = pd.DataFrame(data=matrix, columns=X.columns)
    for j in range(0, nb_col):
        Z.iloc[j, j] = 1
    LD = lda.transform(Z)
    nb_funct = LD.shape[1]
    results = pd.DataFrame()
    index = ['const']
    for j in range(0, LD.shape[0] - 1):
        index = np.append(index, 'C' + str(j + 1))
    for i in range(0, LD.shape[1]):
        coef = [LD[-1][i]]
        for j in range(0, LD.shape[0] - 1):
            coef = np.append(coef, LD[j][i] - LD[-1][i])
        result = pd.Series(coef)
        result.index = index
        column_name = 'LD' + str(i + 1)
        results[column_name] = result
    return results
Before calling this function you need to fit the linear discriminant analysis (note that X must be a pandas DataFrame here, since the function reads X.columns):
lda = LinearDiscriminantAnalysis()
lda.fit(X,y)
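A hedged end-to-end usage sketch with the iris data, reusing the LDA_coefficients helper above (wrapping the features in a DataFrame is my assumption, to satisfy X.columns):

import pandas as pd
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

lda = LinearDiscriminantAnalysis()
lda.fit(X, y)
# One column per discriminant function (LD1, LD2); rows: const + one per feature.
print(LDA_coefficients(X, lda))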
