Could anyone explain to me how to use this library:
http://code.google.com/p/neurolab/
to create a neural network that follows these rules:
A Neurolab neural network must calculate the parameters of a rectangle.
Input:
The lengths of two sides
Output:
Area, perimeter, square or not.
I understand that I need a feed-forward multilayer perceptron (newff).
Here is my non-working code:
import numpy as np
import matplotlib.pyplot as plt
import neurolab as nl
input_data = np.array([[0.16, 0.16, 1],
                       [0.1, 0.07, 0],
                       [0.19, 0.1, 0],
                       [0.63, 0.09, 0],
                       [0.04, 0.04, 1],
                       [0.07, 0.03, 0],
                       [0.05, 0.05, 1],
                       [0.05, 0.09, 0],
                       [0.08, 0.05, 0],
                       [0.03, 0.03, 1]])
data = input_data[:, 0:2]   # the data (network inputs)
labels = input_data[:, 3:]  # the labels (network outputs)
dim1_min, dim1_max = data[:,0].min(), data[:,0].max() # minimum and maximum of the first network input
dim2_min, dim2_max = data[:,1].min(), data[:,1].max() # minimum and maximum of the second network input
dim3_min, dim3_max = data[:,2].min(), data[:,2].max() # minimum and maximum of the third column
dim1 = [dim1_min, dim1_max]
dim2 = [dim2_min, dim2_max]
dim3 = [dim3_min, dim3_max]
neural_net = nl.net.newff([[dim1, dim2,dim3]], [3, 2, 4,3])
neural_net.trainf = nl.train.train_gd
error = neural_net.train(data, labels, epochs = 1000, show = 100, goal = 0.01)
print('enter area, perimeter, square or not:')
ilgis = input()        # length
plotis = input()       # width
kvadratas = input()    # square
nekvadratas = input()  # not a square
data_test = [[ilgis, plotis, kvadratas]]
for item in data_test:
    atsakymas = neural_net.sim([item])[0]  # get the network's answer as an array
    print(item, '-->', atsakymas)
    if atsakymas[0.1] and atsakymas[0.1]:
        print("square")
    elif atsakymas[0.6] and atsakymas[0.5]:
        print("not a square")
    elif atsakymas[0.7] and atsakymas[0.9]:
        print("not a square")
When I run my individual models with different training and test data, each model works fine. I wanted to run them in a for loop, and now I am getting an error and am not sure why.
I have created several time splits to check how the model performs with different data breakdowns.
# dataframe operations - pandas
import pandas as pd
# plotting data - matplotlib
from matplotlib import pyplot as plt
# time series - statsmodels
# seasonality decomposition
from statsmodels.tsa.seasonal import seasonal_decompose
# holt winters
# single exponential smoothing
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
# double and triple exponential smoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from numpy import sqrt
from sklearn.metrics import mean_squared_error
df = pd.read_csv('/content/hw-cv-imputed.csv',index_col='date', parse_dates=True)
df.index.freq = 'W-FRI'
# finding shape of the dataframe
print(df.shape)
# having a look at the data
print(df.head())
# plotting the original data
df[['visits']].plot(title='visit Data')
#Splitting according to the above description
train1, test1 = df.iloc[:52, 0], df.iloc[52:62, 0]
train2, test2 = df.iloc[:56, 0], df.iloc[56:66, 0]
train3, test3 = df.iloc[:60, 0], df.iloc[60:70, 0]
train4, test4 = df.iloc[:65, 0], df.iloc[65:75, 0]
train5, test5 = df.iloc[:69, 0], df.iloc[69:79, 0]
train6, test6 = df.iloc[:73, 0], df.iloc[73:83, 0]
train7, test7 = df.iloc[:78, 0], df.iloc[78:88, 0]
train8, test8 = df.iloc[:82, 0], df.iloc[82:90, 0]
total_model_parameters = pd.DataFrame(columns = ['Total','Parameters'])
# Split into train and test set
#train_df = train1
#test_df = test1
from sklearn.model_selection import ParameterGrid
for train_df ,test_df in [('train1','test1'),('train2','test2'),('train3','test3'),('train4','test4'),('train5','test5'),('train6','test6'),('train7','test7')]:
    params_grid = {'trend':('mul','add'),
                   'seasonal':('mul','add'),
                   'seasonal_periods': [10,12]}
    grid = ParameterGrid(params_grid)
    cnt = 0
    for p in grid:
        cnt = cnt+1
    print('Total Possible Models',cnt)
    model_parameters = pd.DataFrame(columns = ['Total','Parameters'])
    for p in grid:
        test = pd.DataFrame()
        print(p)
        fitted_model = ExponentialSmoothing(train_df,trend=p['trend'],seasonal=p['seasonal'],seasonal_periods=p['seasonal_periods']).fit()
        test_predictions = fitted_model.forecast(10)
        df_new = pd.concat((test_df,test_predictions.rename('predicted_visits'),(((test_df-test_predictions)/test_df)*100).rename('error')),axis=1)

        def accuracy(row):
            if abs(row['error']) < 20:
                return 1
            return 0

        df_new['accuracy'] = df_new.apply(lambda row: accuracy(row), axis=1)
        Total = df_new['accuracy'].sum()
        print('Accuracy------------------------------------',Total)
        model_parameters = model_parameters.append({'Total':Total,'Parameters':p},ignore_index=True)
    parameters = model_parameters.sort_values(by=['Total'],ascending=False)
    parameters = parameters.reset_index(drop=True)
    parameters.head(9)
    Parameters_1 = pd.DataFrame(parameters)
    Parameters_1
    parameters['Parameters'][0]
    total_model_parameters = total_model_parameters.append(parameters)
total_model_parameters
The error is raised at the line fitted_model = ExponentialSmoothing(train_df,trend=p['trend'],seasonal=p['seasonal'],seasonal_periods=p['seasonal_periods']).fit():
ValueError: unrecognized data structures: <class 'str'> / <class 'NoneType'>
Can someone help, please? :)
P.S. The data is as follows:
date visits
1/22/2021 7352070
1/29/2021 7063725
2/5/2021 9385950
2/12/2021 7851435
2/19/2021 9509640
2/26/2021 9919170
3/5/2021 9682125
3/12/2021 9597075
3/19/2021 8189835
3/26/2021 7487385
4/2/2021 8863965
4/9/2021 8856165
4/16/2021 8619345
4/23/2021 4499670
4/30/2021 3642705
5/7/2021 3105690
5/14/2021 3096330
5/21/2021 3240360
5/28/2021 5152410
6/4/2021 6471915
6/11/2021 4401030
6/18/2021 3197775
6/25/2021 2606340
7/2/2021 3248460
7/9/2021 4996425
7/16/2021 7775085
7/23/2021 9690795
7/30/2021 10041555
8/6/2021 11849055
8/13/2021 14598750
8/20/2021 15339390
8/27/2021 20118720
9/3/2021 12731115
9/10/2021 17456475
9/17/2021 20393850
9/24/2021 20537895
10/1/2021 20800935
10/8/2021 25035450
10/15/2021 22872450
10/22/2021 22790130
10/29/2021 22036965
11/5/2021 26988975
11/12/2021 29194530
11/19/2021 26106000
11/26/2021 29928660
12/3/2021 29254335
12/10/2021 32165430
12/17/2021 27303570
12/24/2021 21453585
12/31/2021 21568815
1/7/2022 21286680
1/14/2022 25589715
1/21/2022 21890130
1/28/2022 20881515
2/4/2022 24185835
2/11/2022 24160590
2/18/2022 20253360
2/25/2022 20450910
3/4/2022 26542320
3/11/2022 25540335
3/18/2022 29602380
3/25/2022 32258340
4/1/2022 24953640
4/8/2022 22872165
4/15/2022 25784490
4/22/2022 25168356
4/29/2022 25405687
5/6/2022 24693295
5/13/2022 26374944
5/20/2022 26192271
5/27/2022 26868125
6/3/2022 27948287
6/10/2022 28320595
6/17/2022 28153788
6/24/2022 27470327
7/1/2022 30520950
7/8/2022 28635750
7/15/2022 26269140
7/22/2022 24236250
7/29/2022 20541675
8/5/2022 21190020
8/12/2022 22389675
8/19/2022 24496455
8/26/2022 27555645
9/2/2022 26324760
9/9/2022 32937450
9/16/2022 36577425
9/23/2022 33522000
9/30/2022 30759780
10/7/2022 30615870
The problem is that you have quoted your variable names, so that
for train_df, test_df in [('train1','test1'),...]
iterates over the strings 'train1', 'test1', ... instead of over your DataFrames; those strings are then passed to ExponentialSmoothing, which is what raises ValueError: unrecognized data structures: <class 'str'>. The tuples shouldn't have the quotes.
You can do away with that line entirely if you're happy to put your pairs of training and test data into a list of tuples, like this:
import pandas as pd
from sklearn.model_selection import ParameterGrid
from statsmodels.tsa.holtwinters import ExponentialSmoothing
df = pd.read_csv("hw-cv-imputed.csv", index_col="date", parse_dates=True)
df.index.freq = "W-FRI"
# finding shape of the dataframe
print(df.shape)
# having a look at the data
print(df.head())
# plotting the original data
df[["visits"]].plot(title="visit Data")
# Splitting according to the above description
train_and_test = []
train_and_test.append((df.iloc[:52, 0], df.iloc[52:62, 0]))
train_and_test.append((df.iloc[:56, 0], df.iloc[56:66, 0]))
train_and_test.append((df.iloc[:60, 0], df.iloc[60:70, 0]))
train_and_test.append((df.iloc[:65, 0], df.iloc[65:75, 0]))
train_and_test.append((df.iloc[:69, 0], df.iloc[69:79, 0]))
train_and_test.append((df.iloc[:73, 0], df.iloc[73:83, 0]))
train_and_test.append((df.iloc[:78, 0], df.iloc[78:88, 0]))
train_and_test.append((df.iloc[:82, 0], df.iloc[82:90, 0]))
total_model_parameters = pd.DataFrame(columns=["Total", "Parameters"])
for train_df, test_df in train_and_test:
    params_grid = {
        "trend": ("mul", "add"),
        "seasonal": ("mul", "add"),
        "seasonal_periods": [10, 12],
    }
    grid = ParameterGrid(params_grid)
    cnt = 0
    for p in grid:
        cnt = cnt + 1
    print("Total Possible Models", cnt)
    model_parameters = pd.DataFrame(columns=["Total", "Parameters"])
    for p in grid:
        ...
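As a side note, the same list of splits can be built more compactly from the boundary indices (these are the same boundaries as above, just with less repetition):

splits = [(52, 62), (56, 66), (60, 70), (65, 75), (69, 79), (73, 83), (78, 88), (82, 90)]
train_and_test = [(df.iloc[:start, 0], df.iloc[start:end, 0]) for start, end in splits]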
I tried to use the PCA implementation from "Machine Learning in Action", but I found that the results it gives are not the same as those from the PCA in sklearn. I don't quite understand what is going on.
Below is my code:
import numpy as np
from sklearn.decomposition import PCA
x = np.array([
[1,2,3,4,5, 0],
[0.6,0.7,0.8,0.9,0.10, 0],
[110,120,130,140,150, 0]
])
def my_pca(data, dim):
    remove_mean = data - data.mean(axis=0)
    cov_data = np.cov(remove_mean, rowvar=0)
    eig_val, eig_vec = np.linalg.eig(np.mat(cov_data))
    sorted_eig_val = np.argsort(eig_val)
    eig_index = sorted_eig_val[:-(dim+1):-1]
    transfer = eig_vec[:,eig_index]
    low_dim = remove_mean * transfer
    return np.array(low_dim, dtype=float)
pca = PCA(n_components = 3)
pca.fit(x)
new_x = pca.transform(x)
print("sklearn")
print(new_x)
new_x = my_pca(x, 3)
print("my")
print(new_x)
Output:
sklearn
[[-9.32494230e+01 1.46120285e+00 2.37676120e-15]
[-9.89004904e+01 -1.43283197e+00 2.98143675e-14]
[ 1.92149913e+02 -2.83708789e-02 2.81307176e-15]]
my
[[ 9.32494230e+01 -1.46120285e+00 7.39333927e-14]
[ 9.89004904e+01 1.43283197e+00 -7.01760428e-14]
[-1.92149913e+02 2.83708789e-02 1.84375626e-14]]
The issue relates to your function, in particular the part where you calculate your eigenvector and eigenvalues:
eig_val, eig_vec = np.linalg.eig(np.mat(cov_data))
It appears that scikit-learn uses "eigh" instead of "eig", so if you change the code snippet from np.linalg.eig to np.linalg.eigh, you should get the same results.
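For reference, here is a minimal sketch of the function with only that swap applied (an assumption on my part, not verified against the book's code; note that the sign of a principal component is arbitrary, so a sign flip relative to sklearn can still remain):

import numpy as np

def my_pca_eigh(data, dim):
    remove_mean = data - data.mean(axis=0)
    cov_data = np.cov(remove_mean, rowvar=0)
    # eigh is intended for symmetric matrices (such as a covariance matrix)
    # and returns real eigenvalues in ascending order
    eig_val, eig_vec = np.linalg.eigh(cov_data)
    eig_index = np.argsort(eig_val)[:-(dim + 1):-1]  # indices of the `dim` largest eigenvalues
    return np.array(remove_mean @ eig_vec[:, eig_index], dtype=float)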
I am playing around with a DBSCAN example in order to see if it will work for me. In my case, I have clusters of a few points (3-5) close together with a fairly long distance in between clusters. I have tried to replicate the situation in the following code. I figured that with a low epsilon and low min_samples this should work, but instead it is telling me that it only sees 1 group (and 20 noise points?). Am I using this incorrectly, or is DBSCAN not good for this type of problem? I went with DBSCAN instead of k-means because I don't know beforehand exactly how many clusters there will be (1-5).
from sklearn.datasets import make_blobs
from sklearn.cluster import DBSCAN
import numpy as np
import matplotlib.pyplot as plt
# Configuration options
num_samples_total = 20
cluster_centers = [(3,3), (7,7),(7,3),(3,7),(5,5)]
num_classes = len(cluster_centers)
#epsilon = 1.0
epsilon = 1e-5
#min_samples = 13
min_samples = 2
# Generate data
X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05)
np.save('./clusters.npy', X)
X = np.load('./clusters.npy')
# Compute DBSCAN
db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X)
labels = db.labels_
no_clusters = len(np.unique(labels) )
no_noise = np.sum(np.array(labels) == -1, axis=0)
print('Estimated no. of clusters: %d' % no_clusters)
print('Estimated no. of noise points: %d' % no_noise)
# Generate scatter plot for training data
colors = list(map(lambda x: '#3b4cc0' if x == 1 else '#b40426', labels)) #only set for 2 colors
plt.scatter(X[:,0], X[:,1], c=colors, marker="o", picker=True)
plt.title('Two clusters with data')
plt.xlabel('Axis X[0]')
plt.ylabel('Axis X[1]')
plt.show()
I ended up going with k-means and a modified elbow method:
print(__doc__)
# Author: Phil Roth <mr.phil.roth#gmail.com>
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Configuration options
num_samples_total = 20
cluster_centers = [(3,3), (7,7),(7,3),(3,7),(5,5)]
num_classes = len(cluster_centers)
#epsilon = 1.0
epsilon = 1e-5
#min_samples = 13
min_samples = 2
# Generate data
X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05)
random_state = 170
#y_pred = KMeans(n_clusters=5, random_state=random_state).fit_predict(X)
#plt.scatter(X[:, 0], X[:, 1], c=y_pred)
#kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
#maybe I don't have to look for an elbow, just go until the value drops below 1.
#also if I do go too far, it just means that the same shape will be shown twice.
clusterIdx = 0
inertia = 100
while inertia > 1:
    clusterIdx = clusterIdx + 1
    kmeans = KMeans(n_clusters=clusterIdx, random_state=0).fit(X)
    inertia = kmeans.inertia_
    print(inertia)
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_)
print(clusterIdx)
plt.show()
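For completeness, DBSCAN can also separate blobs like these if eps is on the scale of the within-cluster point spacing rather than 1e-5: with cluster_std = 0.05 the points inside one blob are typically a few hundredths apart, so eps = 1e-5 marks every point as noise, and the single "cluster" the script reports is just the noise label -1 counted by np.unique. A rough sketch, assuming the same make_blobs setup as above:

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=20, centers=[(3, 3), (7, 7), (7, 3), (3, 7), (5, 5)],
                  cluster_std=0.05)
db = DBSCAN(eps=0.3, min_samples=2).fit(X)
# count clusters without treating the noise label (-1) as a cluster
n_clusters = len(set(db.labels_)) - (1 if -1 in db.labels_ else 0)
print(n_clusters)  # expected: 5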
I'm trying to figure out how to feed my data set into several scikit classification models.
When I run the code I get the following error:
Traceback (most recent call last):
File "<ipython-input-515-9a3302837c99>", line 3, in <module>
X, y = dataset
ValueError: too many values to unpack (expected 2)
Here is my code.
X = np.asarray([np.asarray(df['LRMScore']),np.asarray(df['Spread'])]).T
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
np.random.seed(0)
colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)
clustering_names = [
'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
'SpectralClustering', 'Ward', 'AgglomerativeClustering',
'DBSCAN', 'Birch']
plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
hspace=.01)
plot_num = 1
datasets = [X]
for i_dataset, dataset in enumerate(datasets):
    X, y = dataset
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)
    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    # create clustering estimators
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=2)
    ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=2,
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=.2)
    affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                       preference=-200)
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average", affinity="cityblock", n_clusters=2,
        connectivity=connectivity)
    birch = cluster.Birch(n_clusters=2)
    clustering_algorithms = [
        two_means, affinity_propagation, ms, spectral, ward, average_linkage,
        dbscan, birch]
    for name, algorithm in zip(clustering_names, clustering_algorithms):
        # predict cluster memberships
        t0 = time.time()
        algorithm.fit(X)
        t1 = time.time()
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(np.int)
        else:
            y_pred = algorithm.predict(X)
        # plot
        plt.subplot(4, len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)
        plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
        if hasattr(algorithm, 'cluster_centers_'):
            centers = algorithm.cluster_centers_
            center_colors = colors[:len(centers)]
            plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
        plt.xlim(-2, 2)
        plt.ylim(-2, 2)
        plt.xticks(())
        plt.yticks(())
        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes, size=15,
                 horizontalalignment='right')
        plot_num += 1
plt.show()
My X variable consists of two columns of a dataframe, and it looks like this.
array([[ 8. , 0.06],
[ 8. , 0.06],
[ 8. , 0.06],
...,
[10. , 0.01],
[ 8. , 0.03],
[ 9.75, 0.06]])
The datasets in that example consist of two arrays, X and y:
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None
My dataset consists of one array. That's the problem. I guess my setup has to be done slightly differently, but I'm not sure how that would look.
I got the code from the link below.
https://scikit-learn.org/0.18/auto_examples/cluster/plot_cluster_comparison.html
Since your X array has two columns, you need to transpose it in order to use value unpacking:
x, y = dataset.T
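To illustrate with the first rows of the array shown above: dataset has shape (n, 2), so dataset.T has shape (2, n) and its two rows unpack into the two columns.

import numpy as np

dataset = np.array([[8.0, 0.06],
                    [10.0, 0.01],
                    [9.75, 0.06]])
x, y = dataset.T   # dataset.T has shape (2, 3), so the two rows unpack into the columns
print(x)           # [ 8.   10.    9.75]
print(y)           # [0.06 0.01 0.06]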
That did it! Thanks parsa. Here is my final working solution.
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
np.random.seed(0)
pd.set_option('display.max_columns', 500)
df = pd.read_csv('C:\\your_path_here\\test.csv')
print('done!')
df = df[:10000]
df = df.fillna(0)
df = df.dropna()
X = df[['RatingScore',
'Par',
'Term',
'TimeToMaturity',
'LRMScore',
'Coupon',
'Price']]
#select your target variable
y = df[['Spread']]
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)
clustering_names = [
'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
'SpectralClustering', 'Ward', 'AgglomerativeClustering',
'DBSCAN', 'Birch']
plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
hspace=.01)
plot_num = 1
# blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)  # leftover from the example; n_samples is not defined here
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)
# estimate bandwidth for mean shift
bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
# create clustering estimators
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=2)
ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward',
connectivity=connectivity)
spectral = cluster.SpectralClustering(n_clusters=2,
eigen_solver='arpack',
affinity="nearest_neighbors")
dbscan = cluster.DBSCAN(eps=.2)
affinity_propagation = cluster.AffinityPropagation(damping=.9,
preference=-200)
average_linkage = cluster.AgglomerativeClustering(
linkage="average", affinity="cityblock", n_clusters=2,
connectivity=connectivity)
birch = cluster.Birch(n_clusters=2)
clustering_algorithms = [
two_means, affinity_propagation, ms, spectral, ward, average_linkage,
dbscan, birch]
for name, algorithm in zip(clustering_names, clustering_algorithms):
    # predict cluster memberships
    t0 = time.time()
    algorithm.fit(X)
    t1 = time.time()
    if hasattr(algorithm, 'labels_'):
        y_pred = algorithm.labels_.astype(np.int)
    else:
        y_pred = algorithm.predict(X)
    # plot
    plt.subplot(4, len(clustering_algorithms), plot_num)
    plt.title(name, size=18)  # i_dataset is not defined here (single dataset), so always draw the title
    plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
    if hasattr(algorithm, 'cluster_centers_'):
        centers = algorithm.cluster_centers_
        center_colors = colors[:len(centers)]
        plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
    plt.xlim(-2, 2)
    plt.ylim(-2, 2)
    plt.xticks(())
    plt.yticks(())
    plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
             transform=plt.gca().transAxes, size=15,
             horizontalalignment='right')
    plot_num += 1
plt.show()
I am learning about Linear Discriminant Analysis and am using the scikit-learn module. I am confused by the "coef_" attribute from the LinearDiscriminantAnalysis class. As far as I understand, these are the discriminant function coefficients (sklearn calls them weight vectors). Since there should be (n_classes-1) discriminant functions, I would expect the coef_ attribute to be an array with shape (n_components, n_features), but instead it prints an (n_classes, n_features) array. Below is an example of this using the Iris dataset example from sklearn. Since there are 3 classes and 2 components, I would expect print(lda.coef_) to give me a 2x4 array instead of a 3x4 array...
Maybe I'm misinterpreting what the weight vectors are, perhaps they are the coefficients for the classification function?
And how do I get the coefficients for each variable in each discriminant/canonical function?
Code here:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np
iris = datasets.load_iris()
X = iris.data
y = iris.target
target_names = iris.target_names
lda = LinearDiscriminantAnalysis(n_components=2,store_covariance=True)
X_r = lda.fit(X, y).transform(X)
colors = ['navy', 'turquoise', 'darkorange']  # plot colors (one per class; not defined in the original snippet)
plt.figure()
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], alpha=.8, color=color,
                label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.xlabel('Function 1 (%.2f%%)' %(lda.explained_variance_ratio_[0]*100))
plt.ylabel('Function 2 (%.2f%%)' %(lda.explained_variance_ratio_[1]*100))
plt.title('LDA of IRIS dataset')
print(lda.coef_)
#output -> [[ 6.24621637 12.24610757 -16.83743427 -21.13723331]
# [ -1.51666857 -4.36791652 4.64982565 3.18640594]
# [ -4.72954779 -7.87819105 12.18760862 17.95082737]]
You can calculate the coefficients with the following code:
def LDA_coefficients(X, lda):
    nb_col = X.shape[1]
    matrix = np.zeros((nb_col+1, nb_col), dtype=int)
    Z = pd.DataFrame(data=matrix, columns=X.columns)
    for j in range(0, nb_col):
        Z.iloc[j, j] = 1
    LD = lda.transform(Z)
    nb_funct = LD.shape[1]
    results = pd.DataFrame()
    index = ['const']
    for j in range(0, LD.shape[0]-1):
        index = np.append(index, 'C'+str(j+1))
    for i in range(0, LD.shape[1]):
        coef = [LD[-1][i]]
        for j in range(0, LD.shape[0]-1):
            coef = np.append(coef, LD[j][i]-LD[-1][i])
        result = pd.Series(coef)
        result.index = index
        column_name = 'LD' + str(i+1)
        results[column_name] = result
    return results
Before calling this function you need to complete the linear discriminant analysis:
lda = LinearDiscriminantAnalysis()
lda.fit(X,y)
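A usage sketch with the Iris data from the question, under these assumptions: X has to be a pandas DataFrame because the function uses X.columns, and numpy/pandas must be imported for the function above to run. With the default svd solver, the per-feature rows should coincide with lda.scalings_, while the 'const' row is the transform of the zero vector.

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

iris = datasets.load_iris()
X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

lda = LinearDiscriminantAnalysis()
lda.fit(X_df, y)

# one column per discriminant function (LD1, LD2), one row per feature plus a 'const' row
print(LDA_coefficients(X_df, lda))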