I am trying to plot the Gaussian kernel density estimate (KDE) curve along with the histograms of two data sets in Python.
However, in my script, finding the 95% limit for data1 (marked by the red vertical line) and the 5% limit for data2 (marked by the black vertical line) is very time-consuming: I have to test different integration limits by hand (details in the code, where I change the upper limit repeatedly) until the integral under the kernel density curve reaches 95% or 5%.
Could someone suggest a way to fix this, or another approach for plotting the kernel density curve along with its 95% and 5% probability limits?
Thank you!
My script is below.
import gc

import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KernelDensity

data1 = result['95_24']  # data 1 (result is my existing DataFrame)
data2 = result['5_24']   # data 2

def plot_prob_density(data1, data2, x_start1, x_end1):
    fig, ax1 = plt.subplots(1, 1, figsize=(6, 5), sharey=False)
    x = np.linspace(-20, 20, 1000)[:, np.newaxis]

    # Histogram plot of the data
    ax1.hist(data1, bins=np.linspace(-20, 20, 40), density=True, color='r', alpha=0.4)
    ax1.hist(data2, bins=np.linspace(-20, 20, 40), density=True, color='k', alpha=0.4)

    # Kernel density estimation
    kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
    kd_data2 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data2)
    kd_vals_data1 = np.exp(kd_data1.score_samples(x))
    kd_vals_data2 = np.exp(kd_data2.score_samples(x))

    # Density plots
    ax1.plot(x, kd_vals_data1, color='r', label='$Na$', linewidth=2)
    ax1.plot(x, kd_vals_data2, color='k', label='$Λ$', linewidth=2)

    # Vertical lines at the limits found with get_probability()
    ax1.axvline(x=x_end1, color='red', linestyle='dashed', linewidth=3, label=r'$β_{95\%}$')
    ax1.axvline(x=x_start1, color='k', linestyle='dashed', linewidth=3, label=r'$β_{5\%}$')

    # Axis labels, limits and ticks
    ax1.set_ylabel('Probability density', fontsize=12)
    ax1.set_xlabel('Beta', fontsize=12)
    ax1.set_xlim([-20, 20])
    ax1.set_ylim(0, 0.3)
    ax1.set_yticks([0, 0.1, 0.2, 0.3])
    ax1.set_xticks([-20, -10, 0, 10, 20])
    ax1.legend(fontsize=12, loc='upper left', frameon=False)
    fig.tight_layout()
    gc.collect()
    return kd_data1, kd_data2

# Calculation of the 95% and 5% limits for the data1 and data2 kernel density curves
def get_probability(start_value, end_value, eval_points, kd):
    # Approximate the integral of the PDF between start_value and end_value
    N = eval_points                             # number of evaluation points
    step = (end_value - start_value) / (N - 1)  # step size
    x = np.linspace(start_value, end_value, N)[:, np.newaxis]
    kd_vals = np.exp(kd.score_samples(x))       # PDF values at each x
    probability = np.sum(kd_vals * step)
    return probability.round(4)

data1 = np.array(data1).reshape(-1, 1)
data2 = np.array(data2).reshape(-1, 1)
kd_data1, kd_data2 = plot_prob_density(data1, data2, x_start1=-2.2, x_end1=5.3)
# ##############################
print('Beta-95%: {}'.format(get_probability(start_value=-20,
                                            end_value=5.3,
                                            eval_points=1000,
                                            kd=kd_data1)))
# Here I modify end_value every time and check the output: when the integral
# reaches 95% I take that value as the 95% limit. This is very tedious; I want
# to compute the 95% limit directly, and likewise the 5% probability computed below:
print('Beta-5%: {}\n'.format(get_probability(start_value=-20,
                                             end_value=-2.2,
                                             eval_points=1000,
                                             kd=kd_data2)))
####################################################################
plt.savefig("Ev_test.png")
The resulting figure is attached: histogram and kernel density plot, with the 95% and 5% probability limits highlighted by the bold red and black vertical dashed lines.
Here is a possible way to fix this issue. Note, however, that this method has an error in the percentile calculation, so I recommend not relying on it:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

fig = plt.figure(figsize=(4, 4), dpi=300)
ax = fig.add_subplot(111)

# Plot the histograms (data7 and data8 are the two data sets)
ax.hist(data8, bins=20, zorder=1, color="r", density=True, alpha=0.6)
ax.hist(data7, bins=20, zorder=1, color="black", density=True, alpha=0.6)

# Fit the KDEs
kde = sm.nonparametric.KDEUnivariate(data8)
kde1 = sm.nonparametric.KDEUnivariate(data7)

# Plot the KDE for various bandwidths
for bandwidth in [1.8]:
    kde.fit(bw=bandwidth)   # estimate the densities
    kde1.fit(bw=bandwidth)
    ax.plot(kde.support, kde.density, "-", lw=2, color="r", zorder=10, alpha=0.6, label="Data1")
    ax.plot(kde1.support, kde1.density, "-", lw=2, color="black", zorder=10, alpha=0.6, label="Data2")

ax.legend(loc="best")
ax.set_xlim([-20, 40])
ax.set_ylim([0, 0.3])
ax.grid(False)

# Probability (quantile) calculation from the inverse CDF
quantiles_mesh = np.linspace(0, 1, len(kde.density))
fig = plt.figure(figsize=(2, 2), dpi=300)
plt.plot(quantiles_mesh, kde.icdf)
data_1_95 = np.percentile(kde.icdf, 95)   # the original used kde1/kde2 here; kde2 was undefined
data_2_5 = np.percentile(kde1.icdf, 5)
ax.axvline(x=data_1_95, color='red', linestyle='dashed', linewidth=2)
ax.axvline(x=data_2_5, color='k', linestyle='dashed', linewidth=2)
# plt.savefig("KDE_Plot.png")
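A more direct way to get the 95% and 5% limits, without manually tuning end_value, is to build the KDE's cumulative distribution on a grid and invert it by interpolation. This is a minimal sketch, assuming the fitted sklearn KernelDensity objects kd_data1 and kd_data2 from the first script; the helper name kde_quantile and the grid bounds are mine, not from the original post.

import numpy as np

def kde_quantile(kd, q, lo=-20.0, hi=20.0, n=2000):
    """Approximate the q-quantile of a fitted sklearn KernelDensity.

    Builds the CDF by cumulative trapezoidal integration on [lo, hi]
    and inverts it with linear interpolation.
    """
    x = np.linspace(lo, hi, n)
    pdf = np.exp(kd.score_samples(x[:, np.newaxis]))
    # Cumulative trapezoidal integral of the PDF
    cdf = np.concatenate([[0.0], np.cumsum((pdf[1:] + pdf[:-1]) / 2 * np.diff(x))])
    cdf /= cdf[-1]  # normalise; assumes negligible mass outside [lo, hi]
    return np.interp(q, cdf, x)

# Hypothetical usage with the KDEs fitted above:
beta_95 = kde_quantile(kd_data1, 0.95)  # x below which 95% of data1's density lies
beta_5 = kde_quantile(kd_data2, 0.05)   # x below which 5% of data2's density lies
print(f'Beta-95%: {beta_95:.3f}, Beta-5%: {beta_5:.3f}')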
Say I have data of shape (100, 580, 10), where 100 is the number of samples, each with 580 points and 10 features. I would like to use KMeans to compute the silhouette score for each of the 10 features across the 100 data samples, and plot the silhouette score against the number of clusters.
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# feature_vector is my data array of shape (100, 580, 10)
nr_clusters = range(2, 10)
silhouette_avg = {}

# loop through the data samples in the feature vector
for data_sample in range(len(feature_vector)):
    feature_data = feature_vector[data_sample]
    print('Here1', feature_data.shape)
    for feature in range(len(feature_vector[0])):
        silhouette_avg[data_sample, feature] = []
        for num_clusters in nr_clusters:
            kmeans = KMeans(n_clusters=num_clusters, max_iter=50).fit(feature_data)
            labels = kmeans.labels_
            silhouette_avg[data_sample, feature].append(silhouette_score(feature_data, labels))
            print(f'Silhouette score for data sample {data_sample} and feature {feature} '
                  f'with {num_clusters} clusters is {silhouette_score(feature_data, labels)}')
        plt.plot(nr_clusters, silhouette_avg[data_sample, feature], label=f'Data sample {data_sample}')

plt.ylabel('Silhouette Score')
plt.xlabel('Number of Clusters')
plt.title('Silhouette Score for each feature')
plt.legend()
I tried that, but it gives me the silhouette score per data sample rather than the silhouette scores of all 10 features across the 100 data samples.
I have also converted my feature_vector to a pandas DataFrame and then used that DataFrame to loop over the columns (the feature vectors) and collect the scores in a dictionary.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# feature_vector is now a DataFrame with one column per feature
nr_clusters = range(2, 10)
silhouette_avg = {}

# loop through the features (columns) of the feature vector
for feature in feature_vector.columns:
    silhouette_avg[feature] = []
    feature_data = feature_vector[feature].values.reshape(-1, 1)
    for num_clusters in nr_clusters:
        kmeans = KMeans(n_clusters=num_clusters, max_iter=50).fit(feature_data)
        labels = kmeans.labels_
        silhouette_avg[feature].append(silhouette_score(feature_data, labels))
        print(f'Silhouette score for feature {feature} with {num_clusters} clusters '
              f'is {silhouette_score(feature_data, labels)}')
fig, ax = plt.subplots(figsize=(10, 6))
linestyles = ['-', '--', ':', '-.', (0, (1, 10)), (0, (1, 1)), (0, (5, 10)),
              (0, (5, 1)), (0, (3, 10, 1, 10)), (0, (3, 1, 1, 1))]
for i, ls in enumerate(linestyles, start=1):
    key = f'feature_vec{i}'
    ax.plot(nr_clusters, silhouette_avg[key], linestyle=ls, label=key)
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Silhouette score')
ax.set_title('Silhouette score for each feature')
ax.legend(loc='best')
plt.show()
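For the original three-dimensional array, one way to score each feature across all 100 samples is to flatten the sample and point axes so that each feature becomes a single column of length 100 × 580, then cluster that column. This is a sketch under that assumption (the random stand-in data, the reshape, and the sample_size cap are mine, not from the original post):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
feature_vector = rng.normal(size=(100, 580, 10))  # stand-in for the real data

nr_clusters = range(2, 10)
n_features = feature_vector.shape[2]
# Collapse samples and points: column f holds every value of feature f
flat = feature_vector.reshape(-1, n_features)     # shape (100 * 580, 10)

silhouette_avg = {f: [] for f in range(n_features)}
for f in range(n_features):
    feature_data = flat[:, [f]]                   # one feature, kept 2D for sklearn
    for num_clusters in nr_clusters:
        labels = KMeans(n_clusters=num_clusters, n_init=10).fit_predict(feature_data)
        # silhouette is O(n^2); subsample to keep 58,000 points tractable
        silhouette_avg[f].append(
            silhouette_score(feature_data, labels, sample_size=2000, random_state=0))

for f in range(n_features):
    plt.plot(nr_clusters, silhouette_avg[f], label=f'feature {f}')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.legend()
plt.show()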
I am using the silhouette_score metric in sklearn to evaluate my KMeans model. I am using matplotlib to produce the plots and export them all into HTML to be viewed in client-side code (a dashboard). I noticed that my code (modified from sklearn's docs) exports only the last figure rather than all of them.
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import base64
from io import BytesIO

print(__doc__)

# Generating the sample data from make_blobs
# This particular setting has one distinct cluster and 3 clusters placed close
# together.
def silhouette(X, range_n_clusters=[2, 3, 4, 5, 6]):
    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)
        print(cluster_labels, 36)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors, edgecolor='k')

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                    c="white", alpha=1, s=200, edgecolor='k')

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                        s=50, edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

    tmpfile = BytesIO()
    plt.savefig(tmpfile, format='png')
    encoded = base64.b64encode(tmpfile.getvalue()).decode('utf-8')
    html = '<img src=\'data:image/png;base64,{}\'>'.format(encoded)
    return html
Any ideas on how to export the entire set of plots, and what is missing from the snippet above?
I only had to collect each figure and use fig.savefig instead of plt.savefig to solve the problem. Here is how:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import base64
from io import BytesIO

print(__doc__)

# Generating the sample data from make_blobs
# This particular setting has one distinct cluster and 3 clusters placed close
# together.
def silhouette(X, range_n_clusters=[2, 3, 4, 5, 6]):
    plts = []
    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)
        print(cluster_labels, 36)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors, edgecolor='k')

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                    c="white", alpha=1, s=200, edgecolor='k')

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                        s=50, edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')
        plts.append(fig)

    results = []
    for fig in plts:
        tmpfile = BytesIO()
        fig.savefig(tmpfile, format='png')
        tmpfile.seek(0)
        encoded = base64.b64encode(tmpfile.getvalue()).decode('utf-8')
        html = '<img src=\'data:image/png;base64,{}\'>'.format(encoded)
        results.append(html)
    return results
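A hypothetical usage sketch, writing the returned fragments into a single HTML file (the make_blobs parameters and the output file name are placeholders, not from the original post); closing the figures after saving keeps matplotlib from accumulating them in memory:

X, _ = make_blobs(n_samples=500, centers=4, random_state=1)

fragments = silhouette(X)
with open("silhouette_report.html", "w") as f:
    f.write("<html><body>\n")
    for fragment in fragments:
        f.write(fragment + "\n")
    f.write("</body></html>\n")

plt.close('all')  # free the figures once they are saved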
This program predicts the cluster to which a coordinate belongs, dividing the given points into two clusters, 0 and 1.
How do I get the accuracy of this model for the variable prediction?
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
#from sklearn.metrics import accuracy_score

X = np.array([[1, 2], [5, 8], [1.5, 1.8], [8, 8], [6, 7], [9, 11]])
print(X)

kmeans = KMeans(n_clusters=2)
kmeans.fit(X)

centroids = kmeans.cluster_centers_
labels = kmeans.labels_
print("Centroids :\n ", centroids)
print("Labels : ", labels)

colors = ["g.", "r.", "c.", "y."]
for i in range(len(X)):
    print("coordinate:", X[i], "label:", labels[i])
    plt.plot(X[i][0], X[i][1], colors[labels[i]], markersize=10)

plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=150, linewidths=5, zorder=10)
plt.show()

prediction = kmeans.predict([[5, 6]])
print(prediction)
If you know the correct values for the coordinates' labels, you can use scikit-learn's accuracy_score:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_true, y_pred))
This does seem tricky for a clustering problem though. Think about how you would determine whether a prediction is correct or not and calculate the accuracy around that.
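Because KMeans assigns the labels 0 and 1 arbitrarily, plain accuracy against ground truth can be misleading when the labels come out flipped. Here is a sketch of two common workarounds, assuming a hypothetical y_true for the six points above: a permutation-invariant metric such as adjusted_rand_score, or matching cluster labels to true labels before computing accuracy.

import numpy as np
from sklearn.metrics import accuracy_score, adjusted_rand_score
from scipy.optimize import linear_sum_assignment

y_true = np.array([0, 1, 0, 1, 1, 1])  # hypothetical ground truth for the six points
y_pred = labels                         # cluster labels from the fitted KMeans

# Option 1: a label-permutation-invariant score (1.0 means perfect agreement)
print("ARI:", adjusted_rand_score(y_true, y_pred))

# Option 2: find the best one-to-one mapping of cluster labels to true labels,
# then compute ordinary accuracy
n = max(y_true.max(), y_pred.max()) + 1
contingency = np.zeros((n, n), dtype=int)
for t, p in zip(y_true, y_pred):
    contingency[t, p] += 1
row, col = linear_sum_assignment(-contingency)  # maximise matched counts
mapping = dict(zip(col, row))
y_mapped = np.array([mapping[p] for p in y_pred])
print("Accuracy after matching:", accuracy_score(y_true, y_mapped))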
I am attempting automatic segmentation of the different regions of a 2D MR image based on pixel intensity values. The first step is fitting a Gaussian Mixture Model to the image's histogram.
I need to plot the Gaussian obtained from the score_samples method on top of the histogram. I have tried following the code in the answer to Understanding Gaussian Mixture Models.
However, the resulting Gaussian fails to match the histogram at all. How do I get the Gaussian to match the histogram?
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
# Read image
img = cv2.imread("test.jpg",0)
hist = cv2.calcHist([img],[0],None,[256],[0,256])
hist[0] = 0 # Removes background pixels
# Fit GMM
gmm = GaussianMixture(n_components = 3)
gmm = gmm.fit(hist)
# Evaluate GMM
gmm_x = np.linspace(0,255,256)
gmm_y = np.exp(gmm.score_samples(gmm_x.reshape(-1,1)))
# Plot histograms and gaussian curves
fig, ax = plt.subplots()
ax.hist(img.ravel(),255,[1,256])
ax.plot(gmm_x, gmm_y, color="crimson", lw=4, label="GMM")
ax.set_ylabel("Frequency")
ax.set_xlabel("Pixel Intensity")
plt.legend()
plt.show()
I also attempted to construct the mixture manually as a sum of Gaussians.
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
def gauss_function(x, amp, x0, sigma):
    return amp * np.exp(-(x - x0) ** 2. / (2. * sigma ** 2.))
# Read image
img = cv2.imread("test.jpg",0)
hist = cv2.calcHist([img],[0],None,[256],[0,256])
hist[0] = 0 # Removes background pixels
# Fit GMM
gmm = GaussianMixture(n_components = 3)
gmm = gmm.fit(hist)
# Evaluate GMM
gmm_x = np.linspace(0,255,256)
gmm_y = np.exp(gmm.score_samples(gmm_x.reshape(-1,1)))
# Construct function manually as sum of gaussians
gmm_y_sum = np.full_like(gmm_x, fill_value=0, dtype=np.float32)
for m, c, w in zip(gmm.means_.ravel(), gmm.covariances_.ravel(), gmm.weights_.ravel()):
    gauss = gauss_function(x=gmm_x, amp=1, x0=m, sigma=np.sqrt(c))
    gmm_y_sum += gauss / np.trapz(gauss, gmm_x) * w
# Plot histograms and gaussian curves
fig, ax = plt.subplots()
ax.hist(img.ravel(),255,[1,256])
ax.plot(gmm_x, gmm_y, color="crimson", lw=4, label="GMM")
ax.plot(gmm_x, gmm_y_sum, color="black", lw=4, label="Gauss_sum", linestyle="dashed")
ax.set_ylabel("Frequency")
ax.set_xlabel("Pixel Intensity")
plt.legend()
plt.show()
With ax.hist(img.ravel(), 255, [1, 256], density=True) (normed=True in older Matplotlib), the histogram is normalized to a probability density, so it can be compared directly with the GMM curve.
The issue was that I was passing the histogram rather than the array of pixel intensities to GaussianMixture.fit: gmm = gmm.fit(hist).
I also found that a minimum of n_components = 6 is needed to visually fit this particular histogram.
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

# Read image
img = cv2.imread("test.jpg", 0)

# Build the array of pixel intensities and remove background pixels
# (intensities 0 and 1)
data = img.ravel()
data = data[data != 0]
data = data[data != 1]

# Fit GMM on the intensities themselves, not on the histogram
gmm = GaussianMixture(n_components=6)
gmm = gmm.fit(X=np.expand_dims(data, 1))

# Evaluate GMM
gmm_x = np.linspace(0, 253, 256)
gmm_y = np.exp(gmm.score_samples(gmm_x.reshape(-1, 1)))

# Plot histogram and gaussian curves
fig, ax = plt.subplots()
ax.hist(img.ravel(), 255, [2, 256], density=True)  # normed=True in older Matplotlib
ax.plot(gmm_x, gmm_y, color="crimson", lw=4, label="GMM")
ax.set_ylabel("Frequency")
ax.set_xlabel("Pixel Intensity")
plt.legend()
plt.show()
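To see which intensity regions each mixture component captures (useful for picking segmentation thresholds), the individual weighted component densities can be drawn from the fitted parameters. This is a sketch using scipy.stats.norm, assuming the gmm, gmm_x, fig, and ax from the script above; the output file name is hypothetical.

from scipy.stats import norm

# Draw each weighted component density on the same axes
for m, c, w in zip(gmm.means_.ravel(), gmm.covariances_.ravel(), gmm.weights_.ravel()):
    component = w * norm.pdf(gmm_x, loc=m, scale=np.sqrt(c))
    ax.plot(gmm_x, component, lw=1.5, linestyle="--",
            label=f"component at {m:.1f}")
ax.legend(fontsize=8)
fig.savefig("gmm_components.png")  # hypothetical output file name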