How to add legend to Matplotlib for cluster data? - python

How do I add a legend to the plot in my scenario? The `text` argument is text = tfidf.transform(document), and `clusters` holds the unsupervised cluster labels, ranging from 0 to 19, each cluster having its own bag of words. How do I add a legend to the plots? As it stands, it is impossible to tell which color corresponds to which cluster.
def plot_tsne_pca(data, labels):
    """Scatter-plot a random sample of `data` in PCA and t-SNE space, coloured by cluster.

    data   -- matrix with one row per document; it is row-indexed and converted
              with .todense(), so a sparse matrix (e.g. the output of
              tfidf.transform()) is assumed
    labels -- array of numeric cluster labels, one per row of `data`
    """
    max_label = max(labels)
    # Work on 3000 randomly chosen rows to keep PCA/t-SNE tractable.
    max_items = np.random.choice(range(data.shape[0]), size=3000, replace=False)

    pca = PCA(n_components=2).fit_transform(data[max_items,:].todense())
    tsne = TSNE().fit_transform(PCA(n_components=50).fit_transform(data[max_items,:].todense()))

    idx = np.random.choice(range(pca.shape[0]), size=3000, replace=False)
    label_subset = labels[max_items]
    # Each label is turned into an explicit RGBA colour by hand here.
    label_subset = [cm.hsv(i/max_label) for i in label_subset[idx]]

    f, ax = plt.subplots(1, 2, figsize=(20, 6))
    ax[0].scatter(pca[idx, 0], pca[idx, 1], c=label_subset)
    ax[0].set_title('PCA Cluster Plot')
    ax[1].scatter(tsne[idx, 0], tsne[idx, 1], c=label_subset)
    ax[1].set_title('TSNE Cluster Plot')


plot_tsne_pca(text, clusters)
Here is the full example of the code: https://pastebin.com/3PABg7xh

You can use legend_elements() to automatically return the lists of artists/labels (or a subset thereof) for legend creation. See Automated legend creation for more details
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from sklearn import (manifold, datasets)
# Example data: n_class=6 limits the digits dataset to six classes,
# so there are six distinct label values to colour by.
digits = datasets.load_digits(n_class=6)
X = digits.data
y = digits.target
# Embed the feature vectors in 2-D with t-SNE (PCA initialisation, fixed seed).
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
X_tsne = tsne.fit_transform(X)
fig, ax = plt.subplots()
# Pass the numeric labels through c= and let cmap= map them to colours;
# keep the returned collection so a legend can be derived from it.
sc = ax.scatter(X_tsne[:,0], X_tsne[:,1], c=y, cmap='tab10')
# legend_elements() returns the artists and labels, unpacked straight into legend().
ax.legend(*sc.legend_elements(), title='clusters')
EDIT
In your particular case, the code was not working because legend_elements() is meant to be used when you have a mapping between a numeric c= list and a colormap. But instead, you were passing a list of colors that you constructed by hand (label_subset = [cm.hsv(i/max_label) for i in label_subset[idx]]). If you remove that line and keep a numeric label_subset and map it to colors using cmap= then everything works as expected
def plot_tsne_pca(data, labels, sizelist, cmap='tab10'):
    """Plot a random sample of `data` in PCA and t-SNE space with a cluster legend.

    data     -- matrix with one row per document; it is row-indexed and
                converted with .todense(), so a sparse matrix (e.g. the output
                of tfidf.transform()) is assumed
    labels   -- array of numeric cluster labels, one per row of `data`
    sizelist -- number of rows to sample (without replacement)
    cmap     -- colormap that maps the numeric labels to colours; 'tab10' has
                only ten distinct colours, so pass e.g. 'tab20' when there are
                more clusters than that
    """
    # Sample the rows and densify them once; both projections share the sample.
    max_items = np.random.choice(range(data.shape[0]), sizelist, replace=False)
    dense = data[max_items, :].todense()

    pca = PCA(n_components=2).fit_transform(dense)
    # Pre-reduce to (at most) 50 components before t-SNE, as in the question's
    # code; the clamp keeps PCA valid for small samples or few features.
    n_pre = min(50, dense.shape[0], dense.shape[1])
    tsne = TSNE().fit_transform(PCA(n_components=n_pre).fit_transform(dense))

    idx = np.random.choice(range(pca.shape[0]), sizelist, replace=False)
    # Keep the labels numeric so cmap= can map them (required by
    # legend_elements()), and reorder them with idx exactly like the points,
    # otherwise each point would be coloured with another point's cluster.
    label_subset = labels[max_items][idx]

    f, ax = plt.subplots(1, 2, figsize=(20, 6))
    ax[0].scatter(pca[idx, 0], pca[idx, 1], c=label_subset, cmap=cmap)
    ax[0].set_title('PCA Cluster Plot')
    sc = ax[1].scatter(tsne[idx, 0], tsne[idx, 1], c=label_subset, cmap=cmap)
    ax[1].set_title('TSNE Cluster Plot')
    # num=None creates one legend entry per unique label; the default ("auto")
    # only targets about nine entries, which would hide clusters when there
    # are 20 of them.
    ax[1].legend(*sc.legend_elements(num=None), title='clusters')


plot_tsne_pca(text, clusters, sizelist)

Related

Double for loop to add multiple subplots on same figure

I am working with a clustering analysis problem. My goal is to create a double for loop which changes the numbers of clusters (3 different values for clusters) as well as cycling between the three linkage types per value cluster value. Then plot all of the subplots on the same figure.
I am hoping to achieve a 3x3 view of the subplots. Where each value of cluster is on the x-axis and each type of linkage correlating to the number of clusters is displayed down the y-axis.
The csv file I am working with is simply two columns with x1 and x2 values. I excluded the code where I import and read the csv file. The code I have thus far is as follows:
X1 = input_data.X1.values
X2 = input_data.X2.values
X = np.column_stack((X1, X2))
clusters = 4
Y_Kmeans = KMeans(n_clusters = clusters)
Y_Kmeans.fit(X)
Y_Kmeans_labels = Y_Kmeans.labels_
Y_Kmeans_silhouette = metrics.silhouette_score(X, Y_Kmeans_labels, metric='sqeuclidean')
linkage_types = ['ward', 'average', 'complete']
Y_hierarchy = AgglomerativeClustering(linkage=linkage_types[0], n_clusters=clusters)
Y_hierarchy.fit(X)
Y_hierarchy_labels = Y_hierarchy.labels_
Y_hierarchy_silhouette = metrics.silhouette_score(X, Y_hierarchy_labels,
metric='sqeuclidean')
I have tried this and am not getting the desired results:
# 3x3 grid: one panel per (number of clusters, linkage type) combination.
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
plt.subplots_adjust(hspace=0.5)
cluster = [4, 7, 10]
link = [0, 1, 2]
# NOTE: the loop variables i and j are never used and plt.scatter() always
# draws on the current Axes, so every iteration plots the same labels on the
# same panel. This is the attempt the question is asking about.
for i in cluster:
    for j in link:
        plt.scatter(X[:, 0], X[:, 1], c=colormap[Y_hierarchy_labels])
This is the output:
I see two problems:
you have to make calculations inside for-loops - and use i,j in KMeans(n_clusters=i) and AgglomerativeClustering(linkage=linkage_types[j], n_clusters=i)
you have to enumerate() cluster and link in for-loops to get ax = axs[number_cluster, number_link] and draw ax.scatter()
Minimal working code with random data.
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(0)  # reproducible random demo data

fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
plt.subplots_adjust(hspace=0.5)

cluster = [4, 7, 10]
link = [0, 1, 2]

# enumerate() yields the grid position (number_cluster, number_link) next to
# the actual parameter values (i, j), so each combination gets its own Axes.
for number_cluster, i in enumerate(cluster):
    # Y_Kmeans = KMeans(n_clusters=i)
    # ... code ...
    for number_link, j in enumerate(link):
        # Y_hierarchy = AgglomerativeClustering(linkage=linkage_types[j], n_clusters=i)
        # ... code ...

        # Random stand-in for the real data; its shape depends on i and j
        # only so that every panel looks different.
        X = np.random.rand(3+j, 3+i)
        print(X[:, 0], X[:, 1])

        ax = axs[number_cluster, number_link]
        ax.scatter(X[:, 0], X[:, 1])
        ax.set_title(f'cluster: {i}, link: {j}')

plt.show()

How to get a smooth continuous pdf of a Bayesian Gaussian mixture model?

Given the code below
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import BayesianGaussianMixture
df = pd.read_csv("dataset", delimiter=" ")
data = df.to_numpy()
X_train = np.reshape(data, (10*data.shape[0],2))
bgmm = BayesianGaussianMixture(n_components=15,
random_state=7,
max_iter=5000,
n_init=10,
weight_concentration_prior_type="dirichlet_distribution")
bgmm.fit(X_train)
logprob = bgmm.score_samples(X_train)
pdf = np.exp(logprob)
x = np.linspace(0, 1, num=20)
plt.plot(x, pdf, '-k', label='Mixture PDF')
plt.show()
I get the following discrete pdf:
How can I plot a smooth continuous version of this pdf?
Edit:
Here is the dataset:
[[6.11507621 6.2285484 ]
[5.61154419 7.4166868 ]
[5.3638034 8.64581576]
[8.58030274 6.01384676]
[2.06883754 8.5662325 ]
[7.772149 2.29177372]
[0.66223423 0.01642353]
[7.42461573 5.46288677]
[0.82355307 3.60322705]
[1.12966405 9.54888118]
[4.34716189 3.63203485]
[7.95368286 5.74659859]
[3.21564946 3.67576324]
[6.48021187 7.35190659]
[3.02668358 4.41981514]
[0.01745485 7.49153586]
[1.08490595 0.91004064]
[1.89995405 0.38728879]
[4.40549506 2.48715052]
[4.52857064 1.24935027]]
If the data are x and y values in 2D, you could try the following code to start experimenting:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.mixture import BayesianGaussianMixture
data = np.array([[6.11507621, 6.2285484], [5.61154419, 7.4166868], [5.3638034, 8.64581576], [8.58030274, 6.01384676],
[2.06883754, 8.5662325], [7.772149, 2.29177372], [0.66223423, 0.01642353], [7.42461573, 5.46288677],
[0.82355307, 3.60322705], [1.12966405, 9.54888118], [4.34716189, 3.63203485], [7.95368286, 5.74659859],
[3.21564946, 3.67576324], [6.48021187, 7.35190659], [3.02668358, 4.41981514], [0.01745485, 7.49153586],
[1.08490595, 0.91004064], [1.89995405, 0.38728879], [4.40549506, 2.48715052], [4.52857064, 1.24935027]])
X_train = data
bgmm = BayesianGaussianMixture(n_components=15,
random_state=7,
max_iter=5000,
n_init=10,
weight_concentration_prior_type="dirichlet_distribution")
bgmm.fit(X_train)
# create a mesh of points with x and y values going from -1 to 11
x, y = np.meshgrid(np.linspace(-1, 11, 30), np.linspace(-1, 11, 30))
# recombine x and y to tuples
xy = np.array([x.ravel(), y.ravel()]).T
logprob = bgmm.score_samples(xy)
pdf = np.exp(logprob).reshape(x.shape)
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 6))
# show the result of bgmm.score_samples on a mesh
ax1.imshow(pdf, extent=[-1, 11, -1, 11], cmap='Blues', interpolation='bilinear', origin='lower')
# show original data in red
ax1.scatter(data[:, 0], data[:, 1], color='red')
ax1.set_title('BayesianGaussianMixture')
# create a seaborn kdeplot from the same data
sns.kdeplot(x=data[:, 0], y=data[:, 1], fill=True, ax=ax2)
ax2.scatter(data[:, 0], data[:, 1], color='red')
ax2.set_aspect('equal', 'box')
ax2.set_xlim(-1, 11)
ax2.set_ylim(-1, 11)
ax2.set_title('Seaborn kdeplot')
plt.tight_layout()
plt.show()

3D plotting of a dataset that uses K-means

X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3) # Number of clusters == 3
kmeans = kmeans.fit(X) # Fitting the input data
labels = kmeans.predict(X) # Getting the cluster labels
centroids = kmeans.cluster_centers_ # Centroid values
print("Centroids are:", centroids) # From sci-kit learn
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111, projection='3d')
x = np.array(labels==0)
y = np.array(labels==1)
z = np.array(labels==2)
ax.scatter(x,y,z, marker="s"[kmeans.labels_], s=40, cmap="RdBu")
I am trying to plot the clusters in 3D by colouring all points according to the cluster they belong to, and to plot the centroids using a separate symbol. I managed to get the KMeans technique working, at least I believe I did, but I'm stuck trying to plot it in 3D. I believe there is a simple solution that I'm just not seeing. Does anyone have any idea what I need to change in my solution to achieve this?
import matplotlib.pyplot as plt
import numpy as np
# Only needed on older Matplotlib versions, where importing Axes3D registers
# the '3d' projection; harmless on newer ones.
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.datasets import make_swiss_roll

X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)

kmeans = KMeans(n_clusters=3) # Number of clusters == 3
kmeans = kmeans.fit(X) # Fitting the input data
labels = kmeans.predict(X) # Getting the cluster labels
centroids = kmeans.cluster_centers_ # Centroid values
# print("Centroids are:", centroids) # From sci-kit learn

fig = plt.figure(figsize=(10,10))
# Passing keyword arguments to fig.gca() was deprecated in Matplotlib 3.4 and
# later removed; add_subplot() is the supported way to create a 3D Axes.
ax = fig.add_subplot(111, projection='3d')

# Boolean masks selecting the rows of X that belong to each cluster.
x = np.array(labels==0)
y = np.array(labels==1)
z = np.array(labels==2)

ax.scatter(centroids[:,0],centroids[:,1],centroids[:,2],c="black",s=150,label="Centers",alpha=1)
ax.scatter(X[x,0],X[x,1],X[x,2],c="blue",s=40,label="C1")
ax.scatter(X[y,0],X[y,1],X[y,2],c="yellow",s=40,label="C2")
ax.scatter(X[z,0],X[z,1],X[z,2],c="red",s=40,label="C3")

# The label= arguments above only become visible once a legend is drawn.
ax.legend()
plt.show()
Try this; now the centroids are drawn as black X markers:
# Cluster a swiss-roll point cloud with K-means and draw the three clusters
# and their centroids in one 3D scatter plot.
# NOTE(review): plt and np are used below but not imported in this snippet;
# presumably `import matplotlib.pyplot as plt` and `import numpy as np` are
# expected to have been run already.
from sklearn.datasets import make_swiss_roll
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3) # Number of clusters == 3
kmeans = kmeans.fit(X) # Fitting the input data
labels = kmeans.predict(X) # Getting the cluster labels
centroids = kmeans.cluster_centers_ # Centroid values
print("Centroids are:", centroids) # From sci-kit learn
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111, projection='3d')
# Boolean masks: True for the rows of X assigned to cluster 0, 1 and 2.
x = np.array(labels==0)
y = np.array(labels==1)
z = np.array(labels==2)
# One scatter call per cluster; the columns of the masked rows are passed as
# the three coordinates, each cluster in its own colour.
ax.scatter(X[x][:, 0], X[x][:, 1], X[x][:, 2], color='red')
ax.scatter(X[y][:, 0], X[y][:, 1], X[y][:, 2], color='blue')
ax.scatter(X[z][:, 0], X[z][:, 1], X[z][:, 2], color='yellow')
# Centroids as large black 'x' markers; the high zorder is meant to keep
# them drawn on top of the data points.
ax.scatter(centroids[:, 0], centroids[:, 1], centroids[:, 2],
marker='x', s=169, linewidths=10,
color='black', zorder=50)

Vertical "broken" bar plot with arrays as bar heights and color coding

I am trying to create a bar plot that looks like this:
x axis is the number of detectors hit in coincidence (i.e. multiplicity)
for each multiplicity i have several events. The y axis contains the average pulse height of each event.The colors should correspond to the number of hits which have the shown pulse heights and appeared in events with the respective multiplicity
I have a dictionary that has multiplicities as keys and arrays of the average pulse heights as values:
averages = {2 : [...],
3 : [...],
4 : [...],
5 : [...],
6 : [...],}
# One plt.bar() call per multiplicity: the key is the x position and the
# array of average pulse heights supplies the bar heights.
for key in averages:
    plt.bar(key, averages[key], width=0.8)
i only know how to produce the simple version of a bar chart that looks like this:
Can someone tell me how to make the bars "broken" to show all pulse heights and add the color coding?
Not entirely clear but I think you want something like this
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np  # used below for np.sort / np.concatenate
import seaborn as sns
from scipy import stats

# Create some fake data that looks roughly like what you have
tips = sns.load_dataset("tips")
weights = stats.gaussian_kde(tips["total_bill"])(tips["total_bill"])
tips = tips.sample(frac=50, weights=weights, replace=True)

# Per x category: the sorted distinct values (one horizontal line each) and
# how often each value occurs (used for the colour of that line).
days = []
segments = []
counts = []
for day, x in tips["total_bill"].groupby(tips["day"]):
    days.append(day)
    segments.append(np.sort(x.unique()))
    counts.append(x.value_counts().sort_index())

# Map from counts to colors
norm = mpl.colors.Normalize(0, np.concatenate(counts).max())
colors = [mpl.cm.viridis(norm(c)) for c in counts]

f, ax = plt.subplots()

# Draw each horizontal line
events = ax.eventplot(segments, colors=colors, orientation="vertical", zorder=.5)
# Give the first collection the same norm so the colorbar built from it
# shows the count scale.
events[0].set_norm(norm)
f.colorbar(events[0])

# Add the mean/std for each x position
sns.pointplot(data=tips, x="day", y="total_bill", ci="sd", order=days, join=False, color=".1")
I took the question to need each horizontal line to represent each data value, but if you're satisfied with a histogram, this is two function calls in seaborn (>=0.11)
sns.histplot(
data=tips, x="day", y="total_bill",
discrete=(True, False), binwidth=(1, .5),
cmap="viridis", cbar=True, zorder=.5, alpha=.75,
)
sns.pointplot(
data=tips, x="day", y="total_bill",
ci="sd", order=days, join=False, color=".1",
)
Here is a solution which uses imshow to produce the columnwise "color histograms":
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
# Create dummy data
coincidences = [2, 3, 4, 5, 6]
n_list = [10000, 8000, 6000, 4000, 2000]
mu_list = np.array([200, 300, 400, 500, 600])
scale = 100
averages = {
    c: np.random.normal(loc=mu_list[i], scale=scale, size=n_list[i])
    for i, c in enumerate(coincidences)
}

# Calculate histogram for each column
bins = np.linspace(0, 1000, 1000)
# After the transpose each image column is one multiplicity and each row is
# one pulse-height bin.
hist_img = np.array(
    [np.histogram(averages[c], bins=bins)[0] for c in coincidences]
).T

# Create Normalized colormap
# norm = mpl.colors.Normalize()
norm = mpl.colors.LogNorm(vmin=1, vmax=hist_img.max())
sm = mpl.cm.ScalarMappable(cmap='viridis', norm=norm)

# Use colormap for img_hist and make zeros transparent
hist_img2 = sm.to_rgba(hist_img, bytes=True)
hist_img2[hist_img == 0, 3] = 0

# Plot
fig, ax = plt.subplots()
ax.imshow(hist_img2, aspect='auto', interpolation='none', origin='lower',
          extent=[1.5, 6.5, 0, 1000])
# `sm` is a free-standing ScalarMappable that is not attached to any Axes, so
# the Axes to take space from must be named explicitly; current Matplotlib
# versions raise an error when it cannot be determined.
fig.colorbar(sm, ax=ax)

mean = [np.mean(averages[c]) for c in coincidences]
std = [np.std(averages[c]) for c in coincidences]
ax.errorbar(coincidences, mean, yerr=std, ls='', c='k', capsize=3, label='std')
ax.plot(coincidences, mean, ls='', marker='o', c='b', label='mean')
ax.legend()

Draw lines connecting points between two separate one-D plots

As title, I am working on time-series alignment, and a visualization of the alignment result is desired.
To this end, I want to draw lines connecting "anchor points" generated by the alignment algorithm.
np.random.seed(5)
x = np.random.rand(10) # time-series 1
y = np.random.rand(20) # time-series 2
ap = np.array(([0, 4, 9], # the anchor points
[0, 9, 19]))
fig = plt.figure(figsize=(10,5))
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)
ax1.plot(x, 'r')
ax2.plot(y, 'g')
the anchor points ap in the example specify the one-to-one "mapping" between the indices of two time series x and y, i.e., x[0] is corresponding to y[0]; x[4] to y[9]; and x[9] to y[19]. The goal is to draw lines between two separate plot to show the result of the alignment.
To connect two subplots in matplotlib you may use a ConnectionPatch.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import ConnectionPatch
np.random.seed(5)
x = np.random.rand(21) # time-series 1
y = np.random.rand(21) # time-series 2
ap = np.array(([0, 5, 10], # the anchor points
[0,10, 20]))
fig = plt.figure(figsize=(10,5))
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)
ax1.plot(x, 'r')
ax2.plot(y, 'g')
ls = ["-","--"]
c = ["gold", "blue"]
# Each row of `ap` gets its own line style and colour (ls[i], c[i]); for every
# index in the row, a connector is drawn from (ind, y[ind]) in the lower Axes
# to (ind, x[ind]) in the upper Axes.
for i, row in enumerate(ap):
    for ind in row:
        px = (ind, x[ind])
        py = (ind, y[ind])
        con = ConnectionPatch(py, px, coordsA="data", coordsB="data",
                              axesA=ax2, axesB=ax1, linestyle=ls[i], color=c[i])
        # The patch is added to ax2, the Axes passed as axesA.
        ax2.add_artist(con)
plt.show()
Thanks to @ImportanceOfBeingErnest, I identified the typo in the OP and managed to connect indices between two series of different lengths:
np.random.seed(5)
x = np.random.rand(10)
y = np.random.rand(20)
ap = np.array(([0, 4, 9],
[0,9, 19]))
fig = plt.figure(figsize=(10,5))
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212, sharex=ax1)
ax1.plot(x, 'r')
ax2.plot(y, 'g')
plt.setp(ax1.get_xticklabels(), visible=False)
# Each column of `ap` (a row of ap.T) is one anchor pair:
# j[0] is the index in series x (top Axes), j[1] the index in series y (bottom Axes).
for j in ap.T:
    ax1.axvline(x=j[0], linestyle='--', color='k')
    ax2.axvline(x=j[1], linestyle='--', color='k')
    # Join the bottom edge of the top Axes to the top edge of the bottom Axes
    # at the two anchor positions.
    x_ind = (j[0], ax1.get_ylim()[0])
    y_ind = (j[1], ax2.get_ylim()[1])
    con = ConnectionPatch(y_ind, x_ind, coordsA="data", coordsB="data",
                          axesA=ax2, axesB=ax1, linewidth=1.5)
    ax2.add_artist(con)
I know it is off topic, but how can I further trim the blank part so that the range of the x-axis fits the signal length, while maintaining the actual ratio of the lengths of the two signals? Although sharex=ax1 shows the ratio of the signal lengths, the blank part on the right of the top figure is annoying.

Categories

Resources