I am using Python for k-means clustering on the MNIST database (http://yann.lecun.com/exdb/mnist/). I can cluster the data successfully, but I cannot label the clusters; that is, I cannot see which cluster number holds which digit. For example, cluster 5 might hold digit 7.
I need to write code that correctly labels the clusters after the k-means clustering has been done. I also need to add a legend to the plot.
from __future__ import division, print_function, absolute_import
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D #only needed for 3D plots
#scikit learn
from sklearn.cluster import KMeans
#pandas to read excel file
import pandas
import xlrd
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
df = pandas.read_csv('test_encoded_with_label.csv', header=None, delim_whitespace=True)
#df = pandas.read_excel('test_encoded_with_label.xls')
#print column names
print(df.columns)
df1 = df.iloc[:,0:2] #0 and 1, the last index is not used for iloc
labels = df.iloc[:,2]
labels = labels.values
dataset = df1.values
#train indices - depends how many samples
trainidx = np.arange(0,9999)
testidx = np.arange(0,9999)
train_data = dataset[trainidx,:]
test_data = dataset[testidx,:]
train_labels = labels[trainidx] #just 1D, no :
tpredct_labels = labels[testidx]
kmeans = KMeans(n_clusters=10, random_state=0).fit(train_data)
print(kmeans.labels_)  # cluster index assigned to each training sample
#print(kmeans.labels_.shape)
plt.scatter(train_data[:,0],train_data[:,1], c=kmeans.labels_)
predct_labels = kmeans.predict(train_data)
print(predct_labels)
print('actual label', tpredct_labels)
centers = kmeans.cluster_centers_
print(centers)
plt.show()
To create markers for labelled cluster points, you can use the annotate method.
Here is a sample run on the sklearn digits dataset, where I mark the centroids of the resulting clustering. Note that I label the clusters 0-9 purely for illustration:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
np.random.seed(42)
digits = load_digits()
data = scale(digits.data)
n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))
labels = digits.target
h = .02  # mesh step size (left over from the sklearn example; unused in this snippet)
reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)
centroids = kmeans.cluster_centers_
plt_data = plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=kmeans.labels_,
                       cmap=plt.cm.get_cmap('Spectral', 10))
plt.colorbar()
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', color='w')
plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlabel('component 1')
plt.ylabel('component 2')
labels = ['{0}'.format(i) for i in range(10)]
for i in range(10):
    xy = (centroids[i, 0], centroids[i, 1])
    plt.annotate(labels[i], xy, horizontalalignment='right', verticalalignment='top')
plt.show()
The resulting plot shows the ten clusters coloured by label, with the centroids annotated 0-9.
To add the legend, try:
scatter = plt.scatter(train_data[:,0], train_data[:,1], c=kmeans.labels_)
plt.legend(*scatter.legend_elements(), title='cluster')
(Passing label=kmeans.labels_ to a single scatter call does not work: it creates one legend entry whose text is the whole array. legend_elements, available since matplotlib 3.1, builds one entry per colour instead.)
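k-means itself never sees the digit identities, so the cluster numbers are arbitrary. A common way to label the clusters afterwards is majority voting against the known labels. A minimal sketch, assuming the kmeans and train_labels variables from the first code block (and that train_labels holds the true digits as integers):

import numpy as np

# Map each cluster number to the most frequent true digit among its members.
cluster_to_digit = {}
for c in range(10):
    members = train_labels[kmeans.labels_ == c].astype(int)   # true digits inside cluster c
    cluster_to_digit[c] = int(np.bincount(members).argmax())  # majority vote
print(cluster_to_digit)  # e.g. {0: 7, 1: 2, ...}: cluster number -> digit

# Translate cluster assignments into digit predictions and check them:
digit_preds = np.array([cluster_to_digit[c] for c in kmeans.labels_])
print('majority-vote accuracy:', (digit_preds == train_labels).mean())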
Related
I tried to visualise the PCA-transformed data of the MNIST digit dataset using the sns.scatterplot and plt.scatter approaches below.
from keras.datasets import mnist
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
(X_train, y_train), (X_test, y_test) = mnist.load_data()
dim_1 = X_train.shape[0]
dim_2 = X_train.shape[1]
dim_3 = X_train.shape[2]
arr = X_train.reshape(dim_1, dim_2 * dim_3)
sc = StandardScaler()
norm_arr = sc.fit_transform(arr)
pca = PCA(n_components=2)
pca_arr = pca.fit_transform(norm_arr)
pca_arr = np.vstack((pca_arr.T, y_train)).T
pca_df = pd.DataFrame(data=pca_arr, columns=("1st_principal", "2nd_principal", "label"))
pca_df = pca_df.astype({'label': 'int32'})
Mapping plt.scatter over an sns.FacetGrid produces this visual:
sns.FacetGrid(pca_df, hue="label", height=6).map(plt.scatter, '1st_principal', '2nd_principal').add_legend()
plt.show()
On the other hand, the plot from seaborn's scatterplot is quite different; in particular, digit 9 is in the bottom right corner instead of the upper left corner, compared with the first plot.
plt.figure(figsize=(7,7))
sns.scatterplot(x=pca_arr_combo[:, 0], y=pca_arr_combo[:, 1],
                hue=pca_arr_combo[:, 2], palette=sns.hls_palette(10), legend='full')
plt.show()
Can someone please explain why two different visuals can be produced from the same dataset? I was wondering whether sns.FacetGrid had something to do with it, but I am not sure why. Which scatterplot is correct?
Thanks.
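One thing worth checking before blaming FacetGrid (a hedged observation, not a definitive answer): the second plot is drawn from pca_arr_combo, which is not defined in the code shown, while the first is drawn from pca_df built out of pca_arr. If the two arrays come from separate PCA fits, the components can legitimately differ by a sign flip, and a flipped component mirrors the point cloud (moving digit 9 from one corner to the opposite one) without changing its structure. A quick check, assuming both arrays exist in your session:

import numpy as np

print(np.allclose(pca_arr[:, :2], pca_arr_combo[:, :2]))   # identical data?
print(np.allclose(pca_arr[:, 0], -pca_arr_combo[:, 0]))    # first component sign-flipped?
print(np.allclose(pca_arr[:, 1], -pca_arr_combo[:, 1]))    # second component sign-flipped?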
I am trying to utilize k-means clustering in Python and I am encountering an error. The dataset that I am working with can be found here:
https://www.fueleconomy.gov/feg/ws/index.shtml#vehicle
import pandas as pd
import seaborn as sns
import numpy as np
#from matplotlib import pyplot as plt
import matplotlib.pyplot as plt
from matplotlib import style
%matplotlib inline
from sklearn.preprocessing import scale, StandardScaler
import sklearn
from sklearn.cluster import KMeans
df = pd.read_csv('vehicles.csv')
df2 = df[['comb08','youSaveSpend']].copy()
scaler = StandardScaler()
scaler.fit(df2)
scaled_array = scaler.transform(df2)
average = np.mean(scaled_array[:,0])
std = np.std(scaled_array[:,0])
df2 = scaled_array
max_clusters = 10
noClusters = range(1, max_clusters + 1)
kmeans = [KMeans(n_clusters = i) for i in noClusters]
score = [kmeans[i].fit(df2).score(df2) for i in range(len(kmeans))]
plt.plot(noClusters, score)
plt.xlabel('Number of Clusters')
plt.ylabel("Score")
plt.title('Elbow Curve')
kmeans = KMeans(n_clusters = 10, random_state = 0)
kmeans = kmeans.fit(scaled_array)
unscaled = scaler.inverse_transform(kmeans.cluster_centers_)
unscaled
centroids = pd.DataFrame({'centroidx':unscaled[:,0],'centroidy':unscaled[:,1]})
df2['label'] = kmeans.labels_.astype(np.int)
df2.head() # <======== Error Occurs Here
plt.scatter(df2['comb08'], df2['youSaveSpend'], c=df2.label) # (x,y,color)
plt.scatter(centroids['centroidx'], centroids['centroidy'], c='red') # (x,y,color)
plt.show() # <======== Error also Occurs Here
The error I get is shown below:
When I try to convert my floats like so:
df2 = df.apply(pd.to_numeric)
I get this error:
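No traceback is shown, so this is a hedged guess rather than a confirmed diagnosis: df2 starts life as a DataFrame but is reassigned to the NumPy array scaled_array, and an array supports neither df2['label'] = ... nor df2.head(). A sketch that keeps the scaled data in a DataFrame instead (column names reused from the question; df2_scaled is my own name):

import pandas as pd
import matplotlib.pyplot as plt

# Rebuild a DataFrame from the scaled array instead of overwriting df2 with it.
df2_scaled = pd.DataFrame(scaled_array, columns=['comb08', 'youSaveSpend'])
df2_scaled['label'] = kmeans.labels_.astype(int)  # plain int: np.int was removed from recent NumPy
print(df2_scaled.head())

plt.scatter(df2_scaled['comb08'], df2_scaled['youSaveSpend'], c=df2_scaled['label'])
plt.scatter(centroids['centroidx'], centroids['centroidy'], c='red')
plt.show()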
I am working on text clustering and need to plot the data using different colours.
I used the k-means method for clustering and tf-idf for similarity.
kmeans_labels =KMeans(n_clusters=3).fit(vectorized_text).labels_
pipeline = Pipeline([('tfidf', TfidfVectorizer())])
X = pipeline.fit_transform(X_train['Sentences']).todense()
pca = PCA(n_components=2).fit(X)
data2D = pca.transform(X)
plt.scatter(data2D[:,0], data2D[:,1])
kmeans.fit(X)
centers2D = pca.transform(kmeans.cluster_centers_)
labels=np.array([kmeans.labels_])
Currently, my output looks like this; there are only a few elements, as it is a test.
I need to add labels (they are strings) and differentiate the dots by cluster: each cluster should have its own colour to make the chart easy for the reader to analyse.
Could you please tell me how to change my code to include both labels and colours? Any example would be great.
A sample of my dataset (the output above was generated from a different sample):
Sentences
Where do we do list them? ...
Make me a list of the things we would need and I'll take you into town. ...
Do you have a list yet? ...
The first was a list for Howie. ...
You're not on my list tonight. ...
I'm gonna print this list on my computer, given you're always bellyaching about my writing.
We can use an example dataset:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.cm as cm
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
newsgroups = fetch_20newsgroups(subset='train',
                                categories=['talk.religion.misc', 'sci.space', 'misc.forsale'])
X_train = newsgroups.data
y_train = newsgroups.target
pipeline = Pipeline([('tfidf', TfidfVectorizer(max_features=5000))])
X = pipeline.fit_transform(X_train).toarray()  # dense ndarray; newer sklearn rejects the np.matrix that .todense() returns
pca = PCA(n_components=2).fit(X)
data2D = pca.transform(X)
And do KMeans as you did, obtaining the clusters and centers, then add a name for each cluster:
kmeans = KMeans(n_clusters=3).fit(X)
centers2D = pca.transform(kmeans.cluster_centers_)
labels=kmeans.labels_
cluster_name = ["Cluster"+str(i) for i in set(labels)]
You can add the colours by passing the cluster labels to "c=" and choosing a colormap from cm, or by defining your own map:
plt.scatter(data2D[:, 0], data2D[:, 1], c=labels, cmap='Set3', alpha=0.7)
for i, txt in enumerate(cluster_name):
    plt.text(centers2D[i, 0], centers2D[i, 1], s=txt, ha="center", va="center")
You can also consider using seaborn:
sns.scatterplot(x=data2D[:, 0], y=data2D[:, 1], hue=labels, legend='full', palette="Set1")
Picking up on your code, try the following:
kmeans = KMeans(n_clusters=3)
pipeline = Pipeline([('tfidf', TfidfVectorizer())])
X = pipeline.fit_transform(X_train['Sentences']).toarray()
pca = PCA(n_components=2).fit(X)
data2D = pca.transform(X)
kmeans.fit(X)
centers2D = pca.transform(kmeans.cluster_centers_)
group = kmeans.labels_
cdict = {0: 'red', 1: 'blue', 2: 'green'}
ldict = {0: 'label_1', 1: 'label_2', 2: 'label_3'}
fig, ax = plt.subplots()
for g in np.unique(group):
    ix = np.where(group == g)
    ax.scatter(data2D[:, 0][ix], data2D[:, 1][ix], c=cdict[g], label=ldict[g], s=100)
ax.legend()
plt.show()
I'm assuming your kmeans has n_clusters=3. cdict and ldict need to be set up to match the number of clusters: here cluster 0 will be red with label label_1, cluster 1 will be blue with label label_2, and so on.
EDIT: I changed the keys of cdict to start from 0.
EDIT 2: Added labels.
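If the number of clusters may change, the two dictionaries can be generated instead of hand-written. A small sketch (the tab10 colormap and the label pattern are my own choices, not part of the answer above):

import matplotlib.pyplot as plt

n_clusters = 3  # keep in sync with KMeans(n_clusters=...)
cdict = {g: plt.cm.tab10(g) for g in range(n_clusters)}           # one distinct colour per cluster
ldict = {g: 'label_{}'.format(g + 1) for g in range(n_clusters)}  # label_1, label_2, ...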
I am using the following code to perform PCA on the iris dataset:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# get iris data to a dataframe:
from sklearn import datasets
iris = datasets.load_iris()
varnames = ['SL', 'SW', 'PL', 'PW']
irisdf = pd.DataFrame(data=iris.data, columns=varnames)
irisdf['Species'] = [iris.target_names[a] for a in iris.target]
# perform pca:
from sklearn.decomposition import PCA
model = PCA(n_components=2)
scores = model.fit_transform(irisdf.iloc[:,0:4])
loadings = model.components_
# plot results:
scoredf = pd.DataFrame(data=scores, columns=['PC1','PC2'])
scoredf['Grp'] = irisdf.Species
sns.lmplot(fit_reg=False, x="PC1", y='PC2', hue='Grp', data=scoredf)  # plot points
loadings = loadings.T
for e, pt in enumerate(loadings):
    plt.plot([0, pt[0]], [0, pt[1]], '--b')
    plt.text(x=pt[0], y=pt[1], s=varnames[e], color='b')
plt.show()
I am getting the following plot:
However, when I compare it with plots from other sites (e.g. http://marcoplebani.com/pca/), my plot does not look correct. The following differences seem to be present:
Petal length and petal width lines should have similar lengths.
Sepal length line should be closer to petal length and petal width lines rather than closer to sepal width line.
All 4 lines should be on the same side of x-axis.
Why is my plot not correct? Where is the error, and how can it be corrected?
It depends on whether or not you scale the variance. The "other site" uses scale=TRUE (in R). If you want to do this with sklearn, add a StandardScaler before fitting the model and fit the model with the scaled data, like this:
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(irisdf.iloc[:,0:4])
scores = model.fit_transform(X)
Edit: difference between StandardScaler and normalize
Here is an answer which points out a key difference (row vs column). Even if you use normalize here, you might want to consider X = normalize(X.T).T. The following code shows some differences after the transformations:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, normalize
iris = datasets.load_iris()
varnames = ['SL', 'SW', 'PL', 'PW']
fig, ax = plt.subplots(2, 2, figsize=(16, 12))
irisdf = pd.DataFrame(data=iris.data, columns=varnames)
irisdf.plot(kind='kde', title='Raw data', ax=ax[0][0])
irisdf_std = pd.DataFrame(data=StandardScaler().fit_transform(irisdf), columns=varnames)
irisdf_std.plot(kind='kde', title='StandardScaler', ax=ax[0][1])
irisdf_norm = pd.DataFrame(data=normalize(irisdf), columns=varnames)
irisdf_norm.plot(kind='kde', title='normalize (per sample/row)', ax=ax[1][0])
irisdf_norm = pd.DataFrame(data=normalize(irisdf.T).T, columns=varnames)
irisdf_norm.plot(kind='kde', title='normalize (per feature/column)', ax=ax[1][1])
plt.show()
I'm not sure how deep I can go into the algorithm/math. The point of StandardScaler is to get uniform mean and variance across features. The assumption is that variables with large measurement units are not necessarily (and should not be) dominant in the PCA; in other words, StandardScaler makes the features contribute equally. As you can see, normalize does not give a consistent mean or variance.
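A quick numeric check of that claim, as a self-contained sketch: after StandardScaler every column has mean ~0 and standard deviation ~1, while normalize (which by default rescales each row to unit L2 norm) leaves the column statistics uneven.

import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, normalize

X = datasets.load_iris().data
X_std = StandardScaler().fit_transform(X)
X_norm = normalize(X)  # default: each row scaled to unit L2 norm

print(X_std.mean(axis=0).round(2), X_std.std(axis=0).round(2))    # ~[0 0 0 0] and ~[1 1 1 1]
print(X_norm.mean(axis=0).round(2), X_norm.std(axis=0).round(2))  # uneven per column
print(np.linalg.norm(X_norm, axis=1).round(2)[:5])                # every row has norm 1.0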
I am working on fuzzy c-means clustering of the iris dataset, but I cannot visualize it due to some errors. Using this tutorial I wrote the following for iris; however, it raises "AttributeError: shape". This is my code:
from sklearn import datasets
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as sm
import skfuzzy as fuzz
iris = datasets.load_iris()
x = pd.DataFrame(iris.data, columns=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
y = pd.DataFrame(iris.target, columns=['Target'])
plt.figure(figsize=(6, 3))
model =fuzz.cluster.cmeans(iris,3,2,error=0.005,maxiter=1000,init=None,seed=None)
model.fit(x)
plt.show()
I assumed that passing the parameters in the variable model would be enough; however, it shows the above error. If possible, could you show where I made a mistake, and how to fix it? I really appreciate your help!
I tried pre-processing the data first and created a good plot. I simply followed the tutorials and performed SVD to reduce the dimensionality to two, then started to plot; it seems the tutorials only need two dimensions (x, y). You do not need model.fit(): I have not found that command in the documentation. Here is the code:
import numpy as np, pandas as pd, os
import matplotlib
import matplotlib.pyplot as plt
import itertools
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
import skfuzzy as fuzz
from sklearn import datasets
################################################################################
iris = datasets.load_iris()
x = pd.DataFrame(iris.data, columns=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
y = pd.DataFrame(iris.target, columns=['Target'])
scaler = StandardScaler()
X_std = scaler.fit_transform(x)
lsa = TruncatedSVD(2, algorithm = 'arpack')
dtm_lsa = lsa.fit_transform(X_std)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
a = pd.DataFrame(dtm_lsa, columns=["component_1", "component_2"])
a['targets'] = y['Target']
fig1, axes1 = plt.subplots(3, 3, figsize=(8, 8))
alldata = np.vstack((a['component_1'], a['component_2']))
fpcs = []
colors = ['b', 'orange', 'g', 'r', 'c', 'm', 'y', 'k', 'Brown', 'ForestGreen']
for ncenters, ax in enumerate(axes1.reshape(-1), 2):
    cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
        alldata, ncenters, 2, error=0.005, maxiter=1000, init=None)
    # Store fpc values for later plots
    fpcs.append(fpc)
    # Plot assigned clusters, for each data point in training set
    cluster_membership = np.argmax(u, axis=0)
    for j in range(ncenters):
        ax.plot(a['component_1'][cluster_membership == j],
                a['component_2'][cluster_membership == j], '.', color=colors[j])
    # Mark the center of each fuzzy cluster
    for pt in cntr:
        ax.plot(pt[0], pt[1], 'rs')
    ax.set_title('Centers = {0}; FPC = {1:.2f}'.format(ncenters, fpc))
    ax.axis('off')
fig1.tight_layout()
fig1.savefig('iris_dataset.png')
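The loop stores the FPC values "for later plots", but that follow-up plot is not shown. Here is a short sketch of it, assuming the fpcs list filled above (fig2 is my own name): plotting FPC against the number of centers helps choose the cluster count, since an FPC closer to 1 indicates crisper clustering.

import numpy as np
import matplotlib.pyplot as plt

fig2, ax2 = plt.subplots()
ax2.plot(np.r_[2:11], fpcs)  # ncenters ran from 2 to 10 in the loop above
ax2.set_xlabel('Number of centers')
ax2.set_ylabel('Fuzzy partition coefficient (FPC)')
plt.show()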