# cluster the 3411x128 word2vec matrix into 3 groups
k_model = KMeans(n_clusters=3).fit(actor_w2vec)
# map each cluster label to the row indices assigned to it
cluster_dict = {i: np.where(k_model.labels_ == i)[0] for i in range(k_model.n_clusters)}
I have applied KMeans to the word2vec vectors (a 3411x128 matrix). cluster_dict contains the cluster label (i.e. 0, 1, 2) as key and the row indices (0, 1, 2, ..., 3410) as values, so that all rows are distributed among the three clusters.
Now I want to visualize these clusters, so I used TSNE to reduce the 128-dimensional vectors to 2 dimensions:
from sklearn.manifold import TSNE

node_embeddings = actor_w2vec
transform = TSNE  # or PCA
trans = transform(n_components=2)
node_embeddings_2d = trans.fit_transform(node_embeddings)
but I don't know how to combine these two in order to create a graph or scatter plot where all the points belonging to one cluster are grouped together.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly.express as px

df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list('ABCD'))
#remember to scale your data if the ranges are too broad
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df)
kmeans_model = KMeans(n_clusters=3, max_iter=500, random_state=42)
y_km = kmeans_model.fit_predict(scaled_features)
pca_model = PCA(n_components=2, random_state=42)
transformed = pca_model.fit_transform(scaled_features)
centers = pca_model.transform(kmeans_model.cluster_centers_)
fig = px.scatter(x=transformed[:, 0], y=transformed[:, 1], color=y_km)
fig.add_scatter(
    x=centers[:, 0],
    y=centers[:, 1],
    mode="markers",  # markers only, no connecting lines
    marker=dict(size=20, color="LightSeaGreen"), name="Centers"
)
fig.show()
If you only do kmeans.fit(df), you can get the labels from kmeans.labels_.
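For example, a minimal sketch of that variant, reusing scaled_features from above:
# fit without predict, then read the labels attribute
kmeans_model.fit(scaled_features)
y_km = kmeans_model.labels_  # same labels as fit_predict(scaled_features)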
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns; sns.set()
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
plt.rcParams['figure.dpi'] = 150
# create dataset
X, y = make_blobs(
n_samples=150, n_features=2,
centers=3, cluster_std=0.5,
shuffle=True, random_state=0
)
# plot
plt.scatter(
X[:, 0], X[:, 1],
edgecolor='black', s=50
)
plt.show()
km = KMeans(
n_clusters=3, init='random',
n_init=10, max_iter=10000,
tol=1e-04, random_state=0
)
y_km = km.fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=y_km, s=50, cmap=plt.cm.Paired, alpha=0.4)
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
            s=250, marker='*', label='centroids',
            edgecolor='black',
            c=np.arange(0, 3), cmap=plt.cm.Paired)
plt.legend()
plt.show()
The line import seaborn as sns; sns.set() is not necessary; it just gives a nicer style.
For plotting you can use matplotlib.pyplot. You can also check the shape of your data with node_embeddings_2d.shape to make sure plt.scatter gets the right arguments.
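For example, a minimal sketch along those lines, assuming node_embeddings_2d and k_model from your snippets:
import matplotlib.pyplot as plt

print(node_embeddings_2d.shape)  # should be (3411, 2)
# color each embedded 2D point by its KMeans cluster label
plt.scatter(node_embeddings_2d[:, 0], node_embeddings_2d[:, 1],
            c=k_model.labels_, cmap=plt.cm.Paired, s=20)
plt.show()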
Good luck! ;)
I applied K-Means clustering on my data and afterwards applied t-SNE to plot it. I have 4 dimensions and 4 groups. The problem is that my K-Means result is correct, but with t-SNE the members of the same group are not all together. Why?
The code:
XX = df [["agent_os_new","agent_category_new","referer_new","agent_name_new"]]
y = df['referer_new']
y
cols = XX.columns
from sklearn.preprocessing import MinMaxScaler
ms = MinMaxScaler()
X = ms.fit_transform(XX)
X = pd.DataFrame(X, columns=cols)
X[:4]
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4, random_state=0)
ymeans = kmeans.fit(X)
ymeans
labels = kmeans.labels_
df_new = XX.assign(Cluster =labels)
df_new
from sklearn.manifold import TSNE
import seaborn as sns
X_embedded = TSNE(n_components=2).fit_transform(df_new)
df_subset = pd.DataFrame()
df_subset['tsne1'] = X_embedded[:,0]
df_subset['tsne2'] = X_embedded[:,1]
plt.figure(figsize=(16,10))
sns.scatterplot(
x="tsne1", y="tsne2",
hue=df.label,
palette="Set1",
data=df_subset,
style=df_new["Cluster"],
legend="full",
s=120
)
What I want:
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt

X_embedded = TSNE(n_components=2, random_state=42).fit_transform(X)
model = KMeans(n_clusters=4, init="k-means++")
label = model.fit_predict(X_embedded)
centers = np.array(model.cluster_centers_)  # centroids in the 2D embedded space
plt.figure(figsize=(10, 10))
uniq = np.unique(label)
for i in uniq:
    plt.scatter(X_embedded[label == i, 0], X_embedded[label == i, 1], label=i)
# mark the centroid of each cluster
plt.scatter(centers[:, 0], centers[:, 1], marker="x", color='k')
plt.legend()
plt.show()
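If you would rather keep the labels from the KMeans fitted on the original features instead of re-clustering the embedding, note that t-SNE has no transform method for the original centroids; a minimal sketch, assuming labels and X_embedded from above, is to place each marker at the mean of that cluster's embedded points:
# approximate each cluster's centroid in t-SNE space by the mean of its points
for i in np.unique(labels):
    pts = X_embedded[labels == i]
    plt.scatter(pts[:, 0], pts[:, 1], label=i)
    plt.scatter(pts[:, 0].mean(), pts[:, 1].mean(), marker="x", color="k")
plt.legend()
plt.show()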
Let's take the following data:
import numpy as np
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
Consider the data:
data = load_breast_cancer()
X = data.data
y = data.target
sc = StandardScaler()
I want to make a 3D plot of PCA with 3 components; however, I'm only able to do it for the first two.
My work so far
scaler=StandardScaler()
scaler.fit(X)
X_scaled=scaler.transform(X)
pca=PCA(n_components=3)
pca.fit(X_scaled)
X_pca=pca.transform(X_scaled)
ex_variance=np.var(X_pca,axis=0)
ex_variance_ratio = ex_variance/np.sum(ex_variance)
ex_variance_ratio
import matplotlib.pyplot as plt

Xax=X_pca[:,0]
Yax=X_pca[:,1]
cdict={0:'red',1:'green'}
labl={0:'Malignant',1:'Benign'}
marker={0:'*',1:'o'}
alpha={0:.3, 1:.5}
fig,ax=plt.subplots(figsize=(7,5))
fig.patch.set_facecolor('white')
for l in np.unique(y):
ix=np.where(y==l)
ax.scatter(Xax[ix],Yax[ix],c=cdict[l],s=40,
label=labl[l],marker=marker[l],alpha=alpha[l])
# for loop ends
plt.xlabel("First Principal Component",fontsize=14)
plt.ylabel("Second Principal Component",fontsize=14)
plt.legend()
plt.show()
And in the output we get:
Do you know how to do this for the third principal component? For sure it should be 3D, but I'm not sure how to do it...
It is quite simple actually:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
# %matplotlib notebook
data = load_breast_cancer()
X = data.data
y = data.target
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
pca = PCA(n_components=3)
pca.fit(X_scaled)
X_pca = pca.transform(X_scaled)
ex_variance=np.var(X_pca,axis=0)
ex_variance_ratio = ex_variance/np.sum(ex_variance)
ex_variance_ratio
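# cf. pca.explained_variance_ratio_, which normalizes by the total variance of all features instead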
Xax = X_pca[:,0]
Yax = X_pca[:,1]
Zax = X_pca[:,2]
cdict = {0:'red',1:'green'}
labl = {0:'Malignant',1:'Benign'}
marker = {0:'*',1:'o'}
alpha = {0:.3, 1:.5}
fig = plt.figure(figsize=(7,5))
ax = fig.add_subplot(111, projection='3d')
fig.patch.set_facecolor('white')
for l in np.unique(y):
ix=np.where(y==l)
ax.scatter(Xax[ix], Yax[ix], Zax[ix], c=cdict[l], s=40,
label=labl[l], marker=marker[l], alpha=alpha[l])
# for loop ends
ax.set_xlabel("First Principal Component", fontsize=14)
ax.set_ylabel("Second Principal Component", fontsize=14)
ax.set_zlabel("Third Principal Component", fontsize=14)
ax.legend()
plt.show()
Is this what you are after?
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3) # Number of clusters == 3
kmeans = kmeans.fit(X) # Fitting the input data
labels = kmeans.predict(X) # Getting the cluster labels
centroids = kmeans.cluster_centers_ # Centroid values
print("Centroids are:", centroids) # From sci-kit learn
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111, projection='3d')
x = np.array(labels==0)
y = np.array(labels==1)
z = np.array(labels==2)
ax.scatter(x,y,z, marker="s"[kmeans.labels_], s=40, cmap="RdBu")
I am trying to plot the clusters in 3D, colouring all points by their cluster label and plotting the centroids with a separate symbol. I managed to get the KMeans technique working, at least I believe I did, but I'm stuck trying to plot it in 3D. I believe there may be a simple solution I'm just not seeing. Does anyone have any idea what I need to change in my solution to achieve this?
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_swiss_roll
from mpl_toolkits.mplot3d import Axes3D
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3) # Number of clusters == 3
kmeans = kmeans.fit(X) # Fitting the input data
labels = kmeans.predict(X) # Getting the cluster labels
centroids = kmeans.cluster_centers_ # Centroid values
# print("Centroids are:", centroids) # From sci-kit learn
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')  # fig.gca(projection=...) is removed in newer matplotlib
x = np.array(labels==0)
y = np.array(labels==1)
z = np.array(labels==2)
ax.scatter(centroids[:,0],centroids[:,1],centroids[:,2],c="black",s=150,label="Centers",alpha=1)
ax.scatter(X[x,0],X[x,1],X[x,2],c="blue",s=40,label="C1")
ax.scatter(X[y,0],X[y,1],X[y,2],c="yellow",s=40,label="C2")
ax.scatter(X[z,0],X[z,1],X[z,2],c="red",s=40,label="C3")
Try this; now the centroids are black X markers:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_swiss_roll

X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3) # Number of clusters == 3
kmeans = kmeans.fit(X) # Fitting the input data
labels = kmeans.predict(X) # Getting the cluster labels
centroids = kmeans.cluster_centers_ # Centroid values
print("Centroids are:", centroids) # From sci-kit learn
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111, projection='3d')
x = np.array(labels==0)
y = np.array(labels==1)
z = np.array(labels==2)
ax.scatter(X[x][:, 0], X[x][:, 1], X[x][:, 2], color='red')
ax.scatter(X[y][:, 0], X[y][:, 1], X[y][:, 2], color='blue')
ax.scatter(X[z][:, 0], X[z][:, 1], X[z][:, 2], color='yellow')
ax.scatter(centroids[:, 0], centroids[:, 1], centroids[:, 2],
           marker='x', s=169, linewidths=10,
           color='black', zorder=50)
plt.show()
I have a small script to run in a Jupyter notebook. The KMeans seems to be working correctly, but my centroids are scaled down. How do I get them to display correctly on my plot? My x and y values range from 0 to about 500 on each side.
from pandas import DataFrame
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
import pandas as pd
plt.figure(figsize=(8, 6))
df = pd.read_csv("sales-by-week-4.csv")
df2 = DataFrame(df,columns=["Average Sale Price", "Average Weekly"])
plt.figure(figsize=(8, 6))
kmeans = KMeans(n_clusters=5).fit(scale(df2))
centroids = kmeans.cluster_centers_
print(centroids)
plt.scatter(df2["Average Weekly"], df2["Average Sale Price"], c= kmeans.labels_.astype(float), s=50, alpha=0.5)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
Here's my centroid printout:
[[ 2.65044538 -0.37653707]
[-0.64002758 -0.25885017]
[-0.39559393 5.26965425]
[ 0.91316601 -0.29410492]
[-0.5276885 0.8949181 ]]
You fitted KMeans on your scaled dataframe, so cluster_centers_ is in scaled units. Try fitting on df2 directly.
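A minimal sketch of that fix, reusing df2 from the question; note that df2's first column is "Average Sale Price", so it goes on the y-axis here:
# fit on the unscaled data so the centroids share its units
kmeans = KMeans(n_clusters=5).fit(df2)
centroids = kmeans.cluster_centers_
plt.scatter(df2["Average Weekly"], df2["Average Sale Price"],
            c=kmeans.labels_.astype(float), s=50, alpha=0.5)
plt.scatter(centroids[:, 1], centroids[:, 0], c='red', s=50)
plt.show()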
I am trying to reproduce the example in this post, which produces a figure whose colored decision regions are plotted by mlxtend.plotting (version '0.14.0').
With the default settings on colab, this code
from mlxtend.plotting import plot_decision_regions
plot_decision_regions(X, y, clf=ppn)
produces this figure.
The data points have been plotted while the bottom region has not.
Is it possible to set the color for the bottom region with mlxtend.plotting?
It seems like a bug arising from the classification of only two regions; if you separate three classes, as in the following example, it will work.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.data import iris_data
from mlxtend.plotting import plot_decision_regions
# Initializing Classifiers
clf1 = LogisticRegression(random_state=0)
clf2 = RandomForestClassifier(random_state=0)
clf3 = SVC(random_state=0, probability=True)
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
weights=[2, 1, 1], voting='soft')
# Loading some example data
X, y = iris_data()
X = X[:,[0, 2]]
# Plotting Decision Regions
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10, 8))
labels = ['Logistic Regression',
'Random Forest',
'RBF kernel SVM',
'Ensemble']
for clf, lab, grd in zip([clf1, clf2, clf3, eclf],
labels,
itertools.product([0, 1],
repeat=2)):
clf.fit(X, y)
ax = plt.subplot(gs[grd[0], grd[1]])
fig = plot_decision_regions(X=X, y=y,
clf=clf, legend=2)
plt.title(lab)
plt.show()
Try asking directly on their GitHub repository: https://github.com/rasbt/mlxtend
I think it's possible. You can use the colors parameter instead, which I think is much easier. Try this; is it what you are looking for?
fig = plot_decision_regions(
X=X,
y=y.astype(int),
clf=clf,
legend=2,
colors='yellow,red'
)
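If I remember correctly, colors takes a comma-separated string of matplotlib color names that is matched to the class labels in sorted order, so 'yellow,red' here should map yellow to the first class and red to the second.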