How to build an effective K-means algorithm in Python?

I have written a simple K-means algorithm, but I am having difficulty exploring it cluster by cluster.
Github Link: https://github.com/AkshayBayas/Machine-learning-/blob/master/K-Means%20algorithm.ipynb
Code:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%pylab
Df = pd.read_csv('Kdata.csv')
from sklearn.cluster import KMeans
KModule = KMeans()
K_model = KModule.fit(Df)
K_result = K_model.predict(Df)
centers = K_model.cluster_centers_
K_model.labels_
plt.scatter(x1, x2, c=K_model.labels_, cmap='rainbow')
Can anyone help?

No idea what you mean by "explore cluster by cluster".
If you don't specify the number of clusters, it defaults to 8, so start with a specific value such as 3 as in the code below so you can separate them. You also need to make the cluster label categorical, so it is not colored on a continuous scale:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
Df = pd.read_csv('Kdata.csv')
from sklearn.cluster import KMeans
KModule = KMeans(n_clusters=3)
K_model = KModule.fit(Df)
K_result = K_model.predict(Df)
Df['cluster'] = pd.Categorical(K_model.labels_)
# seaborn: a categorical hue column gets a discrete palette (use palette=, not cmap=)
sns.scatterplot(x='V1', y='V2', data=Df, hue='cluster', palette='rainbow')
# pandas equivalent: color by the category codes through a colormap
Df.plot.scatter('V1', 'V2', c=Df['cluster'].cat.codes, cmap='rainbow')
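To "explore it cluster by cluster", one option (a minimal sketch, assuming the Df with the 'cluster' column assigned above) is to group the rows on that label and inspect each group separately:
# Assumes Df already carries the 'cluster' column assigned above.
for label, group in Df.groupby('cluster', observed=True):
    print(f"Cluster {label}: {len(group)} rows")
    print(group.describe())  # per-cluster summary statistics
# Or pull out a single cluster for closer inspection:
cluster_0 = Df[Df['cluster'] == 0]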

Related

Plot 2 probability density functions on one figure in Seaborn

I'm trying to plot two probability density functions on one figure (so that they overlap).
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
data = [0, 0, 1, 2, 2, 2, 2, 1, 2, 3, 0, 5, 4, 5, 4, 6, 2, 2, 5, 4, 6, 3, 2, 5, 4, 3, 7, -1, 0]
scaler = MinMaxScaler()
df = pd.DataFrame(data, columns=['Numbers'])
X = np.asarray(data).reshape(-1, 1)
standardized_data = scaler.fit_transform(X)
normal_data = np.random.normal(loc=0.0, scale=1.0, size=len(df))
sns.displot(normal_data, kind='kde')
sns.displot(standardized_data, kind='kde')
plt.show()
Calling displot twice creates two separate figures. To overlay both densities on one figure, pass them together in a single displot call:
sns.displot(
    {"normal": normal_data, "standardized": standardized_data.squeeze()},
    kind='kde'
)
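An alternative sketch for keeping both curves on one figure (same data as above) is to use the axes-level kdeplot and draw both onto a shared Axes:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
data = [0, 0, 1, 2, 2, 2, 2, 1, 2, 3, 0, 5, 4, 5, 4, 6, 2, 2, 5, 4, 6, 3, 2, 5, 4, 3, 7, -1, 0]
standardized_data = MinMaxScaler().fit_transform(np.asarray(data).reshape(-1, 1)).squeeze()
normal_data = np.random.normal(loc=0.0, scale=1.0, size=len(data))
fig, ax = plt.subplots()
sns.kdeplot(normal_data, label='normal', ax=ax)             # both KDEs drawn on the same Axes
sns.kdeplot(standardized_data, label='standardized', ax=ax)
ax.legend()
plt.show()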

Create legends in a scatter plot

I create a mask of my dataset to plot only the materials of one type, and when I plot this subset I have a problem with the legend: only the first material gets a label, and I don't know how to add the other 2 materials.
import numpy as np
import umap
import umap.plot
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display,HTML
import cufflinks as cf
cf.set_config_file(sharing='public',theme='ggplot',offline=True)
import seaborn as sns
palpations = np.load('big_matrix_16384.npz',allow_pickle=True)
X = palpations['arr_0']
embedding = umap.UMAP(n_neighbors=50,
                      min_dist=0.2,
                      metric='correlation').fit(X)
emb = embedding.transform(X)
mask_1 = Data["Tipo"]=="Animal"
emb_tipo_1 = emb[mask_1]
cmap = plt.cm.Spectral
c =[sns.color_palette("Set2")[x] for x in data_tipo_1.Material.map({"bone":0, "cartilage":1, "liver_raw_piece1":2})]
plt.scatter(emb_tipo_1[:,0],
emb_tipo_1[:,1],
c=c,
label=np.unique(data_tipo_1.Material),s=10)
plt.gca().set_aspect("equal","datalim")
plt.title("UMAP muestras Animales.")
plt.legend()
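One common way to get a legend entry for every material is to call scatter once per material group, each call with its own label. A minimal sketch, using hypothetical stand-in data in place of emb_tipo_1 / data_tipo_1 from the question:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Hypothetical stand-ins for emb_tipo_1 / data_tipo_1 from the question.
rng = np.random.default_rng(0)
emb_tipo_1 = rng.normal(size=(90, 2))
data_tipo_1 = pd.DataFrame({"Material": np.repeat(["bone", "cartilage", "liver_raw_piece1"], 30)})
palette = dict(zip(["bone", "cartilage", "liver_raw_piece1"], sns.color_palette("Set2")))
fig, ax = plt.subplots()
for material, color in palette.items():
    m = (data_tipo_1["Material"] == material).to_numpy()
    # One scatter call per material produces one legend handle per material.
    ax.scatter(emb_tipo_1[m, 0], emb_tipo_1[m, 1], color=color, s=10, label=material)
ax.set_aspect("equal", "datalim")
ax.set_title("UMAP muestras Animales.")
ax.legend()
plt.show()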

Overlay a line plot around different regions in Seaborn or Matplotlib

I would like to make a line plot and overlay different regions based on a condition, as illustrated below.
May I know how, or, if possible, please kindly point me to the right material for achieving the intended objective.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
np.random.seed(0)
rng = np.random.default_rng(2)
mlist=[]
for _ in range(4):
    m = np.random.rand(4).tolist()
    n = rng.integers(0, 6, size=(1)).tolist()*4
    df = pd.DataFrame(zip(m, n), columns=['yval', 'type'])
    mlist.append(df)
df=pd.concat(mlist).reset_index(drop=True).reset_index()
sns.lineplot(data=df, x="index", y="yval")
plt.show()
Suggestions using Matplotlib, Seaborn, or any other package are welcome.
The filling of the section was achieved using axvspan. I also used text for annotations.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
np.random.seed(0)
rng = np.random.default_rng(2)
mlist=[]
for _ in range(4):
    m = np.random.rand(4).tolist()
    n = rng.integers(0, 6, size=(1)).tolist()*4
    df = pd.DataFrame(zip(m, n), columns=['yval', 'type'])
    mlist.append(df)
df=pd.concat(mlist).reset_index(drop=True).reset_index()
g = sns.lineplot(data=df, x="index", y="yval")
overlay = {0:'m',1:'gray',5:'r'}
for i in np.arange(0, len(df), 4):
    tmp = df.iloc[i:i+4, :]
    v = overlay.get(tmp.type.unique()[0])
    g.axvspan(min(tmp.index), max(tmp.index)+1, color=v, alpha=0.3)
    g.text(((min(tmp.index)+max(tmp.index)+1) / 2)-1, 0.1, 'type {}'.format(tmp.type.unique()[0]), fontsize=12)
plt.show()
Using Matplotlib add_patch and text
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.colors as mcolors
# nn=mcolors.CSS4_COLORS
all_colors=list(mcolors.TABLEAU_COLORS.keys())
b=1
np.random.seed(0)
rng = np.random.default_rng(2)
mlist=[]
for _ in range(4):
    m = np.random.rand(4).tolist()
    n = rng.integers(0, 6, size=(1)).tolist()*4
    df = pd.DataFrame(zip(m, n), columns=['yval', 'type'])
    mlist.append(df)
df=pd.concat(mlist).reset_index(drop=True).reset_index()
# df.to_feather('test.feather')
# df=pd.read_feather('test.feather')
df['C'] = df['type'].diff()
df['C']=df['C'].fillna(10)
nb=df.type[(df['C'] != 0)].to_frame().reset_index()
unique_val=nb['type'].drop_duplicates().sort_values().tolist()
ngroup_type=dict(zip(unique_val,[f'type {idx}' for idx in unique_val]))
nb['ngroup']=nb["type"].map(ngroup_type)
color_group=all_colors[:len(unique_val)]
res = dict(zip(unique_val, color_group))
nb["color"] = nb["type"].map(res)
starting_point=nb["index"].values.tolist()
mcolor=nb["color"].values.tolist()
group_type=nb["ngroup"].values.tolist()
nspace=4
nheight=1
fg=sns.lineplot(data=df, x="index", y="yval")
for ncolor, spoint, gtype in zip(mcolor, starting_point, group_type):
    fg.axes.add_patch(patches.Rectangle((spoint, 0),
                                        nspace, nheight, edgecolor='blue',
                                        facecolor=ncolor, fill=True, alpha=0.1, ls=':'))
    fg.axes.text(spoint+1.5, 0.1, gtype, size=10,
                 va="baseline", ha="left", multialignment="left")
plt.show()

Plot RidgeCV coefficients as a function of the regularization

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import RidgeCV
tips = sns.load_dataset('tips')
X = tips.drop(columns=['tip','sex', 'smoker', 'day', 'time'])
y = tips['tip']
alphas = 10**np.linspace(10,-2,100)*0.5
ridge_clf = RidgeCV(alphas=alphas,scoring='r2').fit(X, y)
ridge_clf.score(X, y)
I wanted to plot the following graph for RidgeCV. I don't see any option to do that as there is for GridSearchCV. I appreciate your suggestions!
There is no indication of what the colors stand for. I assume they stand for the features, and that we investigate the size of each feature's weight as a function of alpha. Here is my solution:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import RidgeCV
tips = sns.load_dataset('tips')
X = tips.drop(columns=['tip','sex', 'smoker', 'day', 'time'])
y = tips['tip']
alphas = 10**np.linspace(10,-2,100)*0.5
w = list()
for a in alphas:
    ridge_clf = RidgeCV(alphas=[a], cv=10).fit(X, y)
    w.append(ridge_clf.coef_)
w = np.array(w)
plt.semilogx(alphas,w)
plt.title('Ridge coefficients as function of the regularization')
plt.xlabel('alpha')
plt.ylabel('weights')
plt.legend(X.keys())
Output: a semilogx plot of the coefficient paths versus alpha. Since you only have two features in X, there are only two lines.
Here is the code for generating the plot that you posted.
First, we need to understand that RidgeCV does not return the coefficients for each alpha value fed into the alphas parameter.
The motivation behind RidgeCV is that it tries the different alpha values given in the alphas parameter and then, based on cross-validation scoring, returns the best alpha along with the fitted model.
Hence, the only way to get the coefficients for each alpha value with cross-validation is to iterate, fitting a RidgeCV for each alpha value.
Example:
# Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
# X is the 10x10 Hilbert matrix
X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
y = np.ones(10)
# #############################################################################
# Compute paths
n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)
coefs = []
for a in alphas:
    ridge = linear_model.RidgeCV(alphas=[a], fit_intercept=False, cv=3)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)
# #############################################################################
# Display results
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('RidgeCV coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
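For contrast, a minimal sketch (reusing the tips data from the first answer) of what a single RidgeCV fit over the whole grid exposes: only the cross-validated best alpha and the coefficients of the model refit at that alpha, not one coefficient vector per alpha.
import numpy as np
import seaborn as sns
from sklearn.linear_model import RidgeCV
tips = sns.load_dataset('tips')
X = tips.drop(columns=['tip', 'sex', 'smoker', 'day', 'time'])
y = tips['tip']
alphas = 10**np.linspace(10, -2, 100)*0.5
ridge_cv = RidgeCV(alphas=alphas, cv=10).fit(X, y)
print(ridge_cv.alpha_)  # best alpha chosen by cross-validation
print(ridge_cv.coef_)   # coefficients of the model refit at that alpha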

Changing the size of the heatmap specifically in a seaborn clustermap?

I'm making a clustered heatmap in seaborn as follows
import numpy as np
import seaborn as sns
np.random.seed(2)
data = np.random.randn(100, 10)
sns.clustermap(data)
but the rows are squished. If I pass a figure size to the clustermap function, it looks terrible.
Is there a way to increase the size of only the heatmap part, so that the row names can be read, without stretching out the cluster (dendrogram) portions?
As @mwaskom commented, I was able to use ax_heatmap.set_position along with the get_position function to achieve the correct result.
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
np.random.seed(2)
data = np.random.randn(100, 10)
cm = sns.clustermap(data)
hm = cm.ax_heatmap.get_position()
plt.setp(cm.ax_heatmap.yaxis.get_majorticklabels(), fontsize=6)
cm.ax_heatmap.set_position([hm.x0, hm.y0, hm.width*0.25, hm.height])
col = cm.ax_col_dendrogram.get_position()
cm.ax_col_dendrogram.set_position([col.x0, col.y0, col.width*0.25, col.height*0.5])
This can also be done by passing a dendrogram_ratio value in the keyword arguments:
import numpy as np
import seaborn as sns
np.random.seed(2)
data = np.random.randn(100, 10)
sns.clustermap(data, figsize=(12, 30), dendrogram_ratio=0.02, cmap='RdBu')
