Density clustering around a separate point - Python
I'm aiming to cluster xy points based on their proximity, grouping points that are positioned close to each other. I also want to cluster the data around a separate reference point.
Note: I have multiple sets of data that need to be clustered independently. In the example below, each unique value in Item signifies a different set of data. The sets can vary considerably in sparsity, so any technique that requires a predetermined number of clusters isn't realistic, as I'd have to manually check the fit and adjust the appropriate parameter every time.
As such, the best method thus far has been some form of density clustering (DBSCAN, OPTICS).
However, while I'm clustering points that are close together, I'm hoping to pass some cut-off to keep the intended cluster spherical. On the other hand, if I reduce the reachable area too much I miss points that are close to both the reference point and the core points; a small gap discards points that I want to include.
The following displays the dilemma. Item 1 shows how the reachable area should be smaller to ensure the points clustered around the reference point stay spherical, while Item 2 shows how the reachable area needs to be larger so that points within the dense area are included.
I'm hoping I can adjust a parameter or include a separate feature rather than force it. Because the dense area around the reference point can vary, I'm reluctant to simply exclude every point outside a fixed radius.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN, OPTICS
fig, ax = plt.subplots(figsize = (6,6))
ax.grid(False)
df = pd.DataFrame({
'Item' : [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2],
'x' : [-4.0,-1.0,0.5,0.0,0.0,2.0,3.0,5.0,10.0,-2.0,2.0,5.0,7.5,15.0,0.0,-22.0,-20.0,-20.0,-6.5,20.5,0.0,20.0,-20.0,-15.0,20.0,-15.0,-10.0,-2.0,0.0,3.0,-3.0,-7.0,-7.5,-9.0,-4.0,1.5,-1.0,-5.0,-4.5,-3.7,15.0,-20.0,-22.0,-20.0,-20.0,-12.0,20.5,6.0,20.0,-20.0,-15.0,20.0,-15.0,-10.0],
'y' : [0.0,1.0,-0.5,0.5,-0.5,0.0,1.0,0.0,0.0,-2.0,-2.0,-7.0,-0.5,-10.5,-7.5,0.0,16.0,-15.0,5.0,13.5,3.0,-20.0,2.0,-17.5,-15,19.0,20.0,4.0,-2.0,0.0,0.0,2.5,2.0,-1.5,5.0,0.0,3.5,2.0,-5.5,-6.5,-10.5,-20.5,0.0,16.0,-15.0,5.0,13.5,6.0,-20.0,2.0,-17.5,-15,19.0,20.0],
'X_Ref' : [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0],
'Y_Ref' : [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0],
})
# not spherical
df = df[df['Item'] == 1]
# spherical but reachable area too small
#df = df[df['Item'] == 2]
df['distance'] = np.sqrt((df['X_Ref'] - df['x'])**2 + (df['Y_Ref'] - df['y'])**2)
Y_sklearn = df[['x','y']].values
ax.scatter(df['x'], df['y'], marker = 'o', s = 5)
ax.scatter(df['X_Ref'], df['Y_Ref'], c = 'w', edgecolor = 'k', marker = 'o', s = 7.5, zorder = 2)
#clusterer = DBSCAN(eps = 7.5, min_samples = 3)
#labels_clusters = clusterer.fit_predict(Y_sklearn)
clusterer = OPTICS(min_samples = 2, xi = 0.25, min_cluster_size = 0.25, max_eps = 5)
labels_clusters = clusterer.fit_predict(Y_sklearn)
# Add cluster labels as a new column to the original DataFrame.
df['cluster'] = labels_clusters
df['cluster'] = df['cluster'].astype('category')
sns.scatterplot(data = df,
x = 'x',
y = 'y',
hue = 'cluster',
ax = ax,
legend = 'full',
)
Item 1: points to the right of the radius should be excluded from the core points
Item 2: points within the radius should be included in the core points
I believe we could reformulate the problem; I am not sure the clustering approach is the best one here.
By clustering using distance
Instead of clustering on the (x, y) coordinates directly, we can cluster each point on a single feature: its distance to the reference point. With two clusters, the group nearest the reference is spherical around it by construction.
""""
https://stackoverflow.com/questions/66099958/density-clustering-around-a-separate-point-python
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
# not spherical
df = pd.DataFrame({
'x' : [-4.0,-1.0,0.5,0.0,0.0,2.0,3.0,5.0,12.0,-2.0,2.0,8.0,8.5,15.0,-20.0,-22.0,-20.0,-20.0,-10.0,20.5,0.0,20.0,-20.0,-15.0,20.0,-15.0,-10.0],
'y' : [0.0,1.0,-0.5,0.5,-0.5,0.0,1.0,0.0,0.0,-2.0,-2.0,-8.0,-0.5,-10.5,-20.5,0.0,16.0,-15.0,5.0,13.5,3.0,-20.0,2.0,-17.5,-15,19.0,20.0],
'X_Ref' : [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],
'Y_Ref' : [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],
})
# spherical but reachable area too small
df1 = pd.DataFrame({
'x' : [-2.0,0.0,2.0,-3.0,-7.0,-7.5,-9.0,-4.0,1.5,-1.0,-5.0,-4.5,-3.7,15.0,-20.0,-22.0,-20.0,-20.0,-15.0,20.5,8.0,20.0,-20.0,-15.0,20.0,-15.0,-10.0],
'y' : [4.0,-2.0,0.0,0.0,2.5,2.0,-2.0,5.0,0.0,3.5,2.0,-5.5,-6.5,-10.5,-20.5,0.0,16.0,-15.0,5.0,13.5,5.0,-20.0,2.0,-17.5,-15,19.0,20.0],
'X_Ref' : [-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0],
'Y_Ref' : [-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0],
})
# Distance of every point to its reference point
def distance_func(df):
    return np.sqrt((df['X_Ref'] - df['x']) ** 2 + (df['Y_Ref'] - df['y']) ** 2)

df['distance'] = distance_func(df)
df1['distance'] = distance_func(df1)

# Toggle this line to switch between the two datasets for the graphs
df = df1.copy()
Y_sklearn = df['distance'].values.reshape(-1, 1)
fig, ax = plt.subplots(figsize = (6,6))
ax.grid(False)
ax.scatter(df['x'], df['y'], marker = 'o', s = 5)
ax.scatter(df['X_Ref'], df['Y_Ref'], c = 'w', edgecolor = 'k', marker = 'o', s = 7.5, zorder = 2)
clusterer = KMeans(init='k-means++', n_clusters=2, n_init=10)
labels_clusters = clusterer.fit_predict(Y_sklearn)
# Add cluster labels as a new column to the original DataFrame.
df['cluster'] = labels_clusters
df['cluster'] = df['cluster'].astype('category')
sns.scatterplot(data = df,
x = 'x',
y = 'y',
hue = 'cluster',
ax = ax,
legend = 'full',
)
For df:
For df1:
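One caveat worth noting: KMeans assigns the 0/1 cluster labels arbitrarily between runs, so if you need to pick out the group around the reference point programmatically rather than by eyeballing the plot, you can look up the label of the closest point. A minimal sketch (my addition, assuming df and labels_clusters as above):

# The cluster containing the point with the smallest distance to the
# reference is the "near" group; its numeric label is arbitrary.
near_label = df.loc[df['distance'].idxmin(), 'cluster']
near_points = df[df['cluster'] == near_label]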
By using marginal increase of area
As mentioned earlier, I believe the problem could be reformulated using the idea of marginal area. Each point we add increases the covered area by a different amount: adding point i extends the covered disc from radius d_{i-1} to d_i, so the marginal area is π(d_i² - d_{i-1}²).
In other words, use the elbow method over the points themselves.
Since π is a constant factor, I will simply proxy area by the distance squared.
Code:
""""
https://stackoverflow.com/questions/66099958/density-clustering-around-a-separate-point-python
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# not spherical
df = pd.DataFrame({
'x' : [-4.0,-1.0,0.5,0.0,0.0,2.0,3.0,5.0,12.0,-2.0,2.0,8.0,8.5,15.0,-20.0,-22.0,-20.0,-20.0,-10.0,20.5,0.0,20.0,-20.0,-15.0,20.0,-15.0,-10.0],
'y' : [0.0,1.0,-0.5,0.5,-0.5,0.0,1.0,0.0,0.0,-2.0,-2.0,-8.0,-0.5,-10.5,-20.5,0.0,16.0,-15.0,5.0,13.5,3.0,-20.0,2.0,-17.5,-15,19.0,20.0],
'X_Ref' : [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],
'Y_Ref' : [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],
})
# spherical but reachable area too small
df1 = pd.DataFrame({
'x' : [-2.0,0.0,2.0,-3.0,-7.0,-7.5,-9.0,-4.0,1.5,-1.0,-5.0,-4.5,-3.7,15.0,-20.0,-22.0,-20.0,-20.0,-15.0,20.5,8.0,20.0,-20.0,-15.0,20.0,-15.0,-10.0],
'y' : [4.0,-2.0,0.0,0.0,2.5,2.0,-2.0,5.0,0.0,3.5,2.0,-5.5,-6.5,-10.5,-20.5,0.0,16.0,-15.0,5.0,13.5,5.0,-20.0,2.0,-17.5,-15,19.0,20.0],
'X_Ref' : [-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0],
'Y_Ref' : [-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0],
})
def distance_func(df):
    return np.sqrt((df['X_Ref'] - df['x']) ** 2 + (df['Y_Ref'] - df['y']) ** 2)

df['distance'] = distance_func(df)
df1['distance'] = distance_func(df1)

# To switch from one dataset to the other:
#df = df1.copy()
df['distance_2'] = df['distance']**2
df.sort_values('distance', inplace=True)

# Scree plot of the sorted squared distances
aux = pd.DataFrame(df['distance_2'].values, columns=['distance ** 2'])
aux.plot()
fig, ax = plt.subplots(figsize = (6,6))
ax.grid(False)
ax.scatter(df['x'], df['y'], marker = 'o', s = 5)
ax.scatter(df['X_Ref'], df['Y_Ref'], c = 'w', edgecolor = 'k', marker = 'o', s = 7.5, zorder = 2)
# Label the first `selected_top` points (sorted by distance) as the cluster
selected_top = 10
labels_clusters = np.zeros(df.shape[0])
labels_clusters[0:selected_top] = 1
# Add cluster labels as a new column to the original DataFrame.
df['cluster'] = labels_clusters
df['cluster'] = df['cluster'].astype('category')
sns.scatterplot(data = df,
x = 'x',
y = 'y',
hue = 'cluster',
ax = ax,
legend = 'full',
)
For df:
Scree plot:
From the scree plot you can see where adding further points becomes too costly. I would say selecting 10 points could be good; the selection is based on the elbow method.
Final plot:
For df1:
Scree plot:
Following the elbow method criterion, 13 points could be optimal.
Final plot:
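Rather than reading the cut-off from the scree plot by hand, the largest marginal jump can serve as a rough automatic elbow. A minimal sketch (my addition, assuming df is already sorted by distance and has the distance_2 column as above):

# Rough automatic elbow: the largest jump in the sorted squared distances
# separates the dense group around the reference from the remaining points.
jumps = np.diff(df['distance_2'].values)
selected_top = int(np.argmax(jumps)) + 1  # points before the largest jump
labels_clusters = np.zeros(df.shape[0])
labels_clusters[0:selected_top] = 1

For noisy data this heuristic may fire on an early outlier, so smoothing the distances or requiring a minimum jump size first may be safer.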