Stacked density plots with pandas and seaborn - python

I am trying to obtain the following plot from a pandas data frame.
I am not sure how to combine seaborn with pandas for that task.
This is the dataframe I want to use:
import pandas as pd
data = pd.DataFrame({'a': np.random.randn(1000) + 1,
'b': np.random.randn(1000),
'c': np.random.rand(1000) + 10},
columns=['a', 'b', 'c'])
data.a[data.a.sample(100).index] = np.NaN
data.b[data.b.sample(800).index] = np.NaN
Notice that the frequency will need to be normalized (height of the histogram), as the number of data points and distributions differ significantly and the distributions will have different 'y scales'.
data.plot.hist();
This is the code of seaborn that generates the figure I used in the beginning.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
# Create the data
rs = np.random.RandomState(1979)
x = rs.randn(150)
g = np.tile(list("ABC"), 50)
df = pd.DataFrame(dict(x=x, g=g))
m = df.g.map(ord)
# Initialize the FacetGrid object
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(df, row="g", hue="g", aspect=5, height=1, palette=pal)
# Draw the densities in a few steps
g.map(sns.kdeplot, "x", clip_on=False, shade=True, alpha=1, lw=1.5, bw=.2)
g.map(sns.kdeplot, "x", clip_on=False, color="w", lw=2, bw=.2)
g.map(plt.axhline, y=0, lw=2, clip_on=False)
# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
ax = plt.gca()
ax.text(0, .3, label, fontweight="bold", color=color,
ha="left", va="center", transform=ax.transAxes)
g.map(label, "x")
# Set the subplots to overlap
g.fig.subplots_adjust(hspace=-.0025)
# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)

Here is a function to create a grid of kde plots ("joyplot") with one plot per dataframe column.
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
def joyplot_from_dataframe(data, cmap=None):
mi, ma = np.nanmin(data.values), np.nanmax(data.values)
minx = mi - (ma-mi)/5
maxx = ma + (ma-mi)/5
x = np.linspace(minx,maxx, 1000)
n = len(data.columns)
if not cmap:
cmap = plt.cm.get_cmap("Blues")
colors = cmap(np.linspace(.2,1,n))
fig, axes = plt.subplots(nrows = n, sharex=True)
for c, ax, color in zip(data.columns, axes, colors):
y = data[c].values
y = y[~np.isnan(y)]
kde = gaussian_kde(y)
ax.fill_between(x, kde(x), color=color)
ax.yaxis.set_visible(False)
for spine in ["left", "right", "top"]:
ax.spines[spine].set_visible(False)
ax.spines["bottom"].set_linewidth(2)
ax.spines["bottom"].set_color(color)
ax.margins(y=0)
ax.tick_params(bottom=False)
return fig, axes
Use it as
import pandas as pd
data = pd.DataFrame({'a': np.random.randn(1000) + 1,
'b': np.random.randn(1000),
'c': np.random.rand(1000) + 10},
columns=['a', 'b', 'c'])
data.a[data.a.sample(100).index] = np.NaN
data.b[data.b.sample(800).index] = np.NaN
joyplot_from_dataframe(data)
plt.show()

Related

Normalizing height / mode of kdeplot to be 1

I am using the FacetGrid example from seaborn [Overlapping densities (‘ridge plot’)]. However, instead of normalizing the integral of the kdeplot, I want to normalize the heights. Does anyone have an idea, how to realize it?
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
# Create the data
rs = np.random.RandomState(1979)
x = rs.randn(500)
g = np.tile(list("ABCDEFGHIJ"), 50)
df = pd.DataFrame(dict(x=x, g=g))
m = df.g.map(ord)
df["x"] += m
# Initialize the FacetGrid object
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(df, row="g", hue="g", aspect=15, height=.5, palette=pal)
# Draw the densities in a few steps
g.map(sns.kdeplot, "x",
bw_adjust=.5, clip_on=False,
fill=True, alpha=1, linewidth=1.5)
g.map(sns.kdeplot, "x", clip_on=False, color="w", lw=2, bw_adjust=.5)
# passing color=None to refline() uses the hue mapping
g.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False)
# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
ax = plt.gca()
ax.text(0, .2, label, fontweight="bold", color=color,
ha="left", va="center", transform=ax.transAxes)
g.map(label, "x")
# Set the subplots to overlap
g.figure.subplots_adjust(hspace=-.25)
# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[], ylabel="")
g.despine(bottom=True, left=True)
So far, I have done some search engine requests where I tried to find something comparable that has been performed for histplot from matplotlib. However, I have found only solutions for the normalization of the integral.
For just one kdeplot -
A method normalize() to normalize the values -
def normalize(arr, t_min, t_max):
norm_arr = []
diff = t_max - t_min
diff_arr = max(arr) - min(arr)
for i in arr:
temp = (((i - min(arr))*diff)/diff_arr) + t_min
norm_arr.append(temp)
return norm_arr
If fill=False
tips = sns.load_dataset("tips")
ax = sns.kdeplot(data=tips, x="total_bill")
line = ax.lines[0]
line.set_ydata(normalize(line.get_ydata(),0,1))
ax.set_ylim(0,1.05)
ax.autoscale_view()
If fill=True
tips = sns.load_dataset("tips")
ax = sns.kdeplot(data=tips, x="total_bill",fill=True)
path = ax.collections[0].get_paths()
ys = normalize(path[0].vertices[:, 1],0,1)
path[0].vertices[:, 1] = ys
ax.set_ylim(0,1.05)
ax.autoscale_view()
Now if you want to use a FacetGrid then, probably all your problems can be solved just by using sharey=True like -
g = sns.FacetGrid(df, row="g", hue="g", aspect=15, height=.5, palette=pal, sharey=True)
But still if you need to normalize then-
define a wrapper function -
def kdeplot(data, **kwargs):
ax = sns.kdeplot(data, **kwargs)
if 'fill' in kwargs.keys() and kwargs['fill']==True:
path = ax.collections[0].get_paths()
ys = normalize(path[0].vertices[:, 1],0,1)
path[0].vertices[:, 1] = ys
else:
line = ax.lines[0]
line.set_ydata(normalize(line.get_ydata(),0,1))
ax.set_ylim(0,1.05)
ax.autoscale_view()
then -
tips = sns.load_dataset("tips")
ax = kdeplot(data=tips, x="total_bill",fill=True)
ax = kdeplot(data=tips, x="total_bill",fill=False, lw=4)
Now you can just use kdeplot instead of sns.kdeplot -
g.map(kdeplot, "x",bw_adjust=.5, clip_on=False,
fill=True, alpha=1, linewidth=1.5)
g.map(kdeplot, "x", clip_on=False, color="w", lw=2, bw_adjust=.5)

how to add hatches to cells in seaborn.heatmap

I tried to visualize my data with seaborn.heatmap.
However, the problem I have is that when I print it out in grayscle, the image is hard to read.
I follow many similar questions but it didn't work.
Is there anyway to add hatches over the cells in seaborn.heatmap?
My code is as below:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
df = pd.read_csv("file.csv")
sns.heatmap(df, annot=False, fmt='.0f', square=True,
cmap="coolwarm", linewidths=1, cbar=False)
plt.show()
You could create a loop, dividing the values into e.g. 4 groups and assign a hatch pattern to each of them via pcolor applied to the subset.
Here is an example starting from random test data:
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
column_names = [f'{c:.2f}' for c in np.arange(0, 1.5001, 0.05)]
row_names = ['Alkaid', 'Mizar', 'Alioth', 'Megrez', 'Phecda', 'Merak', 'Dubhe']
df = pd.DataFrame(np.random.normal(0.3, 1, (len(row_names), len(column_names))).cumsum(axis=1) + 5,
columns=column_names, index=row_names)
values = df.values
vmin = values.min()
vmax = values.max()
patterns = ['', 'oo', '////', 'XXX']
bounds = np.linspace(vmin, vmax, len(patterns) + 1)
bounds[-1] += 1
sns.set_style('white')
fig, ax = plt.subplots(figsize=(12, 5))
sns.heatmap(data=df, linewidths=1, square=True, cmap='coolwarm', linecolor='white', cbar=False, ax=ax)
x = np.arange(df.shape[1] + 1)
y = np.arange(df.shape[0] + 1)
handles = []
norm = plt.Normalize(vmin, vmax)
cmap = plt.get_cmap('coolwarm')
for pattern, b0, b1 in zip(patterns, bounds[:-1], bounds[1:]):
ax.pcolor(x, y, np.where((values >= b0) & (values < b1), values, np.nan), cmap=cmap, norm=norm,
hatch=pattern, ec='black', lw=1)
handles.append(plt.Rectangle((0, 0), 0, 0, color=cmap(norm((b0 + b1) / 2)), ec='black',
hatch=pattern, label=f'{b0:5.2f}-{b1:5.2f}'))
ax.hlines(y, 0, x.max(), color='w', lw=2)
ax.vlines(x, 0, y.max(), color='w', lw=2)
ax.legend(handles=handles, bbox_to_anchor=(1.01, 1.02), loc='upper left',
handlelength=2, handleheight=2, frameon=False)
plt.tight_layout()
plt.show()

Scatterplot legend not including all data [duplicate]

Starting from the following example:
fig, ax = plt.subplots()
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['a','b','c','d']})
for label in df['l']:
df.plot('n1','n2', kind='scatter', ax=ax, s=50, linewidth=0.1, label=label)
what I obtained is the following scatterplot:
I'm now trying to set a different color for each of the four points. I know that I can loop over a set of, for instance, 4 colors in a list like:
colorlist = ['b','r','c','y']
but since my real dataset comprise at least 20 different points, I was looking for a sort of "color generator" to loop within it.
The following method will create a list of colors as long as your dataframe, and then plot a point with a label with each color:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
fig, ax = plt.subplots()
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['a','b','c','d']})
colormap = cm.viridis
colorlist = [colors.rgb2hex(colormap(i)) for i in np.linspace(0, 0.9, len(df['l']))]
for i,c in enumerate(colorlist):
x = df['n1'][i]
y = df['n2'][i]
l = df['l'][i]
ax.scatter(x, y, label=l, s=50, linewidth=0.1, c=c)
ax.legend()
plt.show()
IIUC you can do it this way:
import matplotlib.pyplot as plt
from matplotlib import colors
import pandas as pd
colorlist = list(colors.ColorConverter.colors.keys())
fig, ax = plt.subplots()
[df.iloc[[i]].plot.scatter('n1', 'n2', ax=ax, s=50, label=l,
color=colorlist[i % len(colorlist)])
for i,l in enumerate(df.l)]
colorlist:
In [223]: colorlist
Out[223]: ['m', 'b', 'g', 'r', 'k', 'y', 'c', 'w']
PS colorlist[i % len(colorlist)] - should always remain in the list bounds
How about this,
Here is the source code,
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib import cm
fig, ax = plt.subplots()
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['a','b','c','d']})
#colors = ['b','r','c','y']
nrof_labels = len(df['l'])
colors = cm.rainbow(np.linspace(0, 1, nrof_labels)) # create a bunch of colors
for i, r in df.iterrows():
ax.plot(r['n1'], r['n2'], 'o', markersize=10, color=colors[i], linewidth=0.1, label=r['l'])
ax.set_xlim(0.5, 3.5)
ax.set_ylim(0.5, 3.5)
plt.legend(loc='best')
plt.show()
Additionally, if df[l] has repeated elements and if the colors have to be assigned accordingly:
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
fig, ax = plt.subplots(figsize=(8,8))
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['b','b','c','d']})
l_unq = df['l'].unique()
colormap = cm.viridis
colorlist = [colors.rgb2hex(colormap(i)) for i in np.linspace(0, 0.9, len(l_unq))]
for i,c in enumerate(colorlist):
x = df[df.l==l_unq[i]].n1
y = df[df.l==l_unq[i]].n2
l = l_unq[i]
ax.scatter(x, y, label=l, s=50, linewidth=0.1, c=c)
ax.set_xlabel('n1')
ax.set_ylabel('n2')
ax.legend()
plt.show()

pandas - scatter plot with different color legend for each point

Starting from the following example:
fig, ax = plt.subplots()
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['a','b','c','d']})
for label in df['l']:
df.plot('n1','n2', kind='scatter', ax=ax, s=50, linewidth=0.1, label=label)
what I obtained is the following scatterplot:
I'm now trying to set a different color for each of the four points. I know that I can loop over a set of, for instance, 4 colors in a list like:
colorlist = ['b','r','c','y']
but since my real dataset comprise at least 20 different points, I was looking for a sort of "color generator" to loop within it.
The following method will create a list of colors as long as your dataframe, and then plot a point with a label with each color:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
fig, ax = plt.subplots()
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['a','b','c','d']})
colormap = cm.viridis
colorlist = [colors.rgb2hex(colormap(i)) for i in np.linspace(0, 0.9, len(df['l']))]
for i,c in enumerate(colorlist):
x = df['n1'][i]
y = df['n2'][i]
l = df['l'][i]
ax.scatter(x, y, label=l, s=50, linewidth=0.1, c=c)
ax.legend()
plt.show()
IIUC you can do it this way:
import matplotlib.pyplot as plt
from matplotlib import colors
import pandas as pd
colorlist = list(colors.ColorConverter.colors.keys())
fig, ax = plt.subplots()
[df.iloc[[i]].plot.scatter('n1', 'n2', ax=ax, s=50, label=l,
color=colorlist[i % len(colorlist)])
for i,l in enumerate(df.l)]
colorlist:
In [223]: colorlist
Out[223]: ['m', 'b', 'g', 'r', 'k', 'y', 'c', 'w']
PS colorlist[i % len(colorlist)] - should always remain in the list bounds
How about this,
Here is the source code,
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib import cm
fig, ax = plt.subplots()
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['a','b','c','d']})
#colors = ['b','r','c','y']
nrof_labels = len(df['l'])
colors = cm.rainbow(np.linspace(0, 1, nrof_labels)) # create a bunch of colors
for i, r in df.iterrows():
ax.plot(r['n1'], r['n2'], 'o', markersize=10, color=colors[i], linewidth=0.1, label=r['l'])
ax.set_xlim(0.5, 3.5)
ax.set_ylim(0.5, 3.5)
plt.legend(loc='best')
plt.show()
Additionally, if df[l] has repeated elements and if the colors have to be assigned accordingly:
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
fig, ax = plt.subplots(figsize=(8,8))
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['b','b','c','d']})
l_unq = df['l'].unique()
colormap = cm.viridis
colorlist = [colors.rgb2hex(colormap(i)) for i in np.linspace(0, 0.9, len(l_unq))]
for i,c in enumerate(colorlist):
x = df[df.l==l_unq[i]].n1
y = df[df.l==l_unq[i]].n2
l = l_unq[i]
ax.scatter(x, y, label=l, s=50, linewidth=0.1, c=c)
ax.set_xlabel('n1')
ax.set_ylabel('n2')
ax.legend()
plt.show()

frequency trail in matplotlib

I'm looking into outliers detection. Brendan Gregg has a really nice article and I'm especially intrigued by his visualizations. One of the methods he uses are frequency trails.
I'm trying to reproduce this in matplotlib using this example. Which looks like this:
And the plot is based on this answer: https://stackoverflow.com/a/4152016/948369
Now my issue is, like described by Brendan, that I have a continuous line that masks the outlier (I simplified the input values so you can still see them):
Any help on making the line "non-continuous" for non existent values?
Seaborn also provides a very neat example:
They call it a joy/ridge plot however: https://seaborn.pydata.org/examples/kde_ridgeplot.html
#!/usr/bin/python
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
# Create the data
rs = np.random.RandomState(1979)
x = rs.randn(500)
g = np.tile(list("ABCDEFGHIJ"), 50)
df = pd.DataFrame(dict(x=x, g=g))
m = df.g.map(ord)
df["x"] += m
# Initialize the FacetGrid object
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(df, row="g", hue="g", aspect=15, size=.5, palette=pal)
# Draw the densities in a few steps
g.map(sns.kdeplot, "x", clip_on=False, shade=True, alpha=1, lw=1.5, bw=.2)
g.map(sns.kdeplot, "x", clip_on=False, color="w", lw=2, bw=.2)
g.map(plt.axhline, y=0, lw=2, clip_on=False)
# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
ax = plt.gca()
ax.text(0, .2, label, fontweight="bold", color=color,
ha="left", va="center", transform=ax.transAxes)
g.map(label, "x")
# Set the subplots to overlap
g.fig.subplots_adjust(hspace=-.25)
# Remove axes details that don't play will with overlap
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)
I would stick with a flat 2D plot and displace each level by a set vertical amount. You'll have to play the the levels (in the code below I called it displace) to properly see the outliers, but this does a pretty good job at replicating your target image. The key, I think, is to set the "zero" values to None so pylab does not draw them.
import numpy as np
import pylab as plt
import itertools
k = 20
X = np.linspace(0, 20, 500)
Y = np.zeros((k,X.size))
# Add some fake data
MU = np.random.random(k)
for n in xrange(k):
Y[n] += np.exp(-(X-MU[n]*n)**2 / (1+n/3))
Y *= 50
# Add some outliers for show
Y += 2*np.random.random(Y.shape)
displace = Y.max()/4
# Add a cutoff
Y[Y<1.0] = None
face_colors = itertools.cycle(["#D3D820", "#C9CC54",
"#D7DA66", "#FDFE42"])
fig = plt.figure()
ax = fig.add_subplot(111, axisbg='black')
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
for n,y in enumerate(Y):
# Vertically displace each plot
y0 = np.ones(y.shape) * n * displace
y1 = y + n*displace
plt.fill_between(X, y0,y1,lw=1,
facecolor=face_colors.next(),
zorder=len(Y)-n)
plt.show()

Categories

Resources