So far I have been unable to find an R library that can create a sunburst plot like those by John Stasko. Does anyone know how to accomplish that in R or Python?
Python version of sunburst diagram using matplotlib bars in polar projection:
import numpy as np
import matplotlib.pyplot as plt
def sunburst(nodes, total=np.pi * 2, offset=0, level=0, ax=None):
    """Draw a sunburst diagram with matplotlib bars on a polar axis.

    :param nodes: list of ``(label, value, subnodes)`` tuples, where
        ``subnodes`` is a (possibly empty) list of the same shape.
    :param total: value that corresponds to the full circle at this level.
    :param offset: start of this group of nodes, in the same units as the
        node values.
    :param level: ring index; 0 is the centre of the diagram.
    :param ax: polar axes to draw on; a new polar subplot is created when
        it is ``None``.
    :return: ``None``; everything is drawn onto ``ax``.
    """
    if ax is None:
        ax = plt.subplot(111, projection='polar')

    if level == 0 and len(nodes) == 1:
        # A single root node becomes the full disc in the middle; its value
        # defines what a complete turn means for all descendants.
        label, value, subnodes = nodes[0]
        ax.bar([0], [0.5], [np.pi * 2])
        ax.text(0, 0, label, ha='center', va='center')
        sunburst(subnodes, total=value, level=level + 1, ax=ax)
    elif nodes:
        d = np.pi * 2 / total  # radians per unit of value
        labels = []
        widths = []
        local_offset = offset
        for label, value, subnodes in nodes:
            labels.append(label)
            widths.append(value * d)
            # Children start where their parent starts, one ring further out.
            sunburst(subnodes, total=total, offset=local_offset,
                     level=level + 1, ax=ax)
            local_offset += value
        # Left edge of every bar: running sum of the preceding widths.
        values = np.cumsum([offset * d] + widths[:-1])
        heights = [1] * len(nodes)
        bottoms = np.zeros(len(nodes)) + level - 0.5
        rects = ax.bar(values, heights, widths, bottoms, linewidth=1,
                       edgecolor='white', align='edge')
        for rect, label in zip(rects, labels):
            x = rect.get_x() + rect.get_width() / 2
            y = rect.get_y() + rect.get_height() / 2
            rotation = (90 + (360 - np.degrees(x) % 180)) % 360
            ax.text(x, y, label, rotation=rotation, ha='center', va='center')

    if level == 0:
        # Only the outermost call configures the axes.
        ax.set_theta_direction(-1)
        ax.set_theta_zero_location('N')
        ax.set_axis_off()
Example, how this function can be used:
# Example hierarchy for sunburst(): every node is a (label, value, children)
# tuple, and children is a list of nodes of the same shape (empty for leaves).
# The single top-level node '/' is the root drawn in the centre.
data = [
('/', 100, [
('home', 70, [
('Images', 40, []),
('Videos', 20, []),
('Documents', 5, []),
]),
('usr', 15, [
('src', 6, [
('linux-headers', 4, []),
('virtualbox', 1, []),
]),
('lib', 4, []),
('share', 2, []),
('bin', 1, []),
('local', 1, []),
('include', 1, []),
]),
]),
]
# Draw the diagram on a freshly created polar subplot (no ax is passed).
sunburst(data)
You can even build an interactive version quite easily with R now:
# devtools::install_github("timelyportfolio/sunburstR")
library(sunburstR)

# Read the sample visit-sequences.csv data shipped with the package.
# https://gist.github.com/kerryrodden/7090426#file-visit-sequences-csv
sequences <- read.csv(
  system.file("examples/visit-sequences.csv", package = "sunburstR"),
  header = FALSE, # spelled out: `F` is an ordinary, reassignable variable
  stringsAsFactors = FALSE
)

# Render the interactive sunburst widget.
sunburst(sequences)
...and when you move your mouse above it, the magic happens:
Edit
The official site of this package can be found here (with many examples!): https://github.com/timelyportfolio/sunburstR
Hat tip to @timelyportfolio, who created this impressive piece of code!
You can create something along the lines of a sunburst plot using geom_tile from the ggplot2 package. Let's first create some random data:
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a confusing message.
library(ggplot2)
theme_set(theme_bw())
library(plyr)

# 10 x 10 grid of cells, each assigned a random category A, B or C.
dat <- data.frame(
  expand.grid(x = 1:10, y = 1:10),
  z = sample(LETTERS[1:3], size = 100, replace = TRUE)
)
And then create the raster plot. Here, the x axis in the plot is coupled to the x variable in dat, the y axis to the y variable, and the fill of the pixels to the z variable. This yields the following plot:
# One tile per (x, y) cell, coloured by the category in z.
p <- ggplot(dat, aes(x = x, y = y, fill = z)) +
  geom_tile()
print(p)
The ggplot2 package supports all kinds of coordinate transformations, one of which takes one axis and projects it on a circle, i.e. polar coordinates:
# Re-draw the same tile plot with one axis wrapped around a circle.
p + coord_polar()
This roughly does what you need, now you can tweak dat to get the desired result.
There's a package called ggsunburst. Sadly, it is not on CRAN, but you can install it by following the instructions on the website: http://genome.crg.es/~didac/ggsunburst/ggsunburst.html.
I hope this helps people who are still looking for a good package like this.
Regards,
Here's a ggplot2 sunburst with two layers.
The basic idea is to just make a different bar for each layer, and make the bars wider for the outer layers. I also messed with the x-axis to make sure there's no hole in the middle of the inner pie chart. You can thus control the look of the sunburst by changing the width and x-axis values.
library(ggplot2)

# Toy two-level hierarchy: level2 entries are children of their level1 group,
# and value is the share of the circle each child occupies.
df <- data.frame(
  level1 = c("a", "a", "a", "a", "b", "b", "c", "c", "c"),
  level2 = c("a1", "a2", "a3", "a4", "b1", "b2", "c1", "c2", "c3"),
  value = c(.025, .05, .027, .005, .012, .014, .1, .03, .18)
)

# Sunburst: one stacked bar per layer, bent into rings by coord_polar().
# The level1 bar sits at x = 0 and the narrower level2 bar at x = .25.
ggplot(df, aes(y = value)) +
  geom_col(aes(fill = level1, x = 0), width = .5) +
  geom_col(aes(fill = level2, x = .25), width = .25) +
  coord_polar(theta = "y")
The only disadvantage this has compared to sunburst-specific software is that it assumes you want the outer layers to be collectively exhaustive (i.e. no gaps). "Partially exhaustive" outer layers (like in some of the other examples) are surely possible but more complicated.
For completeness, here it is cleaned up with nicer formatting and labels:
library(data.table)

# Midpoint of every outer segment along the stacked axis; used below as the
# position of the level2 axis labels.
df <- data.table(df)
df[, cumulative := cumsum(value) - (value / 2)]

# Store labels for the inner circle: one row per level1 group with its total
# and the midpoint of its arc.
inner_df <- df[, c('level1', 'value'), with = FALSE]
inner_df[, level1_value := sum(value), by = 'level1']
inner_df <- unique(inner_df[, c('level1', 'level1_value'), with = FALSE])
inner_df[, cumulative := cumsum(level1_value)]
inner_df[, prev := shift(cumulative)]
# The first group has no predecessor, so its midpoint is half its own size.
inner_df[is.na(prev), position := (level1_value / 2)]
inner_df[!is.na(prev), position := (level1_value / 2) + prev]

# One colour per level1 and level2 label, matched by name.
colors <- c('#6a3d9a', '#1F78B4', '#33A02C', '#3F146D', '#56238D', '#855CB1', '#AD8CD0', '#08619A', '#3F8DC0', '#076302', '#1B8416', '#50B74B')
colorNames <- c(unique(as.character(df$level1)), unique(as.character(df$level2)))
names(colors) <- colorNames

ggplot(df, aes(y = value, x = '')) +
  geom_bar(aes(fill = level2, x = .25), width = .25, stat = 'identity') +
  geom_bar(aes(fill = level1, x = 0), width = .5, stat = 'identity') +
  geom_text(data = inner_df, aes(label = level1, x = .05, y = position)) +
  coord_polar(theta = 'y') +
  scale_fill_manual('', values = colors) +
  theme_minimal() +
  guides(fill = guide_legend(ncol = 1)) +
  labs(title = '') +
  scale_x_continuous(breaks = NULL) +
  # Label the outer ring at the midpoint of every level2 segment.
  scale_y_continuous(breaks = df$cumulative, labels = df$level2) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.border = element_blank(),
    panel.grid = element_blank()
  )
There are only a couple of libraries that I know of that do this natively:
The Javascript Infovis Toolkit (jit) (example).
D3.js
OCaml's Simple Plot Tool (SPT).
None of these is written in Python or R, but getting a Python/R script to write out a simple JSON file that can be loaded by either of the JavaScript libraries should be pretty achievable.
Since jbkunst mentioned ggsunburst, here I post an example for reproducing the sunburst by sirex.
It is not exactly the same because in ggsunburst the angle of a node is equal to the sum of the angles of its children nodes.
# install ggsunburst package
# Install the dependencies only when they are missing. requireNamespace()
# checks availability without attaching, so the packages needed below are
# attached explicitly afterwards and are available even after a fresh install.
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
if (!requireNamespace("rPython", quietly = TRUE)) install.packages("rPython")
install.packages("http://genome.crg.es/~didac/ggsunburst/ggsunburst_0.0.9.tar.gz", repos = NULL, type = "source")
library(ggplot2)
library(ggsunburst)
# dataframe
# each row corresponds to a node in the hierarchy
# parent and node are required, the rest are optional attributes
# the attributes correspond to the node, not its parent
df <- read.table(header = TRUE, sep = ",", text = "
parent,node,size,color,dist
,/,,B,1
/,home,,D,1
home,Images, 40,E,1
home,Videos, 20,E,1
home,Documents, 5,E,1
/,usr,,D,1
usr,src,,A,1
src,linux-headers, 4,C,1.5
src,virtualbox, 1,C,1.5
usr,lib, 4,A,1
usr,share, 2,A,1
usr,bin, 1,A,1
usr,local, 1,A,1
usr,include, 1,A,1
")
write.table(df, 'df.csv', sep = ",", row.names = FALSE)
# compute coordinates from dataframe
# "node_attributes" is used to pass the attributes other than "size" and "dist",
# which are special attributes that alter the dimensions of the nodes
sb <- sunburst_data('df.csv', sep = ",", type = "node_parent", node_attributes = "color")
# plot
sunburst(sb, node_labels = TRUE, node_labels.min = 10, rects.fill.aes = "color") +
  scale_fill_brewer(palette = "Set1", guide = "none")
Here is an example using R and plotly (based on my answer here):
library(datasets)
library(data.table)
library(plotly)
# Convert a wide table of hierarchy columns into the long
# ids / labels / parents / values layout passed to plot_ly() below.
#
# DF:       data.frame or data.table; columns are read left to right as
#           hierarchy levels (outermost first).
# valueCol: optional name of a numeric column to sum per node. When NULL,
#           each node's value is its row count.
#
# Returns a data.table with columns labels, values, parents and ids. A
# synthetic root node (named after the first column of DF) is always added.
as.sunburstDF <- function(DF, valueCol = NULL) {
  # library() errors when data.table is missing; require() would only
  # return FALSE and let the function fail later on `:=`.
  library(data.table)

  colNamesDF <- names(DF)

  # Work on a private copy so the caller's table is never modified.
  if (is.data.table(DF)) {
    DT <- copy(DF)
  } else {
    DT <- data.table(DF, stringsAsFactors = FALSE)
  }

  DT[, root := names(DF)[1]]

  if (is.null(valueCol)) {
    setcolorder(DT, c("root", colNamesDF))
  } else {
    setnames(DT, valueCol, "values", skip_absent = TRUE)
    setcolorder(DT, c("root", setdiff(colNamesDF, valueCol), "values"))
  }

  # Take the column names only AFTER renaming/reordering, and as a copy, so
  # the result does not depend on names(DT) being updated by reference.
  colNamesDT <- copy(names(DT))
  hierarchyCols <- setdiff(colNamesDT, "values")

  # One aggregated table per hierarchy depth: root, root + level 1, ...
  hierarchyList <- vector("list", length(hierarchyCols))
  for (i in seq_along(hierarchyCols)) {
    currentCols <- colNamesDT[seq_len(i)]
    if (is.null(valueCol)) {
      currentDT <- unique(DT[, ..currentCols][, values := .N, by = currentCols], by = currentCols)
    } else {
      currentDT <- DT[, lapply(.SD, sum, na.rm = TRUE), by = currentCols, .SDcols = "values"]
    }
    # The deepest column of this level is the node label; the rest are ancestors.
    setnames(currentDT, length(currentCols), "labels")
    hierarchyList[[i]] <- currentDT
  }

  hierarchyDT <- rbindlist(hierarchyList, use.names = TRUE, fill = TRUE)

  parentCols <- setdiff(names(hierarchyDT), c("labels", "values", valueCol))
  # Parent id = ancestors joined with " - "; NA for the root row.
  hierarchyDT[, parents := apply(.SD, 1, function(x) {
    if (all(is.na(x))) NA_character_ else paste(x[!is.na(x)], collapse = " - ")
  }), .SDcols = parentCols]
  # Node id = parent id plus own label, which keeps ids unique across branches.
  hierarchyDT[, ids := apply(.SD, 1, function(x) {
    paste(x[!is.na(x)], collapse = " - ")
  }), .SDcols = c("parents", "labels")]
  hierarchyDT[, c(parentCols) := NULL]
  hierarchyDT
}
# Titanic contingency table in long form; N holds the passenger counts.
DF <- as.data.table(Titanic)
# Column order defines the ring order, innermost first.
setcolorder(DF, c("Survived", "Class", "Sex", "Age", "N"))
sunburstDF <- as.sunburstDF(DF, valueCol = "N")

# Sunburst
plot_ly(
  data = sunburstDF,
  ids = ~ids,
  labels = ~labels,
  parents = ~parents,
  values = ~values,
  type = 'sunburst',
  branchvalues = 'total'
)

# Treemap
# plot_ly(data = sunburstDF, ids = ~ids, labels= ~labels, parents = ~parents, values= ~values, type='treemap', branchvalues = 'total')
Some additional information can be found here.
You can also use plotly Sunburst on python as well as seen here
The same inputs can be used to create Icicle and Treemap graphs (supported too by plotly) which might also suit your needs.
Related
I want to combine hover and click in selections in Altair plot. The code below produces the result that I want: points are mostly transparent by default, hovering over the point increases the opacity, and then clicking on the point increases the opacity even more. I find this useful so that users can hover over a point to get a quick sense of the results, and then click on the point to "lock" the selection. While I am satisfied with the results, the method seems a bit cumbersome because I need to define different chart layers for the hover and click selections. If I could construct a multi-way condition expression, then it seems like I would be able to simplify the code quite a bit. I tried writing the opacity condition as alt.condition(click_selection, CLICK_OPACITY, alt.condition(hover_selection, HOVER_OPACITY, DEFAULT_OPACITY)), but I got an error. Is there a way to simplify my code below to combine hover and click selections?
import altair as alt
import numpy as np
import pandas as pd
# --- Data: one sine curve y = a * sin(2*pi*x) per amplitude a in {1, 2, 3} ---
a_values = np.arange(1, 4)
x_values = np.linspace(0, 2, 1000)
# Opacity levels for the three interaction states.
DEFAULT_OPACITY = 0.3
HOVER_OPACITY = 0.5
CLICK_OPACITY = 1.0
a_df = pd.DataFrame({'a': a_values})
df = pd.DataFrame({
'a': np.tile(A=a_values, reps=len(x_values)),
'x': np.repeat(a=x_values, repeats=len(a_values)),
})
df['y'] = df['a'] * np.sin(2 * np.pi * df['x'])
# --- Selections: both select on field 'a' and start out empty ---
# Hover selection follows the mouse and is cleared on mouseout.
hover_selection = alt.selection_single(
clear='mouseout',
empty='none',
fields=['a'],
name='hover_selection',
on='mouseover',
)
# Click selection is set by a click.
click_selection = alt.selection_single(
empty='none',
fields=['a'],
name='click_selection',
on='click',
)
# --- Left chart: one point per amplitude, plotted at (a, a) ---
a_base = alt.Chart(a_df).mark_point(
filled=True, size=100,
).encode(
x=alt.X(shorthand='a:Q', scale=alt.Scale(domain=(min(a_values) - 1, max(a_values) + 1))),
y=alt.Y(shorthand='a:Q', scale=alt.Scale(domain=(min(a_values) - 1, max(a_values) + 1))),
)
# Hover layer: HOVER_OPACITY for the hovered point, DEFAULT_OPACITY otherwise.
a_hover = a_base.encode(
opacity=alt.condition(hover_selection, alt.value(HOVER_OPACITY), alt.value(DEFAULT_OPACITY))
).add_selection(hover_selection)
# Click layer: CLICK_OPACITY for the clicked point, fully transparent (0.0)
# otherwise, so the hover layer underneath stays visible.
a_click = a_base.encode(
opacity=alt.condition(click_selection, alt.value(CLICK_OPACITY), alt.value(0.0)),
).add_selection(click_selection)
# --- Right chart: the sine curves, filtered down to the selected amplitude ---
y_base = alt.Chart(df).mark_line().encode(
x=alt.X(shorthand='x:Q', scale=alt.Scale(domain=(0, 2))),
y=alt.Y(shorthand='y:Q', scale=alt.Scale(domain=(-3, 3))),
)
y_hover = y_base.encode(
opacity=alt.value(HOVER_OPACITY),
).transform_filter(hover_selection)
y_click = y_base.encode(
opacity=alt.value(CLICK_OPACITY),
).transform_filter(click_selection)
# Points on the left, curves on the right; each side layers hover under click.
alt.hconcat(
alt.layer(a_hover, a_click),
alt.layer(y_hover, y_click),
)
Vega-Lite supports multiple selections in the same condition, but I don't think it is possible to express this with alt.condition. However, alt.condition simply returns a dictionary, so you can write that dictionary yourself and pass a list of selections directly.
This way you could clarify this section
a_base = alt.Chart(a_df).mark_point(
filled=True, size=100,
).encode(
x=alt.X(shorthand='a:Q', scale=alt.Scale(domain=(min(a_values) - 1, max(a_values) + 1))),
y=alt.Y(shorthand='a:Q', scale=alt.Scale(domain=(min(a_values) - 1, max(a_values) + 1))),
)
a_hover = a_base.encode(
opacity=alt.condition(hover_selection, alt.value(HOVER_OPACITY), alt.value(DEFAULT_OPACITY))
).add_selection(hover_selection)
a_click = a_base.encode(
opacity=alt.condition(click_selection, alt.value(CLICK_OPACITY), alt.value(0.0)),
).add_selection(click_selection)
to something like this:
# Entries in a condition list are tested in order and the first match wins.
# The click selection is therefore listed before the hover selection:
# a clicked point is still under the cursor, and it must get CLICK_OPACITY
# rather than HOVER_OPACITY.
hover_and_click_condition = {
    'condition': [
        {'selection': 'click_selection', 'value': CLICK_OPACITY},
        {'selection': 'hover_selection', 'value': HOVER_OPACITY},
    ],
    'value': DEFAULT_OPACITY,
}
# Single point layer carrying both selections; no separate hover/click charts.
a = alt.Chart(a_df).mark_point(
    filled=True, size=100,
).encode(
    x=alt.X(shorthand='a:Q', scale=alt.Scale(domain=(min(a_values) - 1, max(a_values) + 1))),
    y=alt.Y(shorthand='a:Q', scale=alt.Scale(domain=(min(a_values) - 1, max(a_values) + 1))),
    opacity=hover_and_click_condition
).add_selection(hover_selection, click_selection)
For the transform filters, you could rewrite this section
y_base = alt.Chart(df).mark_line().encode(
x=alt.X(shorthand='x:Q', scale=alt.Scale(domain=(0, 2))),
y=alt.Y(shorthand='y:Q', scale=alt.Scale(domain=(-3, 3))),
)
y_hover = y_base.encode(
opacity=alt.value(HOVER_OPACITY),
).transform_filter(hover_selection)
y_click = y_base.encode(
opacity=alt.value(CLICK_OPACITY),
).transform_filter(click_selection)
alt.hconcat(
alt.layer(a_hover, a_click),
alt.layer(y_hover, y_click),
)
like this:
# Line chart whose opacity is driven by the same combined condition dict
# that is used for the points.
y = alt.Chart(df).mark_line().encode(
x=alt.X(shorthand='x:Q', scale=alt.Scale(domain=(0, 2))),
y=alt.Y(shorthand='y:Q', scale=alt.Scale(domain=(-3, 3))),
opacity=hover_and_click_condition
)
# `|` places the charts side by side and `+` layers them: the right-hand
# side shows the curve for the clicked amplitude plus the hovered one.
a | (y.transform_filter(click_selection) + y.transform_filter(hover_selection))
I am trying to create a custom coloring for an animated choropleth map. I am using Plotly express and my dataframe looks like this.
where I am plotting the values on each region (region code=K_KRAJ, region name=N_KRAJ) and my animation is over the variables.
The values are in percentages so the min is 0 and max is 1. I want to divide the colors into 6 parts with exactly the midpoints as written here in color_continous_scale
# Animated choropleth: one frame per "variable", regions matched to the
# GeoJSON features through the K_KRAJ property.
fig = px.choropleth(df_anim,
locations="K_KRAJ",
featureidkey="properties.K_KRAJ",
geojson=regions_json,
color="value",
hover_name="N_KRAJ",
# Step-wise scale: each colour is listed at both ends of its interval so
# that there is no gradient between neighbouring colours.
# NOTE(review): these stops are written as absolute values (0.0001 ... 1);
# the surrounding text reports they only give the intended colours when
# range_color=(0, 1) is also passed - confirm how positions are normalised.
color_continuous_scale=[(0.0, "#e5e5e5"), (0.0001, "#e5e5e5"),
(0.0001, "#ffe5f0"), (0.0075, "#ffe5f0"),
(0.0075, "#facfdf"), (0.01, "#facfdf"),
(0.01, "#f3b8ce"), (0.025, "#f3b8ce"),
(0.025, "#eca2bf"), (0.05, "#eca2bf"),
(0.05, "#e37fb1"), (1, "#e37fb1")
],
animation_frame="variable"
)
# Zoom to the plotted regions and hide the base map.
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
Unfortunately, that creates a wrong map like this
instead of a map like this
the second map which is almost correct was created using the largest value as 100% and mathematically finding the midpoints. Even though this is very close to being correct, there can always be numerical mistakes and I would rather use the code shown above if it worked correctly.
the almost correct one was created like this (max value was 0.06821107602623269)
color_continuous_scale=[(0.0, "#e5e5e5"), (0.001449275362, "#e5e5e5"), # 0.01% , 0.0001
(0.01449275362, "#ffe5f0"), (0.1086956522, "#ffe5f0"), # 0.75% , 0.0075
(0.1086956522, "#facfdf"), (0.1449275362, "#facfdf"), # 1% , 0.01
(0.1449275362, "#f3b8ce"), (0.3623188406, "#f3b8ce"), # 2.5% , 0.025
(0.3623188406, "#eca2bf"), (0.7246376812, "#eca2bf"), # 5% , 0.05
(0.7246376812, "#e37fb1"), (1, "#e37fb1") # 6.9% , 0.069
],
And even best if someone knew how to change the numbers in the colorscale which is shown in the images on the right from numbers to percentages (0.05 -> 5%)
If I add range_color=(0, 1) it adds the correct colors but then there is a useless colorbar on the right.
color_continuous_scale is a Plotly Express construct not limited to choropleths. Hence technique presented is how to build a color scale
I cannot find a repeatable source of Czech region geometry, hence the code below only works as an MWE if you have the geometry in your own Downloads folder
core solution
given you want six bins, start by using pd.cut() and get the bin edges
with this scale them to be between 0 and 1 to work with color scales
construct colorscale with hard edges
# Bin edges of five equal-width bins over the observed values
# (element [1] of the pd.cut(..., retbins=True) result).
edges = pd.cut(df_anim["value"], bins=5, retbins=True)[1]
# Drop the last edge and express the remaining ones as a fraction of it.
# NOTE(review): this divides by the top edge only and does not subtract the
# minimum - confirm this is the normalisation the colour scale expects.
edges = edges[:-1] / edges[-1]
colors = ["#e5e5e5", "#ffe5f0", "#facfdf", "#f3b8ce", "#eca2bf", "#e37fb1"]
# Every edge is emitted twice (np.repeat) and paired with colors[(i + 1) // 2],
# so the colour switches abruptly at each edge instead of blending.
cc_scale = (
[(0, colors[0])]
+ [(e, colors[(i + 1) // 2]) for i, e in enumerate(np.repeat(edges, 2))]
+ [(1, colors[5])]
)
from pathlib import Path
import geopandas as gpd
import pandas as pd
import numpy as np
import plotly.express as px
# simulate source data
# Load the first shapefile matching *KRAJ*.shp from ~/Downloads/WGS84.
gdf = gpd.read_file(
list(Path.home().joinpath("Downloads/WGS84").glob("*KRAJ*.shp"))[0]
).set_crs("epsg:4326")
# Simplify the geometry in an estimated UTM projection, then convert back
# to the original CRS.
gdf["geometry"] = gdf.to_crs(gdf.estimate_utm_crs()).simplify(2000).to_crs(gdf.crs)
regions_json = gdf.__geo_interface__
# Build a region table from the feature properties (columns starting "pro")
# and rename ID / NAZEV_NUTS to the names used in the question.
df = (
pd.json_normalize(regions_json["features"])
.pipe(lambda d: d.loc[:, [c.strip() for c in d.columns if c[0:3] == "pro"]])
.rename(columns={"properties.ID": "K_KRAJ", "properties.NAZEV_NUTS": "N_KRAJ"})
)
# Cross join with the animation variables REL1506 ... REL2012 and attach a
# random value in [0, 0.003) to every (region, variable) pair.
df_anim = df.merge(
pd.DataFrame(
{"variable": [f"REL{n1}{n2}" for n1 in range(15, 21) for n2 in ["06", "12"]]}
),
how="cross",
).pipe(lambda d: d.assign(value=np.random.uniform(0, 0.003, len(d))))
# end data simulation
# Bin edges of five equal-width bins over the observed values.
edges = pd.cut(df_anim["value"], bins=5, retbins=True)[1]
# Drop the last edge and express the remaining ones as a fraction of it.
# NOTE(review): no minimum is subtracted - confirm this matches how the
# colour range is normalised.
edges = edges[:-1] / edges[-1]
colors = ["#e5e5e5", "#ffe5f0", "#facfdf", "#f3b8ce", "#eca2bf", "#e37fb1"]
# Each edge appears twice with consecutive colours, giving hard colour steps.
cc_scale = (
[(0, colors[0])]
+ [(e, colors[(i + 1) // 2]) for i, e in enumerate(np.repeat(edges, 2))]
+ [(1, colors[5])]
)
fig = px.choropleth(
df_anim,
locations="K_KRAJ",
featureidkey="properties.ID", ### ! changed !
geojson=regions_json,
color="value",
hover_name="N_KRAJ",
color_continuous_scale=cc_scale,
animation_frame="variable",
)
# Zoom to the plotted regions and hide the base map.
fig.update_geos(fitbounds="locations", visible=False)
I want to plot a dendrogram plot for hierarchical clustering using plotly and show a small subset of the plot as with the large number of samples the plot can be very dense at the bottom.
I have plotted the plot using the plotly wrapper function create_dendrogram with the below code:
from scipy.cluster.hierarchy import linkage
import plotly.figure_factory as ff
# linkagefun is called with the pairwise distances that create_dendrogram has
# already computed, so the lambda uses its argument instead of re-reading
# test_df from the enclosing scope.
fig = ff.create_dendrogram(test_df, linkagefun=lambda x: linkage(x, 'average'))
fig.update_layout(autosize=True, hovermode='closest')
fig.update_xaxes(mirror=False, showgrid=True, showline=False, showticklabels=False)
fig.update_yaxes(mirror=False, showgrid=True, showline=True)
fig.show()
And below is the plot using matplotlib which is used by default by the scipy library truncated to 4 levels for ease of interpretation:
from scipy.cluster.hierarchy import dendrogram,linkage
# Average-linkage clustering of the samples in test_df.
x = linkage(test_df,method='average')
# Draw the dendrogram with truncate_mode='level' and p=4.
dendrogram(x,truncate_mode='level',p=4)
plt.show()
As you can see, the truncation is very useful for interpreting a large number of samples. How can I achieve this in plotly?
There does not seem to be a straight-forward way to do this with ff.create_dendrogram(). This does not mean it's impossible though. But I would at least consider the brilliant functionalities that Dash Clustergram has to offer. If you insist on sticking to ff.create_dendrogram(), this is going to get a bit more messy than Plotly users rightfully have grown accustomed to. You haven't provided a data sample, so let's use the Plotly Basic Dendrogram example instead:
Plot 1
Code 1
import plotly.figure_factory as ff
import numpy as np
# Fixed seed so the random sample (and therefore the plot) is reproducible.
np.random.seed(1)
X = np.random.rand(15, 12) # 15 samples, with 12 dimensions each
fig = ff.create_dendrogram(X)
fig.update_layout(width=800, height=500)
# The result bound to f is not used anywhere else in this snippet.
f = fig.full_figure_for_development(warn=False)
fig.show()
The good news is that the exact same snippet will produce the following truncated plot after we've taken a few steps that I'll explain in the details below.
Plot 2
The details
If anyone who got this far in my answer knows a better way to do the following, then please share.
1. ff.create_dendrogram() is a wrapper for scipy.cluster.hierarchy.dendrogram
You can call help(ff.create_dendrogram) and learn that:
[...]This is a thin wrapper around scipy.cluster.hierarchy.dendrogram.
From the available arguments you can also see that none seem to handle anything related to truncating:
create_dendrogram(X, orientation='bottom', labels=None,
colorscale=None, distfun=None, linkagefun=<function at
0x0000016F09D4CEE0>, hovertext=None, color_threshold=None)
2. Take a closer look at scipy.cluster.hierarchy.dendrogram
Here we can see that some central elements have been left out after implementing the function in ff.create_dendrogram(X) when we compare it to the source:
scipy.cluster.hierarchy.dendrogram(Z, p=30, truncate_mode=None, color_threshold=None, get_leaves=True, orientation='top', labels=None, count_sort=False, distance_sort=False, show_leaf_counts=True, no_plot=False, no_labels=False, leaf_font_size=None, leaf_rotation=None, leaf_label_func=None, show_contracted=False, link_color_func=None, ax=None, above_threshold_color='C0')
truncate_mode should be exactly what we're looking for. So, now we know that scipy probably has all we need to build the foundation for a truncated dendrogram, but what's next?
3. Find where scipy.cluster.hierarchy.dendrogram is hiding in ff.create_dendrogram(X)
ff.create_dendrogram.__code__ will reveal where the source code exists in your system. In my case this is:
"C:\Users\vestland\Miniconda3\envs\dashy\lib\site-packages\plotly\figure_factory\_dendrogram.py"
So if you would like you can take a closer look at the complete source in your corresponding folder. If you do, you'll see one particularly interesting section where some attributes that we have listed above are taken care of:
def get_dendrogram_traces(
self, X, colorscale, distfun, linkagefun, hovertext, color_threshold
):
"""
Calculates all the elements needed for plotting a dendrogram.
.
.
.
P = sch.dendrogram(
Z,
orientation=self.orientation,
labels=self.labels,
no_plot=True,
color_threshold=color_threshold,
)
Here we are at the very core of the problem. And the first step to a complete answer to your question is simply to include truncate_mode and p in P like this:
P = sch.dendrogram(
Z,
orientation=self.orientation,
labels=self.labels,
no_plot=True,
color_threshold=color_threshold,
truncate_mode = 'level',
p = 2
)
And here's how you do that:
4. Monkey patching
In Python, the term monkey patch only refers to dynamic modifications of a class or module at runtime, which means monkey patch is a piece of Python code that extends or modifies other code at runtime. And here's the essence of how you can do exactly that in our case:
import plotly.figure_factory._dendrogram as original_dendrogram
original_dendrogram._Dendrogram.get_dendrogram_traces = modified_dendrogram_traces
Where modified_dendrogram_traces is the complete function definition of modified_dendrogram_traces() with the amendments I've already mentioned. As well as a few imports that will be missing that otherwise are run when you call import plotly.figure_factory as ff
Enough details for now. Below is the whole thing. If this is something you can use, we could perhaps make the whole thing a bit more dynamical than hardcoding truncate_mode = 'level' and p = 2.
Complete code:
from scipy.cluster.hierarchy import linkage
import plotly.figure_factory as ff
import plotly.figure_factory._dendrogram as original_dendrogram
import numpy as np
def modified_dendrogram_traces(
    self, X, colorscale, distfun, linkagefun, hovertext, color_threshold,
    truncate_mode="level", p=2,
):
    """
    Calculates all the elements needed for plotting a (truncated) dendrogram.

    Drop-in replacement for ``_Dendrogram.get_dendrogram_traces`` that
    forwards ``truncate_mode`` and ``p`` to ``scipy.cluster.hierarchy.dendrogram``.

    :param (ndarray) X: Matrix of observations as array of arrays
    :param (list) colorscale: Color scale for dendrogram tree clusters
    :param (function) distfun: Function to compute the pairwise distance
                               from the observations
    :param (function) linkagefun: Function to compute the linkage matrix
                                  from the pairwise distances
    :param (list) hovertext: List of hovertext for constituent traces of dendrogram
    :param (double) color_threshold: Passed through to scipy's dendrogram
    :param (str) truncate_mode: Passed through to scipy's dendrogram;
                                defaults to 'level'
    :param (int) p: Passed through to scipy's dendrogram; defaults to 2
    :rtype (tuple): Contains all the traces in the following order:
        (a) trace_list: List of Plotly trace objects for dendrogram tree
        (b) icoord: All X points of the dendrogram tree as array of arrays
            with length 4
        (c) dcoord: All Y points of the dendrogram tree as array of arrays
            with length 4
        (d) ordered_labels: leaf labels in the order they are going to
            appear on the plot
        (e) P['leaves']: left-to-right traversal of the leaves
    """
    # Function-scope imports keep the patch self-contained once it has been
    # assigned onto the plotly class.
    import numpy as np
    import scipy.cluster.hierarchy as sch

    d = distfun(X)
    Z = linkagefun(d)
    P = sch.dendrogram(
        Z,
        orientation=self.orientation,
        labels=self.labels,
        no_plot=True,
        color_threshold=color_threshold,
        truncate_mode=truncate_mode,
        p=p,
    )

    # numpy's array constructor is used directly; the scipy.array alias is
    # not available in current SciPy releases.
    icoord = np.array(P["icoord"])
    dcoord = np.array(P["dcoord"])
    ordered_labels = np.array(P["ivl"])
    color_list = np.array(P["color_list"])
    colors = self.get_color_dict(colorscale)

    def _axis_ref(prefix, axis_name):
        # "xaxis" -> "x", "xaxis2" -> "x2": keep a trailing digit if present.
        suffix = axis_name[-1]
        return prefix + (suffix if suffix.isdigit() else "")

    # Loop invariants: orientation and axis references do not change per link.
    vertical = self.orientation in ["top", "bottom"]
    x_ref = _axis_ref("x", self.xaxis)
    y_ref = _axis_ref("y", self.yaxis)

    trace_list = []
    for i in range(len(icoord)):
        # xs and ys are arrays of 4 points that make up the '∩' shapes
        # of the dendrogram tree
        if vertical:
            xs, ys = icoord[i], dcoord[i]
        else:
            xs, ys = dcoord[i], icoord[i]

        color_key = color_list[i]
        hovertext_label = hovertext[i] if hovertext else None

        trace = dict(
            type="scatter",
            x=np.multiply(self.sign[self.xaxis], xs),
            y=np.multiply(self.sign[self.yaxis], ys),
            mode="lines",
            marker=dict(color=colors[color_key]),
            text=hovertext_label,
            hoverinfo="text",
        )
        trace["xaxis"] = x_ref
        trace["yaxis"] = y_ref
        trace_list.append(trace)

    return trace_list, icoord, dcoord, ordered_labels, P["leaves"]
# Monkey patch: replace plotly's trace builder with the truncating variant
# defined above, so ff.create_dendrogram() below picks it up.
original_dendrogram._Dendrogram.get_dendrogram_traces = modified_dendrogram_traces
X = np.random.rand(15, 12) # 15 samples, with 12 dimensions each
fig = ff.create_dendrogram(X)
fig.update_layout(width=800, height=500)
# The result bound to f is not used anywhere else in this snippet.
f = fig.full_figure_for_development(warn=False)
fig.show()
To make it more dynamic, you can pass **kwargs to the create_dendrogram() function. If you check the source code, you will see that **kwargs needs to be passed along in several other places as well, both in the _Dendrogram class and in the get_dendrogram_traces() function.
If you don't want to mess with _dendogram.py which located in the default directory, I advise you to copy whole file and create a new file (Lets say modified_dendogram.py) in your current directory.
Then simply import that local file using from modified_dendogram import create_dendrogram.
Now you can use all the arguments that scipy.cluster.hierarchy.dendrogram supports.
modified_dendogram.py:
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from collections import OrderedDict
from plotly import exceptions, optional_imports
from plotly.graph_objs import graph_objs
# Optional imports, may be None for users that only use our core functionality.
np = optional_imports.get_module("numpy")
scp = optional_imports.get_module("scipy")
sch = optional_imports.get_module("scipy.cluster.hierarchy")
scs = optional_imports.get_module("scipy.spatial")
def create_dendrogram(
    X,
    orientation="bottom",
    labels=None,
    colorscale=None,
    distfun=None,
    linkagefun=lambda x: sch.linkage(x, "complete"),
    hovertext=None,
    color_threshold=None,
    **kwargs
):
    """
    Function that returns a dendrogram Plotly figure object. This is a thin
    wrapper around scipy.cluster.hierarchy.dendrogram.

    See also https://dash.plot.ly/dash-bio/clustergram.

    :param (ndarray) X: Matrix of observations as array of arrays
    :param (str) orientation: 'top', 'right', 'bottom', or 'left'
    :param (list) labels: List of axis category labels(observation labels)
    :param (list) colorscale: Optional colorscale for the dendrogram tree.
                              Requires 8 colors to be specified, the 7th of
                              which is ignored.  With scipy>=1.5.0, the 2nd, 3rd
                              and 6th are used twice as often as the others.
                              Given a shorter list, the missing values are
                              replaced with defaults and with a longer list the
                              extra values are ignored.
    :param (function) distfun: Function to compute the pairwise distance from
                               the observations
    :param (function) linkagefun: Function to compute the linkage matrix from
                                  the pairwise distances
    :param (list[list]) hovertext: List of hovertext for constituent traces of dendrogram
                               clusters
    :param (double) color_threshold: Value at which the separation of clusters will be made
    :param kwargs: Extra keyword arguments, collected into a dict and handed
                   to ``_Dendrogram`` as its ``kwargs`` argument. Presumably
                   these end up as scipy.cluster.hierarchy.dendrogram options
                   (e.g. ``truncate_mode``, ``p``) - verify against
                   ``get_dendrogram_traces``.

    :raises ImportError: if scipy, scipy.spatial or scipy.cluster.hierarchy
                         is not available
    :raises PlotlyError: if ``X`` is not 2-dimensional

    Example 1: Simple bottom oriented dendrogram

    >>> from plotly.figure_factory import create_dendrogram

    >>> import numpy as np

    >>> X = np.random.rand(10,10)
    >>> fig = create_dendrogram(X)
    >>> fig.show()

    Example 2: Dendrogram to put on the left of the heatmap

    >>> from plotly.figure_factory import create_dendrogram

    >>> import numpy as np

    >>> X = np.random.rand(5,5)
    >>> names = ['Jack', 'Oxana', 'John', 'Chelsea', 'Mark']
    >>> dendro = create_dendrogram(X, orientation='right', labels=names)
    >>> dendro.update_layout({'width':700, 'height':500}) # doctest: +SKIP
    >>> dendro.show()

    Example 3: Dendrogram with Pandas

    >>> from plotly.figure_factory import create_dendrogram

    >>> import numpy as np
    >>> import pandas as pd

    >>> Index= ['A','B','C','D','E','F','G','H','I','J']
    >>> df = pd.DataFrame(abs(np.random.randn(10, 10)), index=Index)
    >>> fig = create_dendrogram(df, labels=Index)
    >>> fig.show()
    """
    if not scp or not scs or not sch:
        raise ImportError(
            "FigureFactory.create_dendrogram requires scipy, "
            "scipy.spatial and scipy.hierarchy"
        )

    s = X.shape
    if len(s) != 2:
        # The exception has to be raised; merely constructing it lets
        # malformed input fall through to the clustering code.
        raise exceptions.PlotlyError("X should be 2-dimensional array.")

    if distfun is None:
        distfun = scs.distance.pdist

    dendrogram = _Dendrogram(
        X,
        orientation,
        labels,
        colorscale,
        distfun=distfun,
        linkagefun=linkagefun,
        hovertext=hovertext,
        color_threshold=color_threshold,
        kwargs=kwargs,
    )

    return graph_objs.Figure(data=dendrogram.data, layout=dendrogram.layout)
class _Dendrogram(object):
"""Refer to FigureFactory.create_dendrogram() for docstring."""
def __init__(
self,
X,
orientation="bottom",
labels=None,
colorscale=None,
width=np.inf,
height=np.inf,
xaxis="xaxis",
yaxis="yaxis",
distfun=None,
linkagefun=lambda x: sch.linkage(x, "complete"),
hovertext=None,
color_threshold=None,
kwargs=None
):
self.orientation = orientation
self.labels = labels
self.xaxis = xaxis
self.yaxis = yaxis
self.data = []
self.leaves = []
self.sign = {self.xaxis: 1, self.yaxis: 1}
self.layout = {self.xaxis: {}, self.yaxis: {}}
if self.orientation in ["left", "bottom"]:
self.sign[self.xaxis] = 1
else:
self.sign[self.xaxis] = -1
if self.orientation in ["right", "bottom"]:
self.sign[self.yaxis] = 1
else:
self.sign[self.yaxis] = -1
if distfun is None:
distfun = scs.distance.pdist
(dd_traces, xvals, yvals, ordered_labels, leaves) = self.get_dendrogram_traces(
X, colorscale, distfun, linkagefun, hovertext, color_threshold, kwargs
)
self.labels = ordered_labels
self.leaves = leaves
yvals_flat = yvals.flatten()
xvals_flat = xvals.flatten()
self.zero_vals = []
for i in range(len(yvals_flat)):
if yvals_flat[i] == 0.0 and xvals_flat[i] not in self.zero_vals:
self.zero_vals.append(xvals_flat[i])
if len(self.zero_vals) > len(yvals) + 1:
# If the length of zero_vals is larger than the length of yvals,
# it means that there are wrong vals because of the identicial samples.
# Three and more identicial samples will make the yvals of spliting
# center into 0 and it will accidentally take it as leaves.
l_border = int(min(self.zero_vals))
r_border = int(max(self.zero_vals))
correct_leaves_pos = range(
l_border, r_border + 1, int((r_border - l_border) / len(yvals))
)
# Regenerating the leaves pos from the self.zero_vals with equally intervals.
self.zero_vals = [v for v in correct_leaves_pos]
self.zero_vals.sort()
self.layout = self.set_figure_layout(width, height)
self.data = dd_traces
def get_color_dict(self, colorscale):
"""
Returns colorscale used for dendrogram tree clusters.
:param (list) colorscale: Colors to use for the plot in rgb format.
:rtype (dict): A dict of default colors mapped to the user colorscale.
"""
# These are the color codes returned for dendrograms
# We're replacing them with nicer colors
# This list is the colors that can be used by dendrogram, which were
# determined as the combination of the default above_threshold_color and
# the default color palette (see scipy/cluster/hierarchy.py)
d = {
"r": "red",
"g": "green",
"b": "blue",
"c": "cyan",
"m": "magenta",
"y": "yellow",
"k": "black",
# TODO: 'w' doesn't seem to be in the default color
# palette in scipy/cluster/hierarchy.py
"w": "white",
}
default_colors = OrderedDict(sorted(d.items(), key=lambda t: t[0]))
if colorscale is None:
rgb_colorscale = [
"rgb(0,116,217)", # blue
"rgb(35,205,205)", # cyan
"rgb(61,153,112)", # green
"rgb(40,35,35)", # black
"rgb(133,20,75)", # magenta
"rgb(255,65,54)", # red
"rgb(255,255,255)", # white
"rgb(255,220,0)", # yellow
]
else:
rgb_colorscale = colorscale
for i in range(len(default_colors.keys())):
k = list(default_colors.keys())[i] # PY3 won't index keys
if i < len(rgb_colorscale):
default_colors[k] = rgb_colorscale[i]
# add support for cyclic format colors as introduced in scipy===1.5.0
# before this, the colors were named 'r', 'b', 'y' etc., now they are
# named 'C0', 'C1', etc. To keep the colors consistent regardless of the
# scipy version, we try as much as possible to map the new colors to the
# old colors
# this mapping was found by inpecting scipy/cluster/hierarchy.py (see
# comment above).
new_old_color_map = [
("C0", "b"),
("C1", "g"),
("C2", "r"),
("C3", "c"),
("C4", "m"),
("C5", "y"),
("C6", "k"),
("C7", "g"),
("C8", "r"),
("C9", "c"),
]
for nc, oc in new_old_color_map:
try:
default_colors[nc] = default_colors[oc]
except KeyError:
# it could happen that the old color isn't found (if a custom
# colorscale was specified), in this case we set it to an
# arbitrary default.
default_colors[nc] = "rgb(0,116,217)"
return default_colors
def set_axis_layout(self, axis_key):
"""
Sets and returns default axis object for dendrogram figure.
:param (str) axis_key: E.g., 'xaxis', 'xaxis1', 'yaxis', yaxis1', etc.
:rtype (dict): An axis_key dictionary with set parameters.
"""
axis_defaults = {
"type": "linear",
"ticks": "outside",
"mirror": "allticks",
"rangemode": "tozero",
"showticklabels": True,
"zeroline": False,
"showgrid": False,
"showline": True,
}
if len(self.labels) != 0:
axis_key_labels = self.xaxis
if self.orientation in ["left", "right"]:
axis_key_labels = self.yaxis
if axis_key_labels not in self.layout:
self.layout[axis_key_labels] = {}
self.layout[axis_key_labels]["tickvals"] = [
zv * self.sign[axis_key] for zv in self.zero_vals
]
self.layout[axis_key_labels]["ticktext"] = self.labels
self.layout[axis_key_labels]["tickmode"] = "array"
self.layout[axis_key].update(axis_defaults)
return self.layout[axis_key]
def set_figure_layout(self, width, height):
"""
Sets and returns default layout object for dendrogram figure.
"""
self.layout.update(
{
"showlegend": False,
"autosize": False,
"hovermode": "closest",
"width": width,
"height": height,
}
)
self.set_axis_layout(self.xaxis)
self.set_axis_layout(self.yaxis)
return self.layout
def get_dendrogram_traces(
self, X, colorscale, distfun, linkagefun, hovertext, color_threshold, kwargs={}
):
"""
Calculates all the elements needed for plotting a dendrogram.
:param (ndarray) X: Matrix of observations as array of arrays
:param (list) colorscale: Color scale for dendrogram tree clusters
:param (function) distfun: Function to compute the pairwise distance
from the observations
:param (function) linkagefun: Function to compute the linkage matrix
from the pairwise distances
:param (list) hovertext: List of hovertext for constituent traces of dendrogram
:rtype (tuple): Contains all the traces in the following order:
(a) trace_list: List of Plotly trace objects for dendrogram tree
(b) icoord: All X points of the dendrogram tree as array of arrays
with length 4
(c) dcoord: All Y points of the dendrogram tree as array of arrays
with length 4
(d) ordered_labels: leaf labels in the order they are going to
appear on the plot
(e) P['leaves']: left-to-right traversal of the leaves
"""
d = distfun(X)
Z = linkagefun(d)
P = sch.dendrogram(
Z,
orientation=self.orientation,
labels=self.labels,
no_plot=True,
color_threshold=color_threshold,
**kwargs
)
icoord = scp.array(P["icoord"])
dcoord = scp.array(P["dcoord"])
ordered_labels = scp.array(P["ivl"])
color_list = scp.array(P["color_list"])
colors = self.get_color_dict(colorscale)
trace_list = []
for i in range(len(icoord)):
# xs and ys are arrays of 4 points that make up the '∩' shapes
# of the dendrogram tree
if self.orientation in ["top", "bottom"]:
xs = icoord[i]
else:
xs = dcoord[i]
if self.orientation in ["top", "bottom"]:
ys = dcoord[i]
else:
ys = icoord[i]
color_key = color_list[i]
hovertext_label = None
if hovertext:
hovertext_label = hovertext[i]
trace = dict(
type="scatter",
x=np.multiply(self.sign[self.xaxis], xs),
y=np.multiply(self.sign[self.yaxis], ys),
mode="lines",
marker=dict(color=colors[color_key]),
text=hovertext_label,
hoverinfo="text",
)
try:
x_index = int(self.xaxis[-1])
except ValueError:
x_index = ""
try:
y_index = int(self.yaxis[-1])
except ValueError:
y_index = ""
trace["xaxis"] = "x" + x_index
trace["yaxis"] = "y" + y_index
trace_list.append(trace)
return trace_list, icoord, dcoord, ordered_labels, P["leaves"]
Example:
from modified_dendogram import create_dendrogram
import numpy as np
# Seed NumPy's global RNG so the random observations are reproducible.
np.random.seed(1)
n_samples, n_dims = 15, 12
samples = np.random.rand(n_samples, n_dims)  # one row per sample

# Build the dendrogram figure, size it, and display it.
dendro_fig = create_dendrogram(samples)
dendro_fig.update_layout(width=800, height=500)
dendro_fig.show()
from utils.modified_dendogram import create_dendrogram
import numpy as np
# Same reproducible random observations as in the basic example.
np.random.seed(1)
n_samples, n_dims = 15, 12
samples = np.random.rand(n_samples, n_dims)  # one row per sample

# truncate_mode and p are passed as extra keyword arguments to create_dendrogram.
dendro_fig = create_dendrogram(samples, truncate_mode="level", p=1)
dendro_fig.update_layout(width=800, height=500)
dendro_fig.show()
I'm trying to generate a faceted boxplot for linear model results, with treatments on the x axis. A conventional way to show significance is to append asterisks.
I'm finding this surprisingly difficult to do in plotly.
Example code:
import numpy as np
import pandas as pd
# Data
# Design: every treatment is observed n times in each (column, row) cell.
n = 10
conditiona = ["left", "right"]
conditionb = ["top", "middle", "bottom"]
N = n * len(conditiona) * len(conditionb)  # observations per treatment

treatments = ["a", "b", "c"]
n_trt = len(treatments)

trt = np.repeat(treatments, N)
eff = np.repeat([3, 2, 1], N)  # treatment effect added to the noise
noise = np.random.normal(loc=0, scale=1, size=n_trt * N)
pval = np.repeat(["**", "", ""], N)  # significance marker per treatment
col = np.tile(np.repeat(conditiona, n * len(conditionb)), n_trt)
row = np.tile(np.repeat(conditionb, n), len(conditiona) * n_trt)

df = pd.DataFrame(
    {
        "y": noise + eff,
        "trt": trt,
        "p": pval,
        "column": col,
        "row": row,
    }
)
## Plot
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Facet levels and treatment groups, in order of first appearance in df.
rows = df.row.unique().tolist()
cols = df.column.unique().tolist()
groups = df.trt.unique().tolist()

# One subplot title per (row, column) cell, listed row by row.
labs = [i + ' ' + j for j in rows for i in cols]
colors = ['red', 'green', 'blue']

fig = make_subplots(
    rows=len(rows),
    cols=len(cols),
    shared_xaxes=True,
    subplot_titles=labs,
)

# One box trace per (row, column, treatment) combination.
for (row_name, col_name, trt_name), dx in df.groupby(['row', 'column', 'trt']):
    r = rows.index(row_name) + 1  # 1-based numbering
    c = cols.index(col_name) + 1
    name = str(trt_name)
    color_idx = groups.index(trt_name)
    tr = go.Box(
        y=dx['y'],
        boxpoints='all',
        name=name,
        marker_color=colors[color_idx],
        text=dx['p'],
    )
    # tr2 = go.Scatter(x = 'x0', <- how do I get relative x coordinates of tr to put in here ?
    # y = dx['y'].median(), text = dx['p'].unique())
    fig.add_trace(tr, row=r, col=c)

fig.show()
[Desired] Output:
Is there an easy way to 'extract' the x coordinates of a box trace so I can overlay a marker?
Seems like this shouldn't be hard.
Figured it out eventually. You just have to know how plotly sets things up beforehand, apparently.
You can use annotations with xref and yref referencing the subplots. The pattern of assignment is confusing (to me) and poorly documented.
y_refs increase sequentially, row by row, reading left to right and starting from subplot row 1 (which make_subplots places at the top by default). Thus in this figure the top left panel is 'y1', top right is 'y2', middle left is 'y3', middle right is 'y4' and so on.
# Rebuild the faceted box plot and put each treatment's significance marker
# on top of its box via an annotation tied to that subplot's axes.
ncol = len(cols)
fig = make_subplots(
    rows=len(rows),
    cols=len(cols),
    shared_xaxes=True,
    subplot_titles=labs,
)

for (row_name, col_name, trt_name), dx in df.groupby(['row', 'column', 'trt']):
    r = rows.index(row_name) + 1  # 1-based numbering
    c = cols.index(col_name) + 1
    name = str(trt_name)
    color_idx = groups.index(trt_name)
    tr = go.Box(
        y=dx['y'],
        boxpoints='all',
        name=name,
        marker_color=colors[color_idx],
        text=dx['p'],
    )
    fig.add_trace(tr, row=r, col=c)

    # x axes are referenced by column; y axes are numbered row by row,
    # left to right, hence (r - 1) * ncol + c.
    xref = 'x' + str(c)
    yref = 'y' + str((r - 1) * ncol + c)
    fig.add_annotation(
        x=name,
        y=dx.y.median(),
        text=dx.p.unique()[0],
        ax=0,
        ay=0,
        showarrow=False,
        xref=xref,
        yref=yref,
        font=dict(size=24),
    )

fig.show()
In Tableau I'm used to making graphs like the one below. It has for each day (or some other discrete variable), a stacked bar of categories of different colours, heights and widths.
You can imagine the categories to be different advertisements that I show to people. The heights correspond to the percentage of people I've shown the advertisement to, and the widths correspond to the rate of acceptance.
It allows me to see very easily which advertisements I should probably show more often (short, but wide bars, like the 'C' category on September 13th and 14th) and which I should show less often (tall, narrow bars, like the 'H' category on September 16th).
Any ideas on how I could create a graph like this in R or Python?
Unfortunately, this is not so trivial to achieve with ggplot2 (I think), because geom_bar does not really support changing widths for the same x position. But with a bit of effort, we can achieve the same result:
Create some fake data
# One row per advertisement (A-G) and day (1-5), with random tile sizes.
set.seed(1234)
d <- expand.grid(adv = LETTERS[1:7], day = 1:5)
n_obs <- nrow(d)
d$height <- runif(n_obs, min = 1, max = 3)
d$width <- runif(n_obs, min = 0.1, max = 0.3)
My data doesn't add up to 100%, cause I'm lazy.
head(d, 10)
# adv day height width
# 1 A 1 1.227407 0.2519341
# 2 B 1 2.244599 0.1402496
# 3 C 1 2.218549 0.1517620
# 4 D 1 2.246759 0.2984301
# 5 E 1 2.721831 0.2614705
# 6 F 1 2.280621 0.2106667
# 7 G 1 1.018992 0.2292812
# 8 A 2 1.465101 0.1623649
# 9 B 2 2.332168 0.2243638
# 10 C 2 2.028502 0.1659540
Make a new variable for stacking
We can't easily use position_stack I think, so we'll just do that part ourselves. Basically, we need to calculate the cumulative height for every bar, grouped by day. Using dplyr we can do that very easily.
library(dplyr)
# Running total of the heights within each day, used to stack the tiles.
d2 <- d %>%
  group_by(day) %>%
  mutate(cum_height = cumsum(height))
Make the plot
Finally, we create the plot. Note that the x and y refer to the middle of the tiles.
library(ggplot2)
# geom_tile() positions tiles by their centre, so each tile's y is its
# cumulative height minus half of its own height.
ggplot(d2, aes(x = day, y = cum_height - height / 2, fill = adv)) +
  geom_tile(aes(width = width, height = height), show.legend = FALSE) +
  geom_text(aes(label = adv)) +
  scale_fill_brewer(type = "qual", palette = 2) +
  labs(title = "Views and other stuff", y = "% of views")
If you don't want to play around with correctly scaling the widths (to something < 1), you can use facets instead:
# Same tiles, but one facet per day: every facet uses x = 1, so the tile
# widths do not have to be scaled against the spacing between days.
ggplot(d2, aes(x = 1, y = cum_height - height / 2, fill = adv)) +
  geom_tile(aes(width = width, height = height), show.legend = FALSE) +
  geom_text(aes(label = adv)) +
  facet_grid(~day) +
  scale_fill_brewer(type = "qual", palette = 2) +
  labs(title = "Views and other stuff", y = "% of views", x = "")
Result
# Simulated shares: each column (day) of `dat` sums to 1 over the categories.
set.seed(1)
days <- 5
cats <- 8
dat <- prop.table(matrix(rpois(days * cats, days), nrow = cats), margin = 2)

# barplot() is called for its return value: the x midpoint of every bar.
bp1 <- barplot(dat, col = seq_len(cats))

## some width for rect
rate <- matrix(runif(days * cats, min = 0.1, max = 0.5), nrow = cats)

## calculate xbottom, xtop, ybottom, ytop
bp <- rep(bp1, each = cats)
ytop <- apply(dat, 2, cumsum)
# Each rectangle starts where the one below it ends; the lowest starts at 0.
ybot <- rbind(0, ytop[-cats, , drop = FALSE])

# Empty canvas, then one rectangle and one label per category and day.
plot(extendrange(bp1), c(0, 1), type = "n", axes = FALSE, ann = FALSE)
rect(bp - rate, ybot, bp + rate, ytop, col = seq_len(cats))
text(bp, (ytop + ybot) / 2, LETTERS[seq_len(cats)])
axis(1, bp1, labels = format(Sys.Date() + seq_len(days), "%d %b %Y"), lwd = 0)
axis(2)
Probably not very useful, but you can invert the color you are plotting so that you can actually see the labels:
#' Invert colours
#'
#' Converts each colour to RGB with `col2rgb()`, replaces every channel
#' value `v` by `255 - v`, and formats the result as a hex string.
#'
#' @param color Colour specification(s) accepted by `col2rgb()`, e.g. colour
#'   names, hex strings or palette indices.
#' @return A character vector of lower-case `"#rrggbb"` strings, one per
#'   element of `color`; `character(0)` for empty input.
inv_col <- function(color) {
  inverted <- 255 - col2rgb(color)
  # rgb() wants one colour per row and returns upper-case hex digits.
  tolower(rgb(t(inverted), maxColorValue = 255))
}
# Inverse of every colour in the active palette (output as posted with the
# answer; it depends on the palette in use).
inv_col(palette())
# [1] "#ffffff" "#00ffff" "#ff32ff" "#ffff00" "#ff0000" "#00ff00" "#0000ff" "#414141"

# Redraw the rectangles without borders and label each one in the inverse
# of its fill colour so the label stays visible.
cat_ids <- seq_len(cats)
plot(extendrange(bp1), c(0, 1), type = "n", axes = FALSE, ann = FALSE)
rect(bp - rate, ybot, bp + rate, ytop, col = cat_ids, xpd = NA, border = NA)
text(bp, (ytop + ybot) / 2, LETTERS[cat_ids], col = inv_col(cat_ids))
axis(1, bp1, labels = format(Sys.Date() + seq_len(days), "%d %B\n%Y"), lwd = 0)
axis(2)