I was able to create two linked plots using holoviews + bokeh backend, basically following this code example.
Here's an example of code from the reference:
import pandas as pd
import numpy as np
import holoviews as hv
import seaborn as sns
from holoviews import opts
hv.extension('bokeh', width=90)
# Declare dataset
df = sns.load_dataset('tips')
df = df[['total_bill', 'tip', 'size']]
# Declare HeatMap
corr = df.corr()
heatmap = hv.HeatMap((corr.columns, corr.index, corr))
# Declare Tap stream with heatmap as source and initial values
posxy = hv.streams.Tap(source=heatmap, x='total_bill', y='tip')
# Define function to compute histogram based on tap location
def tap_histogram(x, y):
m, b = np.polyfit(df[x], df[y], deg=1)
x_data = np.linspace(df.tip.min(), df.tip.max())
y_data = m*x_data + b
right = (hv.Curve((x_data, y_data), x, y)
* hv.Scatter((df[x], df[y]), x, y))
right.opts(opts.Scatter(
height=400, width=400, color='red', ylim=(0, 100),
framewise=True, tools=['hover']))
return right
tap_dmap = hv.DynamicMap(tap_histogram, streams=[posxy])
(heatmap + tap_dmap).opts(
opts.HeatMap(tools=['tap', 'hover'],
height=400, width=400, toolbar='above'),
opts.Curve(framewise=True))
Now, I wanna create a hover tool specifying the different parameters on the dependent plot.
So far I am only being able to use the default hover (.opts(tools['hover'])) as in the code above.
When I try to build a custom hover to dynamically change the fields based on x and y streamed values, it does not update the hover after tapping on the heatmap. It only keeps the initial values of x and y.
Here's an example of my current code:
Try to tap in total_bil x size, for example.
import pandas as pd
import numpy as np
import holoviews as hv
import seaborn as sns
from holoviews import opts
from bokeh.models import HoverTool
hv.extension('bokeh', width=90)
# Declare dataset
df = sns.load_dataset('tips')
df = df[['total_bill', 'tip', 'size']]
# Declare HeatMap
corr = df.corr()
heatmap = hv.HeatMap((corr.columns, corr.index, corr))
# Declare Tap stream with heatmap as source and initial values
posxy = hv.streams.Tap(source=heatmap, x='total_bill', y='tip')
# Define function to compute histogram based on tap location
def tap_histogram(x, y):
m, b = np.polyfit(df[x], df[y], deg=1)
x_data = np.linspace(df.tip.min(), df.tip.max())
y_data = m*x_data + b
right = (hv.Curve((x_data, y_data), x, y)
* hv.Scatter((df[x], df[y]), x, y))
tooltips = [(x, '#'+x),
(y, '#'+y)
]
hover = HoverTool(tooltips=tooltips)
right.opts(opts.Scatter(
height=400, width=400, color='red', ylim=(0, 100),
framewise=True, tools=[hover]))
return right
tap_dmap = hv.DynamicMap(tap_histogram, streams=[posxy])
(heatmap + tap_dmap).opts(
opts.HeatMap(tools=['tap', 'hover'],
height=400, width=400, toolbar='above'),
opts.Curve(framewise=True))
Related
I'm getting this error:
TypeError: Object of type Interval is not JSON serializable
Here is my code.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from bokeh.io import output_file, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.models import NumeralTickFormatter
def construct_labels(start, end):
labels = []
for index, x in enumerate(start):
y = end[index]
labels.append('({}, {}]'.format(x, y))
return labels
values = {'Length': np.random.uniform(0, 4, 10)}
df = pd.DataFrame(values, columns=['Length'])
bin_step_size = 0.5
# List of bin points.
p_bins = np.arange(0, (df['Length'].max() + bin_step_size), bin_step_size)
# Reduce the tail to create the left side bounds.
p_left_limits = p_bins[:-1].copy()
# Cut the head to create the right side bounds.
p_right_limits = np.delete(p_bins, 0)
# Create the bins.
p_range_bins = pd.IntervalIndex.from_arrays(p_left_limits, p_right_limits)
# Create labels.
p_range_labels = construct_labels(p_left_limits, p_right_limits)
p_ranges_binned = pd.cut(
df['Length'],
p_range_bins,
labels=p_range_labels,
precision=0,
include_lowest=True)
out = p_ranges_binned
counts = out.value_counts(sort=False)
total_element_count = len(df.index)
foo = pd.DataFrame({'bins': counts.index, 'counts': counts})
foo.reset_index(drop=True, inplace=True)
foo['percent'] = foo['counts'].apply(lambda x: x / total_element_count)
foo['percent_full'] = foo['counts'].apply(lambda x: x / total_element_count * 100)
bin_labels = p_range_labels
# Data Container
source = ColumnDataSource(dict(
bins=foo['bins'],
percent=foo['percent'],
count=foo['counts'],
labels=pd.DataFrame({'labels': bin_labels})
))
p = figure(x_range=bin_labels, plot_height=600, plot_width=1200, title="Range Counts",
toolbar_location=None, tools="")
p.vbar(x='labels', top='percent', width=0.9, source=source)
p.yaxis[0].formatter = NumeralTickFormatter(format="0.0%")
p.xaxis.major_label_orientation = math.pi / 2
p.xgrid.grid_line_color = None
p.y_range.start = 0
output_file("bars.html")
show(p)
The error comes from here:
source = ColumnDataSource(dict(
bins=foo['bins'],
percent=foo['percent'],
count=foo['counts'],
labels=pd.DataFrame({'labels': bin_labels})
))
The bins you passed in is a interval type that cannot be JSON serialized.
After review your code, this bins variable is not used in your plotting, so you can change it to:
source = ColumnDataSource(dict(
percent=foo['percent'],
count=foo['counts'],
labels=bin_labels
))
Notice that I also changed your labels to bin_labels, which is a list and ColumnDataSource can use list as input. But you may want to further format these labels, as right now they are just like
['(0.0, 0.5]',
'(0.5, 1.0]',
'(1.0, 1.5]',
'(1.5, 2.0]',
'(2.0, 2.5]',
'(2.5, 3.0]',
'(3.0, 3.5]',
'(3.5, 4.0]']
You might want to format them to something prettier.
After this small change you should be able to see your bar graph:
I would like to produce a heatmap in Python, similar to the one shown, where the size of the circle indicates the size of the sample in that cell. I looked in seaborn's gallery and couldn't find anything, and I don't think I can do this with matplotlib.
It's the inverse. While matplotlib can do pretty much everything, seaborn only provides a small subset of options.
So using matplotlib, you can plot a PatchCollection of circles as shown below.
Note: You could equally use a scatter plot, but since scatter dot sizes are in absolute units it would be rather hard to scale them into the grid.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
N = 10
M = 11
ylabels = ["".join(np.random.choice(list("PQRSTUVXYZ"), size=7)) for _ in range(N)]
xlabels = ["".join(np.random.choice(list("ABCDE"), size=3)) for _ in range(M)]
x, y = np.meshgrid(np.arange(M), np.arange(N))
s = np.random.randint(0, 180, size=(N,M))
c = np.random.rand(N, M)-0.5
fig, ax = plt.subplots()
R = s/s.max()/2
circles = [plt.Circle((j,i), radius=r) for r, j, i in zip(R.flat, x.flat, y.flat)]
col = PatchCollection(circles, array=c.flatten(), cmap="RdYlGn")
ax.add_collection(col)
ax.set(xticks=np.arange(M), yticks=np.arange(N),
xticklabels=xlabels, yticklabels=ylabels)
ax.set_xticks(np.arange(M+1)-0.5, minor=True)
ax.set_yticks(np.arange(N+1)-0.5, minor=True)
ax.grid(which='minor')
fig.colorbar(col)
plt.show()
Here's a possible solution using Bokeh Plots:
import pandas as pd
from bokeh.palettes import RdBu
from bokeh.models import LinearColorMapper, ColumnDataSource, ColorBar
from bokeh.models.ranges import FactorRange
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
import numpy as np
output_notebook()
d = dict(x = ['A','A','A', 'B','B','B','C','C','C','D','D','D'],
y = ['B','C','D', 'A','C','D','B','D','A','A','B','C'],
corr = np.random.uniform(low=-1, high=1, size=(12,)).tolist())
df = pd.DataFrame(d)
df['size'] = np.where(df['corr']<0, np.abs(df['corr']), df['corr'])*50
#added a new column to make the plot size
colors = list(reversed(RdBu[9]))
exp_cmap = LinearColorMapper(palette=colors,
low = -1,
high = 1)
p = figure(x_range = FactorRange(), y_range = FactorRange(), plot_width=700,
plot_height=450, title="Correlation",
toolbar_location=None, tools="hover")
p.scatter("x","y",source=df, fill_alpha=1, line_width=0, size="size",
fill_color={"field":"corr", "transform":exp_cmap})
p.x_range.factors = sorted(df['x'].unique().tolist())
p.y_range.factors = sorted(df['y'].unique().tolist(), reverse = True)
p.xaxis.axis_label = 'Values'
p.yaxis.axis_label = 'Values'
bar = ColorBar(color_mapper=exp_cmap, location=(0,0))
p.add_layout(bar, "right")
show(p)
One option is to use matplotlib's scatter plots with legends and grid. You can specify size of those circles with specifying the scales. You can also change the color of each circle. You should somehow specify X,Y values so that the circles sit straight on lines. This is an example I got from here:
volume = np.random.rayleigh(27, size=40)
amount = np.random.poisson(10, size=40)
ranking = np.random.normal(size=40)
price = np.random.uniform(1, 10, size=40)
fig, ax = plt.subplots()
# Because the price is much too small when being provided as size for ``s``,
# we normalize it to some useful point sizes, s=0.3*(price*3)**2
scatter = ax.scatter(volume, amount, c=ranking, s=0.3*(price*3)**2,
vmin=-3, vmax=3, cmap="Spectral")
# Produce a legend for the ranking (colors). Even though there are 40 different
# rankings, we only want to show 5 of them in the legend.
legend1 = ax.legend(*scatter.legend_elements(num=5),
loc="upper left", title="Ranking")
ax.add_artist(legend1)
# Produce a legend for the price (sizes). Because we want to show the prices
# in dollars, we use the *func* argument to supply the inverse of the function
# used to calculate the sizes from above. The *fmt* ensures to show the price
# in dollars. Note how we target at 5 elements here, but obtain only 4 in the
# created legend due to the automatic round prices that are chosen for us.
kw = dict(prop="sizes", num=5, color=scatter.cmap(0.7), fmt="$ {x:.2f}",
func=lambda s: np.sqrt(s/.3)/3)
legend2 = ax.legend(*scatter.legend_elements(**kw),
loc="lower right", title="Price")
plt.show()
Output:
I don't have enough reputation to comment on Delenges' excellent answer, so I'll leave my comment as an answer instead:
R.flat doesn't order the way we need it to, so the circles assignment should be:
circles = [plt.Circle((j,i), radius=R[j][i]) for j, i in zip(x.flat, y.flat)]
Here is an easy example to plot circle_heatmap.
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.datasets import load_wine as load_data
from psynlig import plot_correlation_heatmap
plt.style.use('seaborn-talk')
data_set = load_data()
data = pd.DataFrame(data_set['data'], columns=data_set['feature_names'])
#data = df_corr_selected
kwargs = {
'heatmap': {
'vmin': -1,
'vmax': 1,
'cmap': 'viridis',
},
'figure': {
'figsize': (14, 10),
},
}
plot_correlation_heatmap(data, bubble=True, annotate=False, **kwargs)
plt.show()
I am trying to plot a few points on a graph, similarly to a heat map.
Sample code (adapted from the heat map section here):
import pandas as pd
from bokeh.io import output_notebook, show
from bokeh.models import BasicTicker, ColorBar, ColumnDataSource, LinearColorMapper, PrintfTickFormatter
from bokeh.plotting import figure
from bokeh.transform import transform
import numpy as np
# change this if you don't run it on a Jupyter Notebook
output_notebook()
testx = np.random.randint(0,10,10)
testy = np.random.randint(0,10,10)
npdata = np.stack((testx,testy), axis = 1)
hist, bins = np.histogramdd(npdata, normed = False, bins = (10,10), range=((0,10),(0,10)))
data = pd.DataFrame(hist, columns = [str(x) for x in range(10)])
data.columns.name = 'y'
data['x'] = [str(x) for x in range(10)]
data = data.set_index('x')
df = pd.DataFrame(data.stack(), columns=['present']).reset_index()
source = ColumnDataSource(df)
colors = ['lightblue', "yellow"]
mapper = LinearColorMapper(palette=colors, low=df.present.min(), high=df.present.max())
p = figure(plot_width=400, plot_height=400, title="test circle map",
x_range=list(data.index), y_range=list((data.columns)),
toolbar_location=None, tools="", x_axis_location="below")
p.circle(x="x", y="y", size=20, source=source,
line_color=None, fill_color=transform('present', mapper))
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "10pt"
p.axis.major_label_standoff = 10
p.xaxis.major_label_orientation = 0
show(p)
That returns:
Now, as you can see, the grid lines are centered on the points(circles), and I would like, instead to have the circles enclosed in a square created by the lines.
I went through this to see if I could find information on how to offset the grid lines by 0.5 (that would have worked), but I was not able to.
There's nothing built into Bokeh to accomplish this kind of offsetting of categorical ticks, but you can write a custom extension to do it:
CS_CODE = """
import {CategoricalTicker} from "models/tickers/categorical_ticker"
export class MyTicker extends CategoricalTicker
type: "MyTicker"
get_ticks: (start, end, range, cross_loc) ->
ticks = super(start, end, range, cross_loc)
# shift the default tick locations by half a categorical bin width
ticks.major = ([x, 0.5] for x in ticks.major)
return ticks
"""
class MyTicker(CategoricalTicker):
__implementation__ = CS_CODE
p.xgrid.ticker = MyTicker()
p.ygrid.ticker = MyTicker()
Note that Bokeh assumes CoffeeScript by default when the code is just a string, but it's possible to use pure JS or TypeScript as well. Adding this to your code yields:
Please note the comment about output_notebook you must call it (possibly again, if you have called it previously) after the custom model is defined, due to #6107
I need to change the colors of the boxplot drawn using pandas utility function. I can change most properties using the color argument but can't figure out how to change the facecolor of the box. Someone knows how to do it?
import pandas as pd
import numpy as np
data = np.random.randn(100, 4)
labels = list("ABCD")
df = pd.DataFrame(data, columns=labels)
props = dict(boxes="DarkGreen", whiskers="DarkOrange", medians="DarkBlue", caps="Gray")
df.plot.box(color=props)
While I still recommend seaborn and raw matplotlib over the plotting interface in pandas, it turns out that you can pass patch_artist=True as a kwarg to df.plot.box, which will pass it as a kwarg to df.plot, which will pass is as a kwarg to matplotlib.Axes.boxplot.
import pandas as pd
import numpy as np
data = np.random.randn(100, 4)
labels = list("ABCD")
df = pd.DataFrame(data, columns=labels)
props = dict(boxes="DarkGreen", whiskers="DarkOrange", medians="DarkBlue", caps="Gray")
df.plot.box(color=props, patch_artist=True)
As suggested, I ended up creating a function to plot this, using raw matplotlib.
def plot_boxplot(data, ax):
bp = ax.boxplot(data.values, patch_artist=True)
for box in bp['boxes']:
box.set(color='DarkGreen')
box.set(facecolor='DarkGreen')
for whisker in bp['whiskers']:
whisker.set(color="DarkOrange")
for cap in bp['caps']:
cap.set(color="Gray")
for median in bp['medians']:
median.set(color="white")
ax.axhline(0, color="DarkBlue", linestyle=":")
ax.set_xticklabels(data.columns)
I suggest using df.plot.box with patch_artist=True and return_type='both' (which returns the matplotlib axes the boxplot is drawn on and a dictionary whose values are the matplotlib Lines of the boxplot) in order to have the best customization possibilities.
For example, given this data:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(
data=np.random.randn(100, 4),
columns=list("ABCD")
)
you can set a specific color for all the boxes:
fig,ax = plt.subplots(figsize=(9,6))
ax,props = df.plot.box(patch_artist=True, return_type='both', ax=ax)
for patch in props['boxes']:
patch.set_facecolor('lime')
plt.show()
you can set a specific color for each box:
colors = ['green','blue','yellow','red']
fig,ax = plt.subplots(figsize=(9,6))
ax,props = df.plot.box(patch_artist=True, return_type='both', ax=ax)
for patch,color in zip(props['boxes'],colors):
patch.set_facecolor(color)
plt.show()
you can easily integrate a colormap:
colors = np.random.randint(0,10, 4)
cm = plt.cm.get_cmap('rainbow')
colors_cm = [cm((c-colors.min())/(colors.max()-colors.min())) for c in colors]
fig,ax = plt.subplots(figsize=(9,6))
ax,props = df.plot.box(patch_artist=True, return_type='both', ax=ax)
for patch,color in zip(props['boxes'],colors_cm):
patch.set_facecolor(color)
# to add colorbar
fig.colorbar(plt.cm.ScalarMappable(
plt.cm.colors.Normalize(min(colors),max(colors)),
cmap='rainbow'
), ax=ax, cmap='rainbow')
plt.show()
I have a series of data that I'm reading in from a tutorial site.
I've managed to plot the distribution of the TV column in that data, however I also want to overlay a normal distribution curve with StdDev ticks on a second x-axis (so I can compare the two curves). I'm struggling to work out how to do it..
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import matplotlib.mlab as mlab
import math
# read data into a DataFrame
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
# draw distribution curve
h = sorted(data.TV)
hmean = np.mean(h)
hstd = np.std(h)
pdf = stats.norm.pdf(h, hmean, hstd)
plt.plot(h, pdf)
Here is a diagram close to what I'm after, where x is the StdDeviations. All this example needs is a second x axis to show the values of data.TV
Not sure what you really want, but you could probably use second axis like this
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import matplotlib.mlab as mlab
import math
# read data into a DataFrame
data = pd.read_csv('Advertising.csv', index_col=0)
fig, ax1 = plt.subplots()
# draw distribution curve
h = sorted(data.TV)
ax1.plot(h,'b-')
ax1.set_xlabel('TV')
ax1.set_ylabel('Count', color='b')
for tl in ax1.get_yticklabels():
tl.set_color('b')
hmean = np.mean(h)
hstd = np.std(h)
pdf = stats.norm.pdf(h, hmean, hstd)
ax2 = ax1.twinx()
ax2.plot(h, pdf, 'r.')
ax2.set_ylabel('pdf', color='r')
for tl in ax2.get_yticklabels():
tl.set_color('r')
plt.show()
Ok, assuming that you want to plot the distribution of your data, the fitted normal distribution with two x-axes, one way to achieve this is as follows.
Plot the normalized data together with the standard normal distribution. Then use matplotlib's twiny() to add a second x-axis to the plot. Use the same tick positions as the original x-axis on the second axis, but scale the labels so that you get the corresponding original TV values. The result looks like this:
Code
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import matplotlib.mlab as mlab
import math
# read data into a DataFrame
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
h = sorted(data.TV)
hmean = np.mean(h)
hstd = np.std(h)
h_n = (h - hmean) / hstd
pdf = stats.norm.pdf( h_n )
# plot data
f,ax1 = plt.subplots()
ax1.hist( h_n, 20, normed=1 )
ax1.plot( h_n , pdf, lw=3, c='r')
ax1.set_xlim( [h_n.min(), h_n.max()] )
ax1.set_xlabel( r'TV $[\sigma]$' )
ax1.set_ylabel( r'Relative Frequency')
ax2 = ax1.twiny()
ax2.grid( False )
ax2.set_xlim( ax1.get_xlim() )
ax2.set_ylim( ax1.get_ylim() )
ax2.set_xlabel( r'TV' )
ticklocs = ax2.xaxis.get_ticklocs()
ticklocs = [ round( t*hstd + hmean, 2) for t in ticklocs ]
ax2.xaxis.set_ticklabels( map( str, ticklocs ) )