Question
The code below is the grouped vbar chart example from the Bokeh documentation.
There are a few things I can't understand in this example.
Where does 'cyl_mfr' come from in factor_cmap() and vbar()?
Is 'mpg_mean' the mean of the 'mpg' column? If so,
why doesn't 'mpg_sum' work?
I want to make my own vbar chart like this example.
Code
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure
from bokeh.palettes import Spectral5
from bokeh.sampledata.autompg import autompg_clean as df
from bokeh.transform import factor_cmap

output_file("bars.html")

# The categorical x axis is built from string factors, so the numeric
# columns are cast to str before grouping.
df.cyl = df.cyl.astype(str)
df.yr = df.yr.astype(str)

# Group by a *list* of keys: recent pandas versions read a tuple as one
# single key instead of two column names.
group = df.groupby(['cyl', 'mfr'])

# A ColumnDataSource built from a GroupBy holds per-group summary columns
# (for example 'mpg_mean') plus a 'cyl_mfr' column of (cyl, mfr) labels.
source = ColumnDataSource(group)

# end=1 restricts the colour lookup to the first level of each 'cyl_mfr'
# factor, i.e. bars are coloured by cylinder count only.
index_cmap = factor_cmap('cyl_mfr', palette=Spectral5,
                         factors=sorted(df.cyl.unique()), end=1)

p = figure(plot_width=800, plot_height=300,
           title="Mean MPG by # Cylinders and Manufacturer",
           x_range=group, toolbar_location=None, tools="")

p.vbar(x='cyl_mfr', top='mpg_mean', width=1, source=source,
       line_color="white", fill_color=index_cmap)

p.y_range.start = 0
p.x_range.range_padding = 0.05
p.xgrid.grid_line_color = None
p.xaxis.axis_label = "Manufacturer grouped by # Cylinders"
p.xaxis.major_label_orientation = 1.2
p.outline_line_color = None

# "@name" in a tooltip is replaced by the value of that data source column.
p.add_tools(HoverTool(tooltips=[("MPG", "@mpg_mean"),
                                ("Cyl, Mfr", "@cyl_mfr")]))

show(p)
The `df.groupby(...)` call on 'cyl' and 'mfr' produces a <pandas.core.groupby.DataFrameGroupBy object at 0x0xxx>. If you pass this object to a ColumnDataSource, Bokeh does a lot of magic and already calculates a number of summary statistics for you:
df.columns
Index(['mpg', 'cyl', 'displ', 'hp', 'weight', 'accel', 'yr', 'origin', 'name', 'mfr'], dtype='object')
source.column_names
['accel_count', 'accel_mean', 'accel_std', 'accel_min',
'accel_25%', 'accel_50%', 'accel_75%', 'accel_max', 'displ_count',
'displ_mean', 'displ_std', 'displ_min', 'displ_25%', 'displ_50%',
'displ_75%', 'displ_max', 'hp_count', 'hp_mean', 'hp_std',
'hp_min', 'hp_25%', 'hp_50%', 'hp_75%', 'hp_max', 'mpg_count',
'mpg_mean', 'mpg_std', 'mpg_min', 'mpg_25%', 'mpg_50%',
'mpg_75%', 'mpg_max', 'weight_count', 'weight_mean', 'weight_std',
'weight_min', 'weight_25%', 'weight_50%', 'weight_75%',
'weight_max', 'yr_count', 'yr_mean', 'yr_std', 'yr_min',
'yr_25%', 'yr_50%', 'yr_75%', 'yr_max', 'cyl_mfr']
cyl_mfr holds the labels of the two columns you grouped by, combined. In source this has become a column of tuples.
mpg_sum is not calculated. If you want the sum, you will need to calculate it yourself.
Related
I just discovered Bokeh recently, and I try to display a legend for each day of week (represented by 'startdate_dayweek'). The legend should contain the color for each row corresponding to each day.
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.io import output_file
from bokeh.palettes import Set1_7

output_file("conso_daily.html")

treatcriteria_data_global = pd.read_csv(r"treatcriteria_evolution.csv", sep=';')

# Sum per (week, weekday), then pivot so that each weekday becomes a column
# and each week a row. Keyword arguments are used for pivot() because
# positional arguments are no longer accepted by recent pandas versions.
final_global_data = (
    treatcriteria_data_global
    .groupby(['startdate_weekyear', 'startdate_dayweek'], as_index=False)
    .sum()
    .pivot(index='startdate_weekyear', columns='startdate_dayweek')
    .fillna(0)
)

numlines = len(final_global_data.columns)
palette = Set1_7[0:numlines]

# multi_line expects one x sequence per line; every line shares the week index.
ts_list_of_list = [final_global_data.index for _ in range(numlines)]
# Transpose so that each inner list holds the values of one column (one line).
vals_list_of_list = final_global_data.values.T.tolist()

p = figure(width=500, height=300)
p.left[0].formatter.use_scientific = False

# NOTE: the legend argument below is the call the question is about; it is
# left as posted.
p.multi_line(ts_list_of_list, vals_list_of_list,
             legend='startdate_dayweek',
             line_color=palette,
             line_width=4)

show(p)
But I don't have the expected result in the legend:
How to have the legend for each day? Is the problem due to the fact that I created a MultiIndex table? Thanks.
The multi_line() function accepts either the legend_field or the legend_group parameter. Both work very well for your use case if you use a ColumnDataSource as the source. Keep in mind that an error is raised if you pass both parameters at the same time.
Minimal Example
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource

output_notebook()

# One entry per line: x values, y values, legend text and line colour.
line_data = {
    'xs': [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]],
    'ys': [[1, 2, 3, 4, 5], [1, 1, 1, 1, 5], [5, 4, 3, 2, 1]],
    'legend': ['red', 'green', 'blue'],
    'line_color': ['red', 'green', 'blue'],
}
source = ColumnDataSource(line_data)

p = figure(width=500, height=300)

# legend_field names the source column whose values label the lines.
p.multi_line(
    xs='xs',
    ys='ys',
    legend_field='legend',
    line_color='line_color',
    source=source,
    line_width=4,
)

show(p)
Output
I have a dataframe that details sales of various product categories vs. time. I'd like to make a "line and marker" plot of sales vs. time, per category. To my surprise, this appears to be very difficult in Bokeh.
The scatter plot is easy. But then trying to overplot a line of sales vs. date with the same source (so I can update both scatter and line plots in one go when the source updates) and in such a way that the colors of the line match the colors of the scatter plot markers proves near impossible.
Minimal reproducible example with contrived data:
import pandas as pd

from bokeh.io import output_notebook
from bokeh.io import output_file, show
# ColumnDataSource is used below, so it has to be imported.
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure

df = pd.DataFrame({'Date': ['2020-01-01', '2020-01-02', '2020-01-01', '2020-01-02'],
                   'Product Category': ['shoes', 'shoes', 'grocery', 'grocery'],
                   'Sales': [100, 180, 21, 22],
                   'Colors': ['red', 'red', 'green', 'green']})
df['Date'] = pd.to_datetime(df['Date'])

output_notebook()

source = ColumnDataSource(df)
plot = figure(x_axis_type="datetime", plot_width=800, toolbar_location=None)
plot.scatter(x="Date", y="Sales", size=15, source=source,
             fill_color="Colors", fill_alpha=0.5,
             line_color="Colors", legend="Product Category")

# The frame does not change inside the loop, so it is built once.
tmp = source.to_df()
for cat in set(source.data['Product Category']):
    # Colour of the first row that belongs to this category.
    col = tmp[tmp['Product Category'] == cat]['Colors'].values[0]
    # NOTE: the whole, unfiltered source is passed for every category, so
    # each line runs through all rows. This is the behaviour the question
    # asks about and it is left as posted.
    plot.line(x="Date", y="Sales", source=source, line_color=col)

show(plot)
Here's what it looks like, which is clearly wrong:
Here's what I want and don't know how to make:
Can Bokeh not make such plots, where scatter markers and lines have the same color per category, with a legend?
With Bokeh it is often helpful to first think about the visualisation you want and then structure the data source accordingly. You want two lines, one per category, where the x axis is time and the y axis is the sales. A natural way to structure your data source is then the following:
import pandas as pd

from bokeh.io import output_notebook
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure

output_notebook()

# Wide layout: one row per date and one sales column per category.
df = pd.DataFrame({'Date': ['2020-01-01', '2020-01-02'],
                   'Shoe Sales': [100, 180],
                   'Grocery Sales': [21, 22]})
# The figure uses a datetime x axis, so the date strings must be converted.
df['Date'] = pd.to_datetime(df['Date'])

source = ColumnDataSource(df)
plot = figure(x_axis_type="datetime", plot_width=800, toolbar_location=None)

categories = ["Shoe Sales", "Grocery Sales"]
colors = {"Shoe Sales": "red", "Grocery Sales": "green"}

for category in categories:
    # legend_label shows the literal category name; a bare legend= value that
    # matches a column name would be looked up as a field instead.
    plot.scatter(x="Date", y=category, size=15, source=source,
                 fill_color=colors[category], legend_label=category)
    plot.line(x="Date", y=category, source=source, line_color=colors[category])

show(plot)
The solution is to group your data. Then you can plot a line for each group.
Minimal Example
import pandas as pd
from bokeh.plotting import figure, show, output_notebook

output_notebook()

df = pd.DataFrame({'Date': ['2020-01-01', '2020-01-02', '2020-01-01', '2020-01-02'],
                   'Product Category': ['shoes', 'shoes', 'grocery', 'grocery'],
                   'Sales': [100, 180, 21, 22],
                   'Colors': ['red', 'red', 'green', 'green']})
df['Date'] = pd.to_datetime(df['Date'])

plot = figure(x_axis_type="datetime",
              plot_width=400,
              plot_height=400,
              toolbar_location=None)

# Markers for all rows at once; colours and legend entries come from columns.
plot.scatter(x="Date",
             y="Sales",
             size=15,
             source=df,
             fill_color="Colors",
             fill_alpha=0.5,
             line_color="Colors",
             legend_field="Product Category")

# One line per product category. Grouping by category rather than by colour
# keeps two categories apart even if they happen to share a colour.
# sort=False keeps the categories in their order of first appearance.
for _, rows in df.groupby('Product Category', sort=False):
    plot.line(x="Date", y="Sales", source=rows,
              line_color=rows['Colors'].iloc[0])

show(plot)
Output
I want to add labels with the values above the bars, as in "How to add data labels to a bar chart in Bokeh?", but I don't know how to do it. My code looks different from the other examples; it works, but maybe it is not the right way to do it.
My code:
# Question snippet: nested categorical bar chart with a LabelSet that is meant
# to print each bar's value. The code is kept exactly as posted.
from bokeh.io import export_png
from bokeh.io import output_file, show
from bokeh.palettes import Spectral5
from bokeh.plotting import figure
from bokeh.sampledata.autompg import autompg_clean as df
from bokeh.transform import factor_cmap
from bokeh.models import ColumnDataSource, ranges, LabelSet, Label
import pandas as pd
# Note that the 'count' values are strings, not numbers.
d = {'lvl': ["lvl1", "lvl2", "lvl2", "lvl3"],
'feature': ["test1", "test2","test3","test4"],
'count': ["5", "20","8", "90"]}
dfn = pd.DataFrame(data=d)
# Source built from the plain frame; it is used only by the LabelSet below.
sourceframe = ColumnDataSource(data=dfn)
# The GroupBy object is used both as the figure's x_range and as the vbar source.
groupn = dfn.groupby(by=['lvl', 'feature'])
index_cmapn = factor_cmap('lvl_feature', palette=Spectral5, factors=sorted(dfn.lvl.unique()), end=1)
pn = figure(plot_width=800, plot_height=300, title="Count",x_range=groupn, toolbar_location=None)
# The labels take their x position from the 'feature' column of sourceframe,
# whereas the bars below are positioned by 'lvl_feature'.
labels = LabelSet(x='feature', y='count', text='count', level='glyph',x_offset=0, y_offset=5, source=sourceframe, render_mode='canvas',)
pn.vbar(x='lvl_feature', top="count_top" ,width=1, source=groupn,line_color="white", fill_color=index_cmapn, )
pn.y_range.start = 0
pn.x_range.range_padding = 0.05
pn.xgrid.grid_line_color = None
pn.xaxis.axis_label = "levels"
pn.xaxis.major_label_orientation = 1.2
pn.outline_line_color = None
pn.add_layout(labels)
export_png(pn, filename="color.png")
I think it has something to do with my dfn.groupby(by=['lvl', 'feature']) and the (probably wrong) sourceframe = ColumnDataSource(data=dfn).
The plot at this moment:
You can add the groups names in the initial dictionary like this:
d = {'lvl': ["lvl1", "lvl2", "lvl2", "lvl3"],
'feature': ["test1", "test2","test3","test4"],
'count': ["5", "20","8", "90"],
'groups': [('lvl1', 'test1'), ('lvl2', 'test2'), ('lvl2', 'test3'), ('lvl3', 'test4')]}
And then call LabelSet using as x values the groups.
labels = LabelSet(x='groups', y='count', text='count', level='glyph',x_offset=20, y_offset=0, source=sourceframe, render_mode='canvas',)
In this way the labels appear. Note that I played a bit with the offset to check if that was the problem, you can fix that manually.
Here is a snippet plotting some vBars (jupyter notebook):
import random

from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool, FactorRange, Range1d
from bokeh.models.glyphs import VBar
from bokeh.io import show, output_notebook

# data
data = {'x': [], 'y': [], 'z': []}
for i in range(1, 10+1):
    data['x'].append(i)
    data['y'].append(random.randint(1, 100))
    data['z'].append(random.uniform(1.00, 1000.00))

source = ColumnDataSource(data)

# NOTE: the factors are strings while data['x'] holds ints. This mismatch is
# what the question is about and it is left as posted.
xdr = FactorRange(factors=[str(x) for x in data['x']])
# Leave 50% headroom above the tallest bar.
ydr = Range1d(start=0, end=max(data['y'])*1.5)

f = figure(x_range=xdr, y_range=ydr, plot_width=1000, plot_height=300, tools='',
           toolbar_location='above', title='title', outline_line_color='gray')

glyph = VBar(x='x', top='y', bottom=0,
             width=0.8, fill_color='blue')
f.add_glyph(source, glyph)

# "$x" is the cursor's x coordinate; "@name" reads a data source column.
f.add_tools(HoverTool(
    tooltips=[
        ('time', '$x{0}'),
        ('value', '@' + 'y' + '{0}'),
        ('money', '@z')
    ],
    mode='vline'
))

output_notebook()
show(f)
After passing the x_range && y_range, the vertical bars misalign with the ticker position:-
In normal case without the x_range && y_range, it works fine:-
I wonder what is the parameter governing the vbar position? Why they 'moved' after receiving custom ticker names?
The bars are misaligned because of the FactorRange; I am not exactly sure why. I replaced it with a numeric range built from the min and max values of the ColumnDataSource, and this works fine.
import random

from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool, FactorRange, Range1d
from bokeh.models.glyphs import VBar
from bokeh.io import show

# data
data = {'x': [], 'y': [], 'z': []}
for i in range(1, 10+1):
    data['x'].append(i)
    data['y'].append(random.randint(1, 100))
    data['z'].append(random.uniform(1.00, 1000.00))

source = ColumnDataSource(data)

# Leave 50% headroom above the tallest bar.
ydr = Range1d(start=0, end=max(data['y'])*1.5)

# Numeric x range padded by half a unit on each side, so the first and last
# bars (width 0.8) are fully visible.
f = figure(x_range=(min(source.data['x'])-0.5, max(source.data['x'])+0.5),
           y_range=ydr, plot_width=1000, plot_height=300, tools='',
           toolbar_location='above', title='title', outline_line_color='gray')

glyph = VBar(x='x', top='y', bottom=0,
             width=0.8, fill_color='blue')
f.add_glyph(source, glyph)

# "$x" is the cursor's x coordinate; "@name" reads a data source column.
f.add_tools(HoverTool(
    tooltips=[
        ('time', '$x{0}'),
        ('value', '@' + 'y' + '{0}'),
        ('money', '@z')
    ],
    mode='vline'
))

show(f)
I also came across this issue and found the cause:
Your data suggests a numeric x axis, while the FactorRange sets up a categorical axis (its factors are strings).
If you want a categorical axis, your data needs to be adjusted accordingly:
data['x'].append(str(i))
instead of
data['x'].append(i)
After seeing the capabilities of Bokeh I started working with it. Now I am trying to make a Vbar with my dataset.
my dataset (10 rows)
dataset
I have read the tutorial quite a number of times and used the example provided by the official documentation:
https://hub.mybinder.org/user/bokeh-bokeh-notebooks-ate6xt0k/notebooks/tutorial/07%20-%20Bar%20and%20Categorical%20Data%20Plots.ipynb
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral6
# figure and show are used below, so they have to be imported as well.
from bokeh.plotting import figure, show

fruits = ['Apples', 'Pears', 'Nectarines', 'Plums', 'Grapes', 'Strawberries']
counts = [5, 3, 4, 2, 4, 6]

# One row per fruit: name, count and bar colour.
source = ColumnDataSource(data=dict(fruits=fruits, counts=counts, color=Spectral6))

# x_range receives the list of category names that make up the axis.
p = figure(x_range=fruits, plot_height=250, y_range=(0, 9), title="Fruit Counts")

# legend_field creates one legend entry per value of the 'fruits' column.
p.vbar(x='fruits', top='counts', width=0.9, color='color',
       legend_field="fruits", source=source)

p.xgrid.grid_line_color = None
p.legend.orientation = "horizontal"
p.legend.location = "top_center"

show(p)
This I tried to replicate with my own dataset.
# Question snippet, kept exactly as posted. top_ten_start, figure and show are
# not defined or imported within this snippet.
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral6
source = ColumnDataSource(top_ten_start)
# x_range is given the plain string 'Bank_name' here, not a sequence of
# category names as in the tutorial example.
p = figure(x_range='Bank_name', plot_height=250, y_range=(0, 90), title="BAnks")
p.vbar(x='Bank_name', top='Tier_1_ratio', width=0.9, legend="test", source=source)
p.xgrid.grid_line_color = None
p.legend.orientation = "horizontal"
p.legend.location = "top_center"
show(p)
I expected to see a bar chart as shown on the tutorial but nothing is plotting.
I thought by replacing the input of the "x_range", "plot" and "x" it would be enough to work.
Perhaps the following information would help:
These are the dtypes:
Country_code object
Bank_name object
Tier_1_ratio float64
dtype: object
x_range expects a list of categorical values but you supplied a string. This would be okay if it was a glyph and you were using a source, but this is not a glyph. I changed this variable to source.data['Bank_name'] so it uses the bank names in your ColumnDataSource.
from itertools import cycle, islice
from math import pi

from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show
from bokeh.palettes import Spectral7
import pandas as pd

top_ten_start = pd.read_csv('top_ten_start.csv')

# Repeat the 7-colour palette so there is exactly one colour per row;
# assigning the bare palette fails when the row count is not 7.
top_ten_start['color'] = list(islice(cycle(Spectral7), len(top_ten_start)))

source = ColumnDataSource(top_ten_start)

# x_range needs the actual category values, not the column name.
p = figure(x_range=list(source.data['Bank_name']), plot_height=750,
           y_range=(0, 90), title="Banks")

# legend_field creates one legend entry per value of the 'Bank_name' column.
p.vbar(x='Bank_name', top='Tier_1_ratio', width=0.9,
       legend_field='Bank_name', source=source, color='color')

p.xgrid.grid_line_color = None
# Numeric orientations are in radians, so 45 degrees is pi / 4.
p.xaxis.major_label_orientation = pi / 4

show(p)