Python Data Visualization (plot, chart) with multiple squares [duplicate]

Python Data Visualization (plot, chart) with multiple squares [duplicate] - python

Something like this:
There is a very good package to do it in R. In python, the best that I could figure out is this, using the squarify package (inspired by a post on how to do treemaps):
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns # just to have better line color and width
import squarify
# for those using jupyter notebooks
%matplotlib inline
df = pd.DataFrame({
'v1': np.ones(100),
'v2': np.random.randint(1, 4, 100)})
df.sort_values(by='v2', inplace=True)
# color scale
cmap = mpl.cm.Accent
mini, maxi = df['v2'].min(), df['v2'].max()
norm = mpl.colors.Normalize(vmin=mini, vmax=maxi)
colors = [cmap(norm(value)) for value in df['v2']]
# figure
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
ax = squarify.plot(df['v1'], color=colors, ax=ax)
ax.set_xticks([])
ax.set_yticks([]);
But when I create not 100 but 200 elements (or other non-square numbers), the squares become misaligned.
Another problem is that if I change v2 to some categorical variable (e.g., a hundred As, Bs, Cs and Ds), I get this error:
could not convert string to float: 'a'
So, could anyone help me with these two questions:
how can I solve the alignment problem with non-square numbers of observations?
how can use categorical variables in v2?
Beyond this, I am really open if there are any other python packages that can create waffle plots more efficiently.

I spent a few days to build a more general solution, PyWaffle.
You can install it through
pip install pywaffle
The source code: https://github.com/gyli/PyWaffle
PyWaffle does not use matshow() method, but builds those squares one by one. That makes it easier for customization. Besides, what it provides is a custom Figure class, which returns a figure object. By updating attributes of the figure, you can basically control everything in the chart.
Some examples:
Colored or transparent background:
import matplotlib.pyplot as plt
from pywaffle import Waffle
data = {'Democratic': 48, 'Republican': 46, 'Libertarian': 3}
fig = plt.figure(
FigureClass=Waffle,
rows=5,
values=data,
colors=("#983D3D", "#232066", "#DCB732"),
title={'label': 'Vote Percentage in 2016 US Presidential Election', 'loc': 'left'},
labels=["{0} ({1}%)".format(k, v) for k, v in data.items()],
legend={'loc': 'lower left', 'bbox_to_anchor': (0, -0.4), 'ncol': len(data), 'framealpha': 0}
)
fig.gca().set_facecolor('#EEEEEE')
fig.set_facecolor('#EEEEEE')
plt.show()
Use icons replacing squares:
data = {'Democratic': 48, 'Republican': 46, 'Libertarian': 3}
fig = plt.figure(
FigureClass=Waffle,
rows=5,
values=data,
colors=("#232066", "#983D3D", "#DCB732"),
legend={'loc': 'upper left', 'bbox_to_anchor': (1, 1)},
icons='child', icon_size=18,
icon_legend=True
)
Multiple subplots in one chart:
import pandas as pd
data = pd.DataFrame(
{
'labels': ['Hillary Clinton', 'Donald Trump', 'Others'],
'Virginia': [1981473, 1769443, 233715],
'Maryland': [1677928, 943169, 160349],
'West Virginia': [188794, 489371, 36258],
},
).set_index('labels')
fig = plt.figure(
FigureClass=Waffle,
plots={
'311': {
'values': data['Virginia'] / 30000,
'labels': ["{0} ({1})".format(n, v) for n, v in data['Virginia'].items()],
'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.05, 1), 'fontsize': 8},
'title': {'label': '2016 Virginia Presidential Election Results', 'loc': 'left'}
},
'312': {
'values': data['Maryland'] / 30000,
'labels': ["{0} ({1})".format(n, v) for n, v in data['Maryland'].items()],
'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.2, 1), 'fontsize': 8},
'title': {'label': '2016 Maryland Presidential Election Results', 'loc': 'left'}
},
'313': {
'values': data['West Virginia'] / 30000,
'labels': ["{0} ({1})".format(n, v) for n, v in data['West Virginia'].items()],
'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.3, 1), 'fontsize': 8},
'title': {'label': '2016 West Virginia Presidential Election Results', 'loc': 'left'}
},
},
rows=5,
colors=("#2196f3", "#ff5252", "#999999"), # Default argument values for subplots
figsize=(9, 5) # figsize is a parameter of plt.figure
)

I've put together a working example, below, which I think meets your needs. Some work is needed to fully generalize the approach, but I think you'll find that it's a good start. The trick was to use matshow() to solve your non-square problem, and to build a custom legend to easily account for categorical values.
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
# Let's make a default data frame with catagories and values.
df = pd.DataFrame({ 'catagories': ['cat1', 'cat2', 'cat3', 'cat4'],
'values': [84911, 14414, 10062, 8565] })
# Now, we define a desired height and width.
waffle_plot_width = 20
waffle_plot_height = 7
classes = df['catagories']
values = df['values']
def waffle_plot(classes, values, height, width, colormap):
# Compute the portion of the total assigned to each class.
class_portion = [float(v)/sum(values) for v in values]
# Compute the number of tiles for each catagories.
total_tiles = width * height
tiles_per_class = [round(p*total_tiles) for p in class_portion]
# Make a dummy matrix for use in plotting.
plot_matrix = np.zeros((height, width))
# Popoulate the dummy matrix with integer values.
class_index = 0
tile_index = 0
# Iterate over each tile.
for col in range(waffle_plot_width):
for row in range(height):
tile_index += 1
# If the number of tiles populated is sufficient for this class...
if tile_index > sum(tiles_per_class[0:class_index]):
# ...increment to the next class.
class_index += 1
# Set the class value to an integer, which increases with class.
plot_matrix[row, col] = class_index
# Create a new figure.
fig = plt.figure()
# Using matshow solves your "non-square" problem.
plt.matshow(plot_matrix, cmap=colormap)
plt.colorbar()
# Get the axis.
ax = plt.gca()
# Minor ticks
ax.set_xticks(np.arange(-.5, (width), 1), minor=True);
ax.set_yticks(np.arange(-.5, (height), 1), minor=True);
# Gridlines based on minor ticks
ax.grid(which='minor', color='w', linestyle='-', linewidth=2)
# Manually constructing a legend solves your "catagorical" problem.
legend_handles = []
for i, c in enumerate(classes):
lable_str = c + " (" + str(values[i]) + ")"
color_val = colormap(float(i+1)/len(classes))
legend_handles.append(mpatches.Patch(color=color_val, label=lable_str))
# Add the legend. Still a bit of work to do here, to perfect centering.
plt.legend(handles=legend_handles, loc=1, ncol=len(classes),
bbox_to_anchor=(0., -0.1, 0.95, .10))
plt.xticks([])
plt.yticks([])
# Call the plotting function.
waffle_plot(classes, values, waffle_plot_height, waffle_plot_width,
plt.cm.coolwarm)
Below is an example of the output this script produced. As you can see, it works fairly well for me, and meets all of your stated needs. Just let me know if it gives you any trouble. Enjoy!

You can use this function for automatic creation of a waffle with simple parameters:
def create_waffle_chart(categories, values, height, width, colormap, value_sign=''):
# compute the proportion of each category with respect to the total
total_values = sum(values)
category_proportions = [(float(value) / total_values) for value in values]
# compute the total number of tiles
total_num_tiles = width * height # total number of tiles
print ('Total number of tiles is', total_num_tiles)
# compute the number of tiles for each catagory
tiles_per_category = [round(proportion * total_num_tiles) for proportion in category_proportions]
# print out number of tiles per category
for i, tiles in enumerate(tiles_per_category):
print (df_dsn.index.values[i] + ': ' + str(tiles))
# initialize the waffle chart as an empty matrix
waffle_chart = np.zeros((height, width))
# define indices to loop through waffle chart
category_index = 0
tile_index = 0
# populate the waffle chart
for col in range(width):
for row in range(height):
tile_index += 1
# if the number of tiles populated for the current category
# is equal to its corresponding allocated tiles...
if tile_index > sum(tiles_per_category[0:category_index]):
# ...proceed to the next category
category_index += 1
# set the class value to an integer, which increases with class
waffle_chart[row, col] = category_index
# instantiate a new figure object
fig = plt.figure()
# use matshow to display the waffle chart
colormap = plt.cm.coolwarm
plt.matshow(waffle_chart, cmap=colormap)
plt.colorbar()
# get the axis
ax = plt.gca()
# set minor ticks
ax.set_xticks(np.arange(-.5, (width), 1), minor=True)
ax.set_yticks(np.arange(-.5, (height), 1), minor=True)
# add dridlines based on minor ticks
ax.grid(which='minor', color='w', linestyle='-', linewidth=2)
plt.xticks([])
plt.yticks([])
# compute cumulative sum of individual categories to match color schemes between chart and legend
values_cumsum = np.cumsum(values)
total_values = values_cumsum[len(values_cumsum) - 1]
# create legend
legend_handles = []
for i, category in enumerate(categories):
if value_sign == '%':
label_str = category + ' (' + str(values[i]) + value_sign + ')'
else:
label_str = category + ' (' + value_sign + str(values[i]) + ')'
color_val = colormap(float(values_cumsum[i])/total_values)
legend_handles.append(mpatches.Patch(color=color_val, label=label_str))
# add legend to chart
plt.legend(
handles=legend_handles,
loc='lower center',
ncol=len(categories),
bbox_to_anchor=(0., -0.2, 0.95, .1)
)

Related

How to generate labelled barplots using seaborn?

I am a bit new to Python. And I am playing with a dummy dataset to get some Python data manipulation practice. Below is the code for generating the dummy data:
d = {
'SeniorCitizen': [0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0] ,
'CollegeDegree': [0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1] ,
'Married': [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ,
'FulltimeJob': [1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,0,0,1,1,0,0,0,1] ,
'DistancefromBranch': [7,9,14,20,21,12,22,25,9,9,9,12,13,14,16,25,27,4,14,14,20,19,15,23,2] ,
'ReversedPayment': [0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0] }
CarWash = pd.DataFrame(data = d)
categoricals = ['SeniorCitizen','CollegeDegree','Married','FulltimeJob','ReversedPayment']
numerical = ['DistancefromBranch']
CarWash[categoricals] = CarWash[categoricals].astype('category')
I am basically struggling with a couple of things:
#1. A stacked barplot with absolute values (like the excel example below)
#2. A stacked barplot with percentage values (like the excel example below)
Below are my target visualizations for # 1 and # 2 using countplot().
#1
#2
For # 1, instead of a stacked barplot, with countplot() I am able to make a clustered barplot, like below, and also the annotation snippet feels more like a workaround rather than being Python elegant.
# Looping through each categorical column and viewing target variable distribution (ReversedPayment) by value
figure, axes = plt.subplots(2,2,figsize = (10,10))
for i,ax in zip(categoricals[:-1],axes.flatten()):
sns.countplot(x= i, hue = 'ReversedPayment', data = CarWash, ax = ax)
for p in ax.patches:
height = np.nan_to_num(p.get_height()) # gets the height of each patch/bar
adjust = np.nan_to_num(p.get_width())/2 # a calculation for adusting the data label later
label_xy = (np.nan_to_num(p.get_x()) + adjust,np.nan_to_num(p.get_height()) + adjust) #x,y coordinates where we want to put our data label
ax.annotate(height,label_xy) # final annotation
For # 2, I tried creating a new data frame housing % values but that felt tedious and error-prone.
I feel an option like stacked = True, proportion = True, axis = 1, annotate = True could have been so useful for countplot() to have.
Are there any other libraries that would be straight-froward and less code-intensive? Any comments or suggestions are welcome.

In this case, I think plotly.express may be more intuitive for you.
import plotly.express as px
df_temp = CarWash.groupby(['SeniorCitizen', 'ReversedPayment'])['DistancefromBranch'].count().reset_index().rename({'DistancefromBranch':'count'}, axis=1)
fig = px.bar(df_temp, x="SeniorCitizen", y="count", color="ReversedPayment", title="SeniorCitizen", text='count')
fig.update_traces(textposition='inside')
fig.show()
Basically, if you want to have more flexibility to adjust your charts, it is hard to avoid writing lots of codes.
I also try using matplotlib and pandas to create a stacked bar chart for percentages. If you are interested in it, you can try it.
sns.set()
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=[12,8], dpi=100)
# Conver the axes matrix to a 1-d array
axes = ax.flatten()
for i, col in enumerate(['SeniorCitizen', 'CollegeDegree', 'Married', 'FulltimeJob']):
# Calculate the number of plots
df_temp = (CarWash.groupby(col)['ReversedPayment']
.value_counts()
.unstack(1).fillna(0)
.rename({0:f'No', 1:f'Yes'})
.rename({0:'No', 1:'Yes'}, axis=1))
df_temp = df_temp / df_temp.sum(axis=0)
df_temp.plot.bar(stacked=True, ax=axes[i])
axes[i].set_title(col, y=1.03, fontsize=16)
rects = axes[i].patches
labels = df_temp.values.flatten()
for rect, label in zip(rects, labels):
if label == 0: continue
axes[i].text(rect.get_x() + rect.get_width() / 2, rect.get_y() + rect.get_height() / 3, '{:.2%}'.format(label),
ha='center', va='bottom', color='white', fontsize=12)
axes[i].legend(title='Reversed\nPayment', bbox_to_anchor=(1.05, 1), loc='upper left', title_fontsize = 10, fontsize=10)
axes[i].tick_params(rotation=0)
plt.tight_layout()
plt.show()

Creating box plots by looping multiple columns

I am trying to create multiple box plot charts for about 5 columns in my dataframe (df_summ):
columns = ['dimension_a','dimension_b']
for i in columns:
sns.set(style = "ticks", palette = "pastel")
box_plot = sns.boxplot(y="measure", x=i,
palette=["m","g"],
data=df_summ_1500_delta)
sns.despine(offset=10, trim=True)
medians = df_summ_1500_delta.groupby([i])['measure'].median()
vertical_offset=df_summ_1500_delta['measure'].median()*-0.5
for xtick in box_plot.get_xticks():
box_plot.text(xtick,medians[xtick] + vertical_offset,medians[xtick],
horizontalalignment='center',size='small',color='blue',weight='semibold')
My only issue is that they aren't be separated on different facets, but rather on top of each other.
Any help on how I can make both on their own separate chart with the x axis being 'dimension a' and the x axis of the second chart being 'dimension b'.

To draw two boxplots next to each other at each x-position, you can use a hue for dimension_a and dimension_b separately. These two columns need to be transformed (with pd.melt()) to "long form".
Here is a some example code starting from generated test data. Note that the order both for the x-values as for the hue-values needs to be enforced to be sure of their exact position. The individual box plots are distributed over a width of 0.8.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
df = pd.DataFrame({'dimension_a': np.random.choice(['hot', 'cold'], 100),
'dimension_b': np.random.choice(['hot', 'cold'], 100),
'measure': np.random.uniform(100, 500, 100)})
df.loc[df['dimension_a'] == 'hot', 'measure'] += 100
df.loc[df['dimension_a'] == 'cold', 'measure'] -= 100
x_order = ['hot', 'cold']
columns = ['dimension_a', 'dimension_b']
df1 = df.melt(value_vars=columns, var_name='dimension', value_name='value', id_vars='measure')
sns.set(style="ticks", palette="pastel")
ax = sns.boxplot(data=df1, x='value', order=x_order, y='measure',
hue='dimension', hue_order=columns, palette=["m", "g"], dodge=True)
ax.set_xlabel('')
sns.despine(offset=10, trim=True)
for col, dodge_dist in zip(columns, np.linspace(-0.4, 0.4, 2 * len(x_order) + 1)[1::2]):
medians = df.groupby([col])['measure'].median()
vertical_offset = df['measure'].median() * -0.5
for x_ind, xtick in enumerate(x_order):
ax.text(x_ind + dodge_dist, medians[xtick] + vertical_offset, f'{medians[xtick]:.2f}',
horizontalalignment='center', size='small', color='blue', weight='semibold')
plt.show()

Heatmap with circles indicating size of population

I would like to produce a heatmap in Python, similar to the one shown, where the size of the circle indicates the size of the sample in that cell. I looked in seaborn's gallery and couldn't find anything, and I don't think I can do this with matplotlib.

It's the inverse. While matplotlib can do pretty much everything, seaborn only provides a small subset of options.
So using matplotlib, you can plot a PatchCollection of circles as shown below.
Note: You could equally use a scatter plot, but since scatter dot sizes are in absolute units it would be rather hard to scale them into the grid.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
N = 10
M = 11
ylabels = ["".join(np.random.choice(list("PQRSTUVXYZ"), size=7)) for _ in range(N)]
xlabels = ["".join(np.random.choice(list("ABCDE"), size=3)) for _ in range(M)]
x, y = np.meshgrid(np.arange(M), np.arange(N))
s = np.random.randint(0, 180, size=(N,M))
c = np.random.rand(N, M)-0.5
fig, ax = plt.subplots()
R = s/s.max()/2
circles = [plt.Circle((j,i), radius=r) for r, j, i in zip(R.flat, x.flat, y.flat)]
col = PatchCollection(circles, array=c.flatten(), cmap="RdYlGn")
ax.add_collection(col)
ax.set(xticks=np.arange(M), yticks=np.arange(N),
xticklabels=xlabels, yticklabels=ylabels)
ax.set_xticks(np.arange(M+1)-0.5, minor=True)
ax.set_yticks(np.arange(N+1)-0.5, minor=True)
ax.grid(which='minor')
fig.colorbar(col)
plt.show()

Here's a possible solution using Bokeh Plots:
import pandas as pd
from bokeh.palettes import RdBu
from bokeh.models import LinearColorMapper, ColumnDataSource, ColorBar
from bokeh.models.ranges import FactorRange
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
import numpy as np
output_notebook()
d = dict(x = ['A','A','A', 'B','B','B','C','C','C','D','D','D'],
y = ['B','C','D', 'A','C','D','B','D','A','A','B','C'],
corr = np.random.uniform(low=-1, high=1, size=(12,)).tolist())
df = pd.DataFrame(d)
df['size'] = np.where(df['corr']<0, np.abs(df['corr']), df['corr'])*50
#added a new column to make the plot size
colors = list(reversed(RdBu[9]))
exp_cmap = LinearColorMapper(palette=colors,
low = -1,
high = 1)
p = figure(x_range = FactorRange(), y_range = FactorRange(), plot_width=700,
plot_height=450, title="Correlation",
toolbar_location=None, tools="hover")
p.scatter("x","y",source=df, fill_alpha=1, line_width=0, size="size",
fill_color={"field":"corr", "transform":exp_cmap})
p.x_range.factors = sorted(df['x'].unique().tolist())
p.y_range.factors = sorted(df['y'].unique().tolist(), reverse = True)
p.xaxis.axis_label = 'Values'
p.yaxis.axis_label = 'Values'
bar = ColorBar(color_mapper=exp_cmap, location=(0,0))
p.add_layout(bar, "right")
show(p)

One option is to use matplotlib's scatter plots with legends and grid. You can specify size of those circles with specifying the scales. You can also change the color of each circle. You should somehow specify X,Y values so that the circles sit straight on lines. This is an example I got from here:
volume = np.random.rayleigh(27, size=40)
amount = np.random.poisson(10, size=40)
ranking = np.random.normal(size=40)
price = np.random.uniform(1, 10, size=40)
fig, ax = plt.subplots()
# Because the price is much too small when being provided as size for ``s``,
# we normalize it to some useful point sizes, s=0.3*(price*3)**2
scatter = ax.scatter(volume, amount, c=ranking, s=0.3*(price*3)**2,
vmin=-3, vmax=3, cmap="Spectral")
# Produce a legend for the ranking (colors). Even though there are 40 different
# rankings, we only want to show 5 of them in the legend.
legend1 = ax.legend(*scatter.legend_elements(num=5),
loc="upper left", title="Ranking")
ax.add_artist(legend1)
# Produce a legend for the price (sizes). Because we want to show the prices
# in dollars, we use the *func* argument to supply the inverse of the function
# used to calculate the sizes from above. The *fmt* ensures to show the price
# in dollars. Note how we target at 5 elements here, but obtain only 4 in the
# created legend due to the automatic round prices that are chosen for us.
kw = dict(prop="sizes", num=5, color=scatter.cmap(0.7), fmt="$ {x:.2f}",
func=lambda s: np.sqrt(s/.3)/3)
legend2 = ax.legend(*scatter.legend_elements(**kw),
loc="lower right", title="Price")
plt.show()
Output:

I don't have enough reputation to comment on Delenges' excellent answer, so I'll leave my comment as an answer instead:
R.flat doesn't order the way we need it to, so the circles assignment should be:
circles = [plt.Circle((j,i), radius=R[j][i]) for j, i in zip(x.flat, y.flat)]

Here is an easy example to plot circle_heatmap.
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.datasets import load_wine as load_data
from psynlig import plot_correlation_heatmap
plt.style.use('seaborn-talk')
data_set = load_data()
data = pd.DataFrame(data_set['data'], columns=data_set['feature_names'])
#data = df_corr_selected
kwargs = {
'heatmap': {
'vmin': -1,
'vmax': 1,
'cmap': 'viridis',
},
'figure': {
'figsize': (14, 10),
},
}
plot_correlation_heatmap(data, bubble=True, annotate=False, **kwargs)
plt.show()

My image will print to PNG but nothing is showing, it does show the image when plot.show is run

My code is as follows. I am using MatPlotLib to create an image, but the image is not rendering in the png. Someone, please fix my code so it will render.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
#Sets the size of the chart
from pylab import rcParams
rcParams['figure.figsize'] = 7, 2
#AverageAnnualMedicalCostPerEE = "{}".format('jpg')
#print AverageAnnualMedicalCostPerEE
#Creates the dataframe
raw_data = {'plan_type': ['Total Annual Cost, Single', 'Total Annual Cost,
Family'],
'Your Plan': [6000, 3000],
'Benchmark': [4800, 1600],
'Region': [1800, 2800],
'Industry': [4900, 1300],
'Size': [5700, 1600],
}
data = ['Your Plan','Benchmark','Region','Industry','Size']
df = pd.DataFrame(raw_data,
columns = ['plan_type','Your Plan', 'Benchmark', 'Region',
'Industry', 'Size'])
#Plots the bars, adding desired colors
ax = df.plot.bar(rot=0, color=['#ffc000',"#305496", '#8ea9db', '#b4c6e7',
'#D9E1F2'],
width = 0.8 )
#Adds data labels to top of bars
for p in ax.patches[0:]:
h = p.get_height()
x = p.get_x()+p.get_width()/2.
if h != 0:
ax.annotate( '$' + "%g" % p.get_height(), xy=(x,h), xytext=(0,4),
rotation=0,
textcoords="offset points", ha="center", va="bottom",
fontsize='small', color='grey')
# Remove Bordering Frame
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_color('#B4C7E7')
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
# Remove Y-axis Labels
ax.axes.get_yaxis().set_visible(False)
#Sets x-axis limits and margins
ax.set_xlim(-0.5, +1.5)
ax.margins(y=0)
# Set Y-Axis Ticks to the Right
ax.yaxis.tick_right()
# Set the Y Axis Limits
ax.set_ylim(0,(df[['Your
Plan','Benchmark','Region','Industry','Size']].max(axis=1).max(axis=0)*1.5))
ax.margins(y=0)
#Adds legend to the top of the page
ax.legend(ncol=len(df.columns), loc="Lower Left", bbox_to_anchor=
(0,1.02,1,0.08),
borderaxespad=0, mode="expand",frameon=False, fontsize='small')
#Add labels to the x-axis
ax.set_xticklabels(df["plan_type"], fontsize='small')
ax.xaxis.set_ticks_position('none')
#shows the plot and prints it to
plt.show()
plt.savefig('AverageAnnualMedicalCostPerEE.png')
So, again I am looking to get a png I can then later import into a table and add to my story. The latter is easy, except the image rendering issue. Please let me know if you can solve this, probably a quick fix.

I think you should change the order of saving the image and showing it, since the figure will be reset after the plt.show(). So you should be able to fix this by either removing the plt.show() command from your code or switch plt.savefig(...) and plt.show().

matplotlib: Group boxplots

Is there a way to group boxplots in matplotlib?
Assume we have three groups "A", "B", and "C" and for each we want to create a boxplot for both "apples" and "oranges". If a grouping is not possible directly, we can create all six combinations and place them linearly side by side. What would be to simplest way to visualize the groupings? I'm trying to avoid setting the tick labels to something like "A + apples" since my scenario involves much longer names than "A".

How about using colors to differentiate between "apples" and "oranges" and spacing to separate "A", "B" and "C"?
Something like this:
from pylab import plot, show, savefig, xlim, figure, \
hold, ylim, legend, boxplot, setp, axes
# function for setting the colors of the box plots pairs
def setBoxColors(bp):
setp(bp['boxes'][0], color='blue')
setp(bp['caps'][0], color='blue')
setp(bp['caps'][1], color='blue')
setp(bp['whiskers'][0], color='blue')
setp(bp['whiskers'][1], color='blue')
setp(bp['fliers'][0], color='blue')
setp(bp['fliers'][1], color='blue')
setp(bp['medians'][0], color='blue')
setp(bp['boxes'][1], color='red')
setp(bp['caps'][2], color='red')
setp(bp['caps'][3], color='red')
setp(bp['whiskers'][2], color='red')
setp(bp['whiskers'][3], color='red')
setp(bp['fliers'][2], color='red')
setp(bp['fliers'][3], color='red')
setp(bp['medians'][1], color='red')
# Some fake data to plot
A= [[1, 2, 5,], [7, 2]]
B = [[5, 7, 2, 2, 5], [7, 2, 5]]
C = [[3,2,5,7], [6, 7, 3]]
fig = figure()
ax = axes()
hold(True)
# first boxplot pair
bp = boxplot(A, positions = [1, 2], widths = 0.6)
setBoxColors(bp)
# second boxplot pair
bp = boxplot(B, positions = [4, 5], widths = 0.6)
setBoxColors(bp)
# thrid boxplot pair
bp = boxplot(C, positions = [7, 8], widths = 0.6)
setBoxColors(bp)
# set axes limits and labels
xlim(0,9)
ylim(0,9)
ax.set_xticklabels(['A', 'B', 'C'])
ax.set_xticks([1.5, 4.5, 7.5])
# draw temporary red and blue lines and use them to create a legend
hB, = plot([1,1],'b-')
hR, = plot([1,1],'r-')
legend((hB, hR),('Apples', 'Oranges'))
hB.set_visible(False)
hR.set_visible(False)
savefig('boxcompare.png')
show()

Here is my version. It stores data based on categories.
import matplotlib.pyplot as plt
import numpy as np
data_a = [[1,2,5], [5,7,2,2,5], [7,2,5]]
data_b = [[6,4,2], [1,2,5,3,2], [2,3,5,1]]
ticks = ['A', 'B', 'C']
def set_box_color(bp, color):
plt.setp(bp['boxes'], color=color)
plt.setp(bp['whiskers'], color=color)
plt.setp(bp['caps'], color=color)
plt.setp(bp['medians'], color=color)
plt.figure()
bpl = plt.boxplot(data_a, positions=np.array(xrange(len(data_a)))*2.0-0.4, sym='', widths=0.6)
bpr = plt.boxplot(data_b, positions=np.array(xrange(len(data_b)))*2.0+0.4, sym='', widths=0.6)
set_box_color(bpl, '#D7191C') # colors are from http://colorbrewer2.org/
set_box_color(bpr, '#2C7BB6')
# draw temporary red and blue lines and use them to create a legend
plt.plot([], c='#D7191C', label='Apples')
plt.plot([], c='#2C7BB6', label='Oranges')
plt.legend()
plt.xticks(xrange(0, len(ticks) * 2, 2), ticks)
plt.xlim(-2, len(ticks)*2)
plt.ylim(0, 8)
plt.tight_layout()
plt.savefig('boxcompare.png')
I am short of reputation so I cannot post an image to here.
You can run it and see the result. Basically it's very similar to what Molly did.
Note that, depending on the version of python you are using, you may need to replace xrange with range

A simple way would be to use pandas.
I adapted an example from the plotting documentation:
In [1]: import pandas as pd, numpy as np
In [2]: df = pd.DataFrame(np.random.rand(12,2), columns=['Apples', 'Oranges'] )
In [3]: df['Categories'] = pd.Series(list('AAAABBBBCCCC'))
In [4]: pd.options.display.mpl_style = 'default'
In [5]: df.boxplot(by='Categories')
Out[5]:
array([<matplotlib.axes.AxesSubplot object at 0x51a5190>,
<matplotlib.axes.AxesSubplot object at 0x53fddd0>], dtype=object)

Mock data:
df = pd.DataFrame({'Group':['A','A','A','B','C','B','B','C','A','C'],\
'Apple':np.random.rand(10),'Orange':np.random.rand(10)})
df = df[['Group','Apple','Orange']]
Group Apple Orange
0 A 0.465636 0.537723
1 A 0.560537 0.727238
2 A 0.268154 0.648927
3 B 0.722644 0.115550
4 C 0.586346 0.042896
5 B 0.562881 0.369686
6 B 0.395236 0.672477
7 C 0.577949 0.358801
8 A 0.764069 0.642724
9 C 0.731076 0.302369
You can use the Seaborn library for these plots. First melt the dataframe to format data and then create the boxplot of your choice.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
dd=pd.melt(df,id_vars=['Group'],value_vars=['Apple','Orange'],var_name='fruits')
sns.boxplot(x='Group',y='value',data=dd,hue='fruits')

The accepted answer uses pylab and works for 2 groups. What if we have more?
Here is the flexible generic solution with matplotlib
import matplotlib.pyplot as pl
# there are 4 individuals, each one tested under 3 different settings
# --- Random data, e.g. results per algorithm:
# Invidual 1
d1_1 = [1,1,2,2,3,3]
d1_2 = [3,3,4,4,5,5]
d1_3 = [5,5,6,6,7,7]
# Individual 2
d2_1 = [7,7,8,8,9,9]
d2_2 = [9,9,10,10,11,11]
d2_3 = [11,11,12,12,13,13]
# Individual 3
d3_1 = [1,2,3,4,5,6]
d3_2 = [4,5,6,7,8,9]
d3_3 = [10,11,12,13,14,15]
# Individual 4
d4_1 = [1,1,2,2,3,3]
d4_2 = [9,9,10,10,11,11]
d4_3 = [10,11,12,13,14,15]
# --- Combining your data:
data_group1 = [d1_1, d1_2, d1_3]
data_group2 = [d2_1, d2_2, d2_3]
data_group3 = [d3_1, d3_2, d3_3]
data_group4 = [d4_1, d4_2, d4_3]
colors = ['pink', 'lightblue', 'lightgreen', 'violet']
# we compare the performances of the 4 individuals within the same set of 3 settings
data_groups = [data_group1, data_group2, data_group3, data_group4]
# --- Labels for your data:
labels_list = ['a','b', 'c']
width = 1/len(labels_list)
xlocations = [ x*((1+ len(data_groups))*width) for x in range(len(data_group1)) ]
symbol = 'r+'
ymin = min ( [ val for dg in data_groups for data in dg for val in data ] )
ymax = max ( [ val for dg in data_groups for data in dg for val in data ])
ax = pl.gca()
ax.set_ylim(ymin,ymax)
ax.grid(True, linestyle='dotted')
ax.set_axisbelow(True)
pl.xlabel('X axis label')
pl.ylabel('Y axis label')
pl.title('title')
space = len(data_groups)/2
offset = len(data_groups)/2
# --- Offset the positions per group:
group_positions = []
for num, dg in enumerate(data_groups):
_off = (0 - space + (0.5+num))
print(_off)
group_positions.append([x+_off*(width+0.01) for x in xlocations])
for dg, pos, c in zip(data_groups, group_positions, colors):
boxes = ax.boxplot(dg,
sym=symbol,
labels=['']*len(labels_list),
# labels=labels_list,
positions=pos,
widths=width,
boxprops=dict(facecolor=c),
# capprops=dict(color=c),
# whiskerprops=dict(color=c),
# flierprops=dict(color=c, markeredgecolor=c),
medianprops=dict(color='grey'),
# notch=False,
# vert=True,
# whis=1.5,
# bootstrap=None,
# usermedians=None,
# conf_intervals=None,
patch_artist=True,
)
ax.set_xticks( xlocations )
ax.set_xticklabels( labels_list, rotation=0 )
pl.show()

Just to add to the conversation, I have found a more elegant way to change the color of the box plot by iterating over the dictionary of the object itself
import numpy as np
import matplotlib.pyplot as plt
def color_box(bp, color):
# Define the elements to color. You can also add medians, fliers and means
elements = ['boxes','caps','whiskers']
# Iterate over each of the elements changing the color
for elem in elements:
[plt.setp(bp[elem][idx], color=color) for idx in xrange(len(bp[elem]))]
return
a = np.random.uniform(0,10,[100,5])
bp = plt.boxplot(a)
color_box(bp, 'red')
Cheers!

Here's a function I wrote that takes Molly's code and some other code I've found on the internet to make slightly fancier grouped boxplots:
import numpy as np
import matplotlib.pyplot as plt
def custom_legend(colors, labels, linestyles=None):
""" Creates a list of matplotlib Patch objects that can be passed to the legend(...) function to create a custom
legend.
:param colors: A list of colors, one for each entry in the legend. You can also include a linestyle, for example: 'k--'
:param labels: A list of labels, one for each entry in the legend.
"""
if linestyles is not None:
assert len(linestyles) == len(colors), "Length of linestyles must match length of colors."
h = list()
for k,(c,l) in enumerate(zip(colors, labels)):
clr = c
ls = 'solid'
if linestyles is not None:
ls = linestyles[k]
patch = patches.Patch(color=clr, label=l, linestyle=ls)
h.append(patch)
return h
def grouped_boxplot(data, group_names=None, subgroup_names=None, ax=None, subgroup_colors=None,
box_width=0.6, box_spacing=1.0):
""" Draws a grouped boxplot. The data should be organized in a hierarchy, where there are multiple
subgroups for each main group.
:param data: A dictionary of length equal to the number of the groups. The key should be the
group name, the value should be a list of arrays. The length of the list should be
equal to the number of subgroups.
:param group_names: (Optional) The group names, should be the same as data.keys(), but can be ordered.
:param subgroup_names: (Optional) Names of the subgroups.
:param subgroup_colors: A list specifying the plot color for each subgroup.
:param ax: (Optional) The axis to plot on.
"""
if group_names is None:
group_names = data.keys()
if ax is None:
ax = plt.gca()
plt.sca(ax)
nsubgroups = np.array([len(v) for v in data.values()])
assert len(np.unique(nsubgroups)) == 1, "Number of subgroups for each property differ!"
nsubgroups = nsubgroups[0]
if subgroup_colors is None:
subgroup_colors = list()
for k in range(nsubgroups):
subgroup_colors.append(np.random.rand(3))
else:
assert len(subgroup_colors) == nsubgroups, "subgroup_colors length must match number of subgroups (%d)" % nsubgroups
def _decorate_box(_bp, _d):
plt.setp(_bp['boxes'], lw=0, color='k')
plt.setp(_bp['whiskers'], lw=3.0, color='k')
# fill in each box with a color
assert len(_bp['boxes']) == nsubgroups
for _k,_box in enumerate(_bp['boxes']):
_boxX = list()
_boxY = list()
for _j in range(5):
_boxX.append(_box.get_xdata()[_j])
_boxY.append(_box.get_ydata()[_j])
_boxCoords = zip(_boxX, _boxY)
_boxPolygon = plt.Polygon(_boxCoords, facecolor=subgroup_colors[_k])
ax.add_patch(_boxPolygon)
# draw a black line for the median
for _k,_med in enumerate(_bp['medians']):
_medianX = list()
_medianY = list()
for _j in range(2):
_medianX.append(_med.get_xdata()[_j])
_medianY.append(_med.get_ydata()[_j])
plt.plot(_medianX, _medianY, 'k', linewidth=3.0)
# draw a black asterisk for the mean
plt.plot([np.mean(_med.get_xdata())], [np.mean(_d[_k])], color='w', marker='*',
markeredgecolor='k', markersize=12)
cpos = 1
label_pos = list()
for k in group_names:
d = data[k]
nsubgroups = len(d)
pos = np.arange(nsubgroups) + cpos
label_pos.append(pos.mean())
bp = plt.boxplot(d, positions=pos, widths=box_width)
_decorate_box(bp, d)
cpos += nsubgroups + box_spacing
plt.xlim(0, cpos-1)
plt.xticks(label_pos, group_names)
if subgroup_names is not None:
leg = custom_legend(subgroup_colors, subgroup_names)
plt.legend(handles=leg)
You can use the function(s) like this:
data = { 'A':[np.random.randn(100), np.random.randn(100) + 5],
'B':[np.random.randn(100)+1, np.random.randn(100) + 9],
'C':[np.random.randn(100)-3, np.random.randn(100) -5]
}
grouped_boxplot(data, group_names=['A', 'B', 'C'], subgroup_names=['Apples', 'Oranges'], subgroup_colors=['#D02D2E', '#D67700'])
plt.show()

Grouped boxplots, towards subtle academic publication styling... (source)
(Left) Python 2.7.12 Matplotlib v1.5.3. (Right) Python 3.7.3. Matplotlib v3.1.0.
Code:
import numpy as np
import matplotlib.pyplot as plt
# --- Your data, e.g. results per algorithm:
data1 = [5,5,4,3,3,5]
data2 = [6,6,4,6,8,5]
data3 = [7,8,4,5,8,2]
data4 = [6,9,3,6,8,4]
# --- Combining your data:
data_group1 = [data1, data2]
data_group2 = [data3, data4]
# --- Labels for your data:
labels_list = ['a','b']
xlocations = range(len(data_group1))
width = 0.3
symbol = 'r+'
ymin = 0
ymax = 10
ax = plt.gca()
ax.set_ylim(ymin,ymax)
ax.set_xticklabels( labels_list, rotation=0 )
ax.grid(True, linestyle='dotted')
ax.set_axisbelow(True)
ax.set_xticks(xlocations)
plt.xlabel('X axis label')
plt.ylabel('Y axis label')
plt.title('title')
# --- Offset the positions per group:
positions_group1 = [x-(width+0.01) for x in xlocations]
positions_group2 = xlocations
plt.boxplot(data_group1,
sym=symbol,
labels=['']*len(labels_list),
positions=positions_group1,
widths=width,
# notch=False,
# vert=True,
# whis=1.5,
# bootstrap=None,
# usermedians=None,
# conf_intervals=None,
# patch_artist=False,
)
plt.boxplot(data_group2,
labels=labels_list,
sym=symbol,
positions=positions_group2,
widths=width,
# notch=False,
# vert=True,
# whis=1.5,
# bootstrap=None,
# usermedians=None,
# conf_intervals=None,
# patch_artist=False,
)
plt.savefig('boxplot_grouped.png')
plt.savefig('boxplot_grouped.pdf') # when publishing, use high quality PDFs
#plt.show() # uncomment to show the plot.

I used the code given by Kuzeko and it worked well, but I found that the boxes in each group were being drawn in the reverse order. I changed ...x-_off... to ...x+_off... in the following line (just above the last for loop) which fixes it for me:
group_positions.append([x+_off*(width+0.01) for x in xlocations])

A boxplot above was modified to obtain group boxplots with 3 data types.
import matplotlib.pyplot as plt
import numpy as np
ord = [[16.9423,
4.0410,
19.1185],
[18.5134,
17.8048,
19.2669],
[18.7286,
18.0576,
19.1717],
[18.8998,
18.8469,
19.0005],
[18.8126,
18.7870,
18.8393],
[18.7770,
18.7511,
18.8022],
[18.7409,
18.7075,
18.7747],
[18.6866,
18.6624,
18.7093
],
[18.6748],
[18.9069,
18.6752,
19.0769],
[19.0012,
18.9783,
19.0202
],
[18.9448,
18.9134,
18.9813],
[19.1242,
18.8256,
19.3185],
[19.2118,
19.1661,
19.2580],
[19.2505,
19.1231,
19.3526]]
seq = [[17.8092,
4.0410,
19.6653],
[18.7266,
18.2556,
19.3739],
[18.6051,
18.0589,
19.0557],
[18.6467,
18.5629,
18.7566],
[18.5307,
18.4999,
18.5684],
[18.4732,
18.4484,
18.4985],
[18.5234,
18.5027,
18.4797,
18.4573],
[18.3987,
18.3636,
18.4544],
[18.3593],
[18.7234,
18.7092,
18.7598],
[18.7438,
18.7224,
18.7677],
[18.7304,
18.7111,
18.6880,
18.6913,
18.6678],
[18.8926,
18.5902,
19.2003],
[19.1059,
19.0835,
19.0601,
19.0373,
19.0147],
[19.1925,
19.0177,
19.2588]]
apd=[[17.0331,
4.0410,
18.5670],
[17.6124,
17.1975,
18.0755],
[17.3956,
17.1572,
17.9140],
[17.8295,
17.6514,
18.1466],
[18.0665,
17.9144,
18.2157],
[18.1518,
18.0382,
18.2722],
[18.1975,
18.0956,
18.2987],
[18.2219,
18.1293,
18.3062],
[18.2870,
18.2215,
18.3513],
[18.3047,
18.2363,
18.3950],
[18.3580,
18.2923,
18.4205],
[18.3830,
18.3250,
18.4381],
[18.4135,
18.3645,
18.4753],
[18.4580,
18.4095,
18.5170],
[18.4900,
18.4430,
18.5435]
]
ticks = [120,
240,
360,
516,
662,
740,
874,
1022,
1081,
1201,
1320,
1451,
1562,
1680,
1863]
def set_box_color(bp, color):
plt.setp(bp['boxes'], color=color)
plt.setp(bp['whiskers'], color=color)
plt.setp(bp['caps'], color=color)
plt.setp(bp['medians'], color=color)
plt.figure()
bpl = plt.boxplot(ord, positions=np.array(range(len(ord)))*3.0-0.3, sym='', widths=0.6)
bpr = plt.boxplot(seq, positions=np.array(range(len(seq)))*3.0+0.3, sym='', widths=0.6)
bpg = plt.boxplot(apd, positions=np.array(range(len(apd)))*3.0+0.9, sym='', widths=0.6)
set_box_color(bpl, '#D7191C') # colors are from http://colorbrewer2.org/
set_box_color(bpr, '#2C7BB6')
set_box_color(bpg, '#99d8c9')
# draw temporary red and blue lines and use them to create a legend
plt.plot([], c='#D7191C', label='ORD')
plt.plot([], c='#2C7BB6', label='SEQ')
plt.plot([], c='#99d8c9', label='APD')
plt.legend()
plt.xticks(range(0, len(ticks) * 3, 3), ticks)
plt.xlim(-2, len(ticks)*3)
plt.ylim(0, 20)
plt.tight_layout()
plt.show()
plt.savefig('boxcompare.png')

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python Data Visualization (plot, chart) with multiple squares [duplicate] - python

Related

How to generate labelled barplots using seaborn?

Creating box plots by looping multiple columns

Heatmap with circles indicating size of population

My image will print to PNG but nothing is showing, it does show the image when plot.show is run

matplotlib: Group boxplots

Categories

Resources