How to make multiline graph with matplotlib subplots and pandas? - python

I'm fairly new at coding (completely self taught), and have started using it at at my job as a research assistant in a cancer lab. I need some help setting up a few line graphs in matplot lab.
I have a dataset that includes nextgen sequencing data for about 80 patients. on each patient, we have different timepoints of analysis, different genes detected (out of 40), and the associated %mutation for the gene.
My goal is to write two scripts, one that will generate a "by patient" plot, that will be a linegraph with y-%mutation, x-time of measurement, and will have a different color line for all lines made by each of the patient's associated genes. The second plot will be a "by gene", where I will have one plot contain different color lines that represent each of the different patient's x/y values for that specific gene.
Here is an example dataframe for 1 genenumber for the above script:
gene yaxis xaxis pt# gene#
ASXL1-3 34 1 3 1
ASXL1-3 0 98 3 1
IDH1-3 24 1 3 11
IDH1-3 0 98 3 11
RUNX1-3 38 1 3 21
RUNX1-3 0 98 3 21
U2AF1-3 33 1 3 26
U2AF1-3 0 98 3 26
I have setup a groupby script that when I iterate over it, gives me a dataframe for every gene-timepoint for each patient.
grouped = df.groupby('pt #')
for groupObject in grouped:
group = groupObject[1]
For patient 1, this gives the following output:
y x gene patientnumber patientgene genenumber dxtotransplant \
0 40.0 1712 ASXL1 1 ASXL1-1 1 1857
1 26.0 1835 ASXL1 1 ASXL1-1 1 1857
302 7.0 1835 RUNX1 1 RUNX1-1 21 1857
I need help writing a script that will create either of the plots described above. using the bypatient example, my general idea is that I need to create a different subplot for every gene a patient has, where each subplot is the line graph represented by that one gene.
Using matplotlib this is about as far as I have gotten:
plt.figure()
grouped = df.groupby('patient number')
for groupObject in grouped:
group = groupObject[1]
df = group #may need to remove this
for element in range(len(group)):
xs = np.array(df[df.columns[1]]) #"x" column
ys= np.array(df[df.columns[0]]) #"y" column
gene = np.array(df[df.columns[2]])[element] #"gene" column
plt.subplot(1,1,1)
plt.scatter(xs,ys, label=gene)
plt.plot(xs,ys, label=gene)
plt.legend()
plt.show()
This produces the following output:
In this output, the circled line is not supposed to be connected to the other 2 points. In this case, this is patient 1, who has the following datapoint:
x y gene
1712 40 ASXL1
1835 26 ASXL1
1835 7 RUNX1
Using seaborn I have gotten close to my desired graph using this code:
grouped = df.groupby(['patientnumber'])
for groupObject in grouped:
group = groupObject[1]
g = sns.FacetGrid(group, col="patientgene", col_wrap=4, size=4, ylim=(0,100))
g = g.map(plt.scatter, "x", "y", alpha=0.5)
g = g.map(plt.plot, "x", "y", alpha=0.5)
plt.title= "gene:%s"%element
Using this code I get the following:
If I adjust the line:
g = sns.FacetGrid(group, col="patientnumber", col_wrap=4, size=4, ylim=(0,100))
I get the following result:
As you can see in the 2d example, the plot is treating every point on my plot as if they are from the same line (but they are actually 4 separate lines).
How I can tweak my iterations so that each patient-gene is treated as a separate line on the same graph?

I wrote a subplot function that may give you a hand. I modified the data a tad to help illustrate the plotting functionality.
gene,yaxis,xaxis,pt #,gene #
ASXL1-3,34,1,3,1
ASXL1-3,3,98,3,1
IDH1-3,24,1,3,11
IDH1-3,7,98,3,11
RUNX1-3,38,1,3,21
RUNX1-3,2,98,3,21
U2AF1-3,33,1,3,26
U2AF1-3,0,98,3,26
ASXL1-3,39,1,4,1
ASXL1-3,8,62,4,1
ASXL1-3,0,119,4,1
IDH1-3,27,1,4,11
IDH1-3,12,62,4,11
IDH1-3,1,119,4,11
RUNX1-3,42,1,4,21
RUNX1-3,3,62,4,21
RUNX1-3,1,119,4,21
U2AF1-3,16,1,4,26
U2AF1-3,1,62,4,26
U2AF1-3,0,119,4,26
This is the subplotting function...with some extra bells and whistles :)
def plotByGroup(df, group, xCol, yCol, title = "", xLabel = "", yLabel = "", lineColors = ["red", "orange", "yellow", "green", "blue", "purple"], lineWidth = 2, lineOpacity = 0.7, plotStyle = 'ggplot', showLegend = False):
"""
Plot multiple lines from a Pandas Data Frame for each group using DataFrame.groupby() and MatPlotLib PyPlot.
#params
df - Required - Data Frame - Pandas Data Frame
group - Required - String - Column name to group on
xCol - Required - String - Column name for X axis data
yCol - Required - String - Column name for y axis data
title - Optional - String - Plot Title
xLabel - Optional - String - X axis label
yLabel - Optional - String - Y axis label
lineColors - Optional - List - Colors to plot multiple lines
lineWidth - Optional - Integer - Width of lines to plot
lineOpacity - Optional - Float - Alpha of lines to plot
plotStyle - Optional - String - MatPlotLib plot style
showLegend - Optional - Boolean - Show legend
#return
MatPlotLib Plot Object
"""
# Import MatPlotLib Plotting Function & Set Style
from matplotlib import pyplot as plt
matplotlib.style.use(plotStyle)
figure = plt.figure() # Initialize Figure
grouped = df.groupby(group) # Set Group
i = 0 # Set iteration to determine line color indexing
for idx, grp in grouped:
colorIndex = i % len(lineColors) # Define line color index
lineLabel = grp[group].values[0] # Get a group label from first position
xValues = grp[xCol] # Get x vector
yValues = grp[yCol] # Get y vector
plt.subplot(1,1,1) # Initialize subplot and plot (on next line)
plt.plot(xValues, yValues, label = lineLabel, color = lineColors[colorIndex], lw = lineWidth, alpha = lineOpacity)
# Plot legend
if showLegend:
plt.legend()
i += 1
# Set title & Labels
axis = figure.add_subplot(1,1,1)
axis.set_title(title)
axis.set_xlabel(xLabel)
axis.set_ylabel(yLabel)
# Return plot for saving, showing, etc.
return plt
And to use it...
import pandas
# Load the Data into Pandas
df = pandas.read_csv('data.csv')
#
# Plotting - by Patient
#
# Create Patient Grouping
patientGroup = df.groupby('pt #')
# Iterate Over Groups
for idx, patientDF in patientGroup:
# Let's give them specific titles
plotTitle = "Gene Frequency over Time by Gene (Patient %s)" % str(patientDf['pt #'].values[0])
# Call the subplot function
plot = plotByGroup(patientDf, 'gene', 'xaxis', 'yaxis', title = plotTitle, xLabel = "Days", yLabel = "Gene Frequency")
# Add Vertical Lines at Assay Timepoints
timepoints = set(patientDf.xaxis.values)
[plot.axvline(x = timepoint, linewidth = 1, linestyle = "dashed", color='gray', alpha = 0.4) for timepoint in timepoints]
# Let's see it
plot.show()
And of course, we can do the same by gene.
#
# Plotting - by Gene
#
# Create Gene Grouping
geneGroup = df.groupby('gene')
# Generate Plots for Groups
for idx, geneDF in geneGroup:
plotTitle = "%s Gene Frequency over Time by Patient" % str(geneDf['gene'].values[0])
plot = plotByGroup(geneDf, 'pt #', 'xaxis', 'yaxis', title = plotTitle, xLab = "Days", yLab = "Frequency")
plot.show()
If this isn't what you're looking for, provide a clarification and I'll take another crack at it.

Related

Matplotlib segmented Plot

I have the following dataset:
df = pd.DataFrame ({"a": [1,2,3,4,5,6,7,8,9,1,11,12,13,14,15,16,17,18,19,20],
'b':[1,2,3,4,50,60,70,8,9,10,110,120,130,140,150,16,17,18,19,20],
'c':[np.nan,2.2,3.4,np.nan,40.9,60.2,np.nan,8.2,8.9,10.1,np.nan,120.2,
130.07,140.23,np.nan,16.054,17.20,18.1,np.nan,20.1],
'd': [100, np.nan,np.nan, 500,np.nan, np.nan,500,
np.nan,np.nan,np.nan,100, np.nan,np.nan, np.nan,500,
np.nan,np.nan, np.nan,100,np.nan ]}
)
I am trying to plot the data based on the following conditions:
Between 100 to the next 100 in column 'd' I want to have one
plot having column 'a' in the x axis, and scatterplot of column 'b' and line plot of 'c' in the y axis.
That is I will be having 3 different plots. First one from index 0 to 10, second one from index 10 to index 18, third one from 18 to 20. (I can generate this using for loop)
Within each plot I want segmented lineplot based on the location 500 value in column 'd',i.e., for the first plot from index 0-3 one lineplot, from index 3-6 another and from index 6-10 another lineplot.( I can't make the segmented lineplot)
I am using the following codes:
index = index + [len(df)]
index1 = index1 + [len(df)]
for k in range (len(index)-1):
x = df['a'][index[k] + 1:index[k+1]]
y = df['c'][index[k]+ 1:index[k+1]]
y1 = df['b'][index[k]+ 1:index[k+1]]
plt.scatter(x, y)
plt.plot(x, y1)
plt.savefig('plot'+ str(k+1000) +'.png')
plt.clf()
My first plot look like this: (But want to have three segmented
lineplot not the continuous one (that is line from index 0-3 should not be connected with 3-6 and so on)
Sorry for the rather long question and thx:)
The expected output is unclear, but here is a general strategy to split your dataset in groups with help of groupby:
option 1: independent figures
group = df['d'].eq(100).cumsum()
for name, g in df.groupby(group):
f,ax = plt.subplots()
ax.scatter(g['a'], g['c'])
ax.plot(g['a'], g['b'])
f.savefig(f'figure_{name}.png')
option 2
ax = plt.subplot()
group = df['d'].eq(100).cumsum()
for name, g in df.groupby(group):
ax.scatter(g['a'], g['c'])
ax.plot(g['a'], g['b'], label=name)
ax.legend()
option 3
ax = plt.subplot()
group = df['d'].eq(100).cumsum()
for name, g in df.groupby(group):
g = g.reset_index()
ax.scatter(g.index+1, g['c'])
ax.plot(g.index+1, g['b'])

How to add a condition to legend labelling to show certain entries only

is it possible to create plot labels based on conditions? As one can see below, I am creating plots within a loop. The issue is that not all locations of the dataset contain data to plot as time series. Hence I want to label only those timesteps which contain data. I have 8 locations of which only 4 contain data. Currently, a legend for all locations is plotted:
I would like to do something like:
lineplot = plt.plot(timesteps, skin_celsius, label='%s. storm location' % i with condition if time series contains no NaN data)
is something like this possible?
This is the whole code for the plot
fig2 = plt.figure(figsize=(20, 20), dpi=300)
# change this loop to extract time series data for day -10 + 10days around stormtrack data.
for i, dummy in enumerate(lati):
dsloc = SSTskin_subfile_masked.sel(lon=loni[i], lat=lati[i], method='nearest')
dstime = SSTskin_subfile_masked.sel(time=timei[i], lon=loni[i], lat=lati[i], method='nearest') #find point data within timeseries data
skin_celsius = (dsloc['analysed_sst']) - 273.15
timesteps = dsloc.time.values
timestep = dstime.time.values
timevalue = ((dstime['analysed_sst']).values) - 273.15
#Here I want to add a condition to the labelling.
lineplot = plt.plot(timesteps, skin_celsius, label='%s. storm location' % I)
dotplot = plt.plot(timestep, timevalue, "or")
plt.title('Skin SST over time at storm track locations', fontsize = 20 )
plt.xlabel('Date', fontsize = 16)
plt.ylabel('Skin SST in $^{\circ}C$', fontsize = 16)
plt.xticks(fontsize = 16)
plt.yticks(fontsize = 16)
plt.legend()

How to make stackedbarplot with percent description and identical height of columns divided by target in Python Pandas?

I have Data Frame like below (for reference):
target |product
---------|--------
1 |EHZ
1 |GBK
0 |EHZ
0 |AKP
1 |AKP
So I have target variable "target" and nominal variable "product" and I woul like to plot graph like below based on my df, how can I do that? I know only that it is stackedbar, and
I need to have as below that each column have percentage description both for 0 and 1
and columns have identical heoght and they are divided into 1 and 0
Everything in Python Pandas / Matplotlib. Could you show me example code which makes me identical plot based on my data frame ?
I used code created by Rob Raymond like below:
fig, ax = plt.subplots(figsize=(10,3))
# prepare dataframe for plotting
dfp = pd.crosstab(index=df["product"], columns=df["target"]).apply(lambda r: r/r.sum(), axis=1)
# simple stacked plot
ax = dfp.plot(kind="barh", stacked=True, ax=ax)
for c in ax.containers:
# customize the label to account for cases when there might not be a bar section
labels = [f'{w*100:.0f}%' if (w := v.get_width()) > 0 else '' for v in c ]
# set the bar label
ax.bar_label(c, labels=labels, label_type='center')
ax.set_xlabel("procent")
ax.set_title("tytul")
and I have error like below:
From comments
first generate percent totals for each product
then it's a simple case of a horizontal stacked bar
labels in bars stack bar plot in matplotlib and add label to each section
use matplotlib API to set any additional titles and labels as desired
import io
import matplotlib.pyplot as plt
df = pd.read_csv(io.StringIO("""target |product
1 |EHZ
1 |GBK
0 |EHZ
0 |AKP
1 |AKP"""), sep="\s+\|", engine="python")
fig, ax = plt.subplots(figsize=(10,3))
# prepare dataframe for plotting
dfp = pd.crosstab(index=df["product"], columns=df["target"]).apply(lambda r: r/r.sum(), axis=1)
# simple stacked plot
ax = dfp.plot(kind="barh", stacked=True, ax=ax)
for c in ax.containers:
# customize the label to account for cases when there might not be a bar section
labels = [f'{w*100:.0f}%' if (w := v.get_width()) > 0 else '' for v in c ]
# set the bar label
ax.bar_label(c, labels=labels, label_type='center')
ax.set_xlabel("procent")
ax.set_title("tytul")

how to plot horizontal bar plot in loop to change the color on the bar based on the value in another column

I need to plot time(timestamp) vs space(intersectionId) single horizontal bar chart in matplotlib. The color of the bar will be changed at time intervals based on another column which will the currState. The colors will be
red,green,yellow. I have tried to create a dictionary of colors and values but unsure of how to use them in loop to change color based on the value. I have attached a sample csv below along with a code and what I try to achieve and what I have written till now.
category_colors = { 'red' : [2,3] , 'yellow' : [5,6] , 'green' : [7,8]}
date_test = df_sample['timestamp']
y_test = ['123456']
data = np.array(list(df_sample.currState))
fig, ax = plt.subplots(figsize=(10, 1))
ax = plt.barh(y_test,date_test,label="trafficsignal")
data_cum = data.cumsum
plt.xlabel('timestamp')
plt.ylabel('space')
plt.title('TimeSpace')
plt.legend()
plt.show()
timestamp currState IntersectionId
2020-02-26 16:12:13.131484 3 12345
2020-02-26 16:12:14.131484 3 12345
2020-02-26 16:12:15.131484 3 12345
2020-02-26 16:12:16.131484 5 12345
2020-02-26 16:12:17.131484 5 12345
2020-02-26 16:12:18.131484 5 12345
2020-02-26 16:12:19.131484 6 12345
2020-02-26 16:12:20.131484 6 12345
2020-02-26 16:12:21.131484 6 12345
Current plot:
Desired plot:
I am not aware of any plotting package that lets you create this plot in a straightforward way based on how your sample table is structured. One option could be to compute a start and an end variable and then create the plot like in the answers to this question, for example using the Altair Gantt chart like in this answer.
Here, I offer two solutions using matplotlib. By taking a look at the matplotlib gallery, I stumbled on the broken_barh plotting function which provides a way to create a plot like the one you want. There are two main hurdles to overcome when using it:
Deciding what unit to use for the x-axis and computing the xranges argument accordingly;
Creating and formatting the x ticks and tick labels.
Let me first create a sample dataset that resembles yours, note that you will need to adjust the color_dict to your codes:
import numpy as np # v 1.19.2
import pandas as pd # v 1.1.3
import matplotlib.pyplot as plt # v 3.3.2
import matplotlib.dates as mdates
## Create sample dataset
# Light color codes
gre = 1
yel_to_red = 2
red = 3
yel_to_gre = 4
color_dict = {1: 'green', 2: 'yellow', 3: 'red', 4: 'yellow'}
# Light color duration in seconds
sec_g = 45
sec_yr = 3
sec_r = 90
sec_yg = 1
# Light cycle
light_cycle = [gre, yel_to_red, red, yel_to_gre]
sec_cycle = [sec_g, sec_yr, sec_r, sec_yg]
ncycles = 3
sec_total = ncycles*sum(sec_cycle)
# Create variables and store them in a pandas dataframe with the datetime as index
IntersectionId = 12345
currState = np.repeat(ncycles*light_cycle, repeats=ncycles*sec_cycle)
time_sec = pd.date_range(start='2021-01-04 08:00:00', freq='S', periods=sec_total)
df = pd.DataFrame(dict(IntersectionId = np.repeat(12345, repeats=ncycles*sum(sec_cycle)),
currState = currState),
index = time_sec)
The broken_barh function takes the data in the format of tuples where for each colored rectangle that makes up the horizontal bar you need to provide the xy coordinates of the bottom-left corner as well as the length along each axis, like so:
xranges=[(x1_start, x1_length), (x2_start, x2_length), ... ], yranges=(y_all_start, y_all_width)
Note that yranges applies to all rectangles. The unit that is chosen for the x-axis determines how the data must be processed and how the x ticks and tick labels can be created. Here are two alternatives.
Matplotlib broken_barh with matplotlib date number as x-axis scale
In this approach, the timestamps of the rows where the light changes are extracted and then converted to matplotlib date numbers. This makes it possible to use a matplotlib date tick locator and formatter. This approach of using the matplotlib date for the x-axis values to simplify tick formatting was inspired by this answer by ImportanceOfBeingErnest.
For both this solution and the next one, the code for getting the indices of light changes and computing the lengths of the periods is based on this answer by Jaime, thanks to the general idea provided by this Gist by alimanfoo.
## Compute variables needed to define the plotting function arguments
states = np.array(df['currState'])
# Create a list of indices of the rows where the light changes
# (i.e. where a new currState code section starts)
starts_indices = np.where(np.concatenate(([True], states[:-1] != states[1:])))
# Append the last index to be able to compute the duration of the last
# light color period recorded in the dataset
starts_end_indices = np.append(starts_indices, states.size-1)
# Get the timestamps of those rows and convert them to python datetime format
starts_end_pydt = df.index[starts_end_indices].to_pydatetime()
# Convert the python timestamps to matplotlib date number that is used as the
# x-axis unit, this makes it easier to format the tick labels
starts_end_x = mdates.date2num(starts_end_pydt)
# Get the duration of each light color in matplotlib date number units
lengths = np.diff(starts_end_x)
# Add one second (computed in python datetime units) to the duration of
# the last light to make the bar chart left and right inclusive instead
# of just left inclusive
pydt_second = (max(starts_end_x) - min(starts_end_x))/starts_end_indices[-1]
lengths[-1] = lengths[-1] + pydt_second
# Compute the arguments for the broken_barh plotting function
xranges = [(start, length) for start, length in zip(starts_end_x, lengths)]
yranges = (0.75, 0.5)
colors = df['currState'][starts_end_indices[:-1]].map(color_dict)
## Create horizontal bar with colors by using the broken_barh function
## and format ticks and tick labels
fig, ax = plt.subplots(figsize=(10,2))
ax.broken_barh(xranges, yranges, facecolors=colors, zorder=2)
# Create and format x ticks and tick labels
loc = mdates.AutoDateLocator()
ax.xaxis.set_major_locator(loc)
formatter = mdates.AutoDateFormatter(loc)
formatter.scaled[1/(24.*60.)] = '%H:%M:%S' # adjust this according to time range
ax.xaxis.set_major_formatter(formatter)
# Format y-axis and create y tick and tick label
ax.set_ylim(0, 2)
ax.set_yticks([1])
ax.set_yticklabels([df['IntersectionId'][0]])
plt.grid(axis='x', alpha=0.5, zorder=1)
plt.show()
Matplotlib broken_barh with seconds as x-axis scale
This approach takes advantage of the fact that the indices of the table can be used to compute the lights' durations in seconds. The downside is that this time the x ticks and tick labels must be created from scratch. The code is written so that labels automatically have a nice format depending on the total duration covered by the dataset. The only thing that needs adjusting is the number of ticks, as this depends on how wide the figure is.
The code used to automatically select an appropriate time step between ticks is based on this answer by kennytm. The datetime string format codes are listed here.
## Compute the variables needed for the plotting function arguments
## using the currState variable
states = np.array(df['currState'])
# Create list of indices indicating the rows where the currState code
# changes: note the comma to unpack the tuple
starts_indices, = np.where(np.concatenate(([True], states[:-1] != states[1:])))
# Compute durations of each light in seconds
lengths = np.diff(starts_indices, append=states.size)
## Compute the arguments for the plotting function
xranges = [(start, length) for start, length in zip(starts_indices, lengths)]
yranges = (0.75, 0.5)
colors = df['currState'][starts_indices].map(color_dict)
## Create horizontal bar with colors using the broken_barh function
fig, ax = plt.subplots(figsize=(10,2))
ax.broken_barh(xranges, yranges, facecolors=colors, zorder=2)
## Create appropriate x ticks and tick labels
# Define time variable and parameters needed for computations
time = pd.DatetimeIndex(df.index).asi8 // 10**9 # time is in seconds
tmin = min(time)
tmax = max(time)
trange = tmax-tmin
# Choose the approximate number of ticks, the exact number depends on
# the automatically selected time step
approx_nticks = 6 # low number selected because figure width is only 10 inches
round_time_steps = [15, 30, 60, 120, 180, 240, 300, 600, 900, 1800, 3600, 7200, 14400]
time_step = min(round_time_steps, key=lambda x: abs(x - trange//approx_nticks))
# Create list of x ticks including the right boundary of the last time point
# in the dataset regardless of whether not it is aligned with the time step
timestamps = np.append(np.arange(tmin, tmax, time_step), tmax+1)
xticks = timestamps-tmin
ax.set_xticks(xticks)
# Create x tick labels with format depending on time step
fmt_time = '%H:%M:%S' if time_step <= 60 else '%H:%M'
xticklabels = [pd.to_datetime(ts, unit='s').strftime(fmt_time) for ts in timestamps]
ax.set_xticklabels(xticklabels)
## Format y-axis limits, tick and tick label
ax.set_ylim(0, 2)
ax.set_yticks([1])
ax.set_yticklabels([df['IntersectionId'][0]])
plt.grid(axis='x', alpha=0.5, zorder=1)
plt.show()
Further documentation: to_datetime, to_pydatetime, strftime

Horizontal stacked bar plot and add labels to each section

I am trying to replicate the following image in matplotlib and it seems barh is my only option. Though it appears that you can't stack barh graphs so I don't know what to do
If you know of a better python library to draw this kind of thing, please let me know.
This is all I could come up with as a start:
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
people = ('A','B','C','D','E','F','G','H')
y_pos = np.arange(len(people))
bottomdata = 3 + 10 * np.random.rand(len(people))
topdata = 3 + 10 * np.random.rand(len(people))
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)
ax.barh(y_pos, bottomdata,color='r',align='center')
ax.barh(y_pos, topdata,color='g',align='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(people)
ax.set_xlabel('Distance')
plt.show()
I would then have to add labels individually using ax.text which would be tedious. Ideally I would like to just specify the width of the part to be inserted then it updates the center of that section with a string of my choosing. The labels on the outside (e.g. 3800) I can add myself later, it is mainly the labeling over the bar section itself and creating this stacked method in a nice way I'm having problems with. Can you even specify a 'distance' i.e. span of color in any way?
Edit 2: for more heterogeneous data. (I've left the above method since I find it more usual to work with the same number of records per series)
Answering the two parts of the question:
a) barh returns a container of handles to all the patches that it drew. You can use the coordinates of the patches to aid the text positions.
b) Following these two answers to the question that I noted before (see Horizontal stacked bar chart in Matplotlib), you can stack bar graphs horizontally by setting the 'left' input.
and additionally c) handling data that is less uniform in shape.
Below is one way you could handle data that is less uniform in shape is simply to process each segment independently.
import numpy as np
import matplotlib.pyplot as plt
# some labels for each row
people = ('A','B','C','D','E','F','G','H')
r = len(people)
# how many data points overall (average of 3 per person)
n = r * 3
# which person does each segment belong to?
rows = np.random.randint(0, r, (n,))
# how wide is the segment?
widths = np.random.randint(3,12, n,)
# what label to put on the segment (xrange in py2.7, range for py3)
labels = range(n)
colors ='rgbwmc'
patch_handles = []
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)
left = np.zeros(r,)
row_counts = np.zeros(r,)
for (r, w, l) in zip(rows, widths, labels):
print r, w, l
patch_handles.append(ax.barh(r, w, align='center', left=left[r],
color=colors[int(row_counts[r]) % len(colors)]))
left[r] += w
row_counts[r] += 1
# we know there is only one patch but could enumerate if expanded
patch = patch_handles[-1][0]
bl = patch.get_xy()
x = 0.5*patch.get_width() + bl[0]
y = 0.5*patch.get_height() + bl[1]
ax.text(x, y, "%d%%" % (l), ha='center',va='center')
y_pos = np.arange(8)
ax.set_yticks(y_pos)
ax.set_yticklabels(people)
ax.set_xlabel('Distance')
plt.show()
Which produces a graph like this , with a different number of segments present in each series.
Note that this is not particularly efficient since each segment used an individual call to ax.barh. There may be more efficient methods (e.g. by padding a matrix with zero-width segments or nan values) but this likely to be problem-specific and is a distinct question.
Edit: updated to answer both parts of the question.
import numpy as np
import matplotlib.pyplot as plt
people = ('A','B','C','D','E','F','G','H')
segments = 4
# generate some multi-dimensional data & arbitrary labels
data = 3 + 10* np.random.rand(segments, len(people))
percentages = (np.random.randint(5,20, (len(people), segments)))
y_pos = np.arange(len(people))
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)
colors ='rgbwmc'
patch_handles = []
left = np.zeros(len(people)) # left alignment of data starts at zero
for i, d in enumerate(data):
patch_handles.append(ax.barh(y_pos, d,
color=colors[i%len(colors)], align='center',
left=left))
# accumulate the left-hand offsets
left += d
# go through all of the bar segments and annotate
for j in range(len(patch_handles)):
for i, patch in enumerate(patch_handles[j].get_children()):
bl = patch.get_xy()
x = 0.5*patch.get_width() + bl[0]
y = 0.5*patch.get_height() + bl[1]
ax.text(x,y, "%d%%" % (percentages[i,j]), ha='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(people)
ax.set_xlabel('Distance')
plt.show()
You can achieve a result along these lines (note: the percentages I used have nothing to do with the bar widths, as the relationship in the example seems unclear):
See Horizontal stacked bar chart in Matplotlib for some ideas on stacking horizontal bar plots.
Imports and Test DataFrame
Tested in python 3.10, pandas 1.4.2, matplotlib 3.5.1, seaborn 0.11.2
For vertical stacked bars see Stacked Bar Chart with Centered Labels
import pandas as pd
import numpy as np
# create sample data as shown in the OP
np.random.seed(365)
people = ('A','B','C','D','E','F','G','H')
bottomdata = 3 + 10 * np.random.rand(len(people))
topdata = 3 + 10 * np.random.rand(len(people))
# create the dataframe
df = pd.DataFrame({'Female': bottomdata, 'Male': topdata}, index=people)
# display(df)
Female Male
A 12.41 7.42
B 9.42 4.10
C 9.85 7.38
D 8.89 10.53
E 8.44 5.92
F 6.68 11.86
G 10.67 12.97
H 6.05 7.87
Updated with matplotlib v3.4.2
Use matplotlib.pyplot.bar_label
See How to add value labels on a bar chart for additional details and examples with .bar_label.
labels = [f'{v.get_width():.2f}%' if v.get_width() > 0 else '' for v in c ] for python < 3.8, without the assignment expression (:=).
Plotted using pandas.DataFrame.plot with kind='barh'
ax = df.plot(kind='barh', stacked=True, figsize=(8, 6))
for c in ax.containers:
# customize the label to account for cases when there might not be a bar section
labels = [f'{w:.2f}%' if (w := v.get_width()) > 0 else '' for v in c ]
# set the bar label
ax.bar_label(c, labels=labels, label_type='center')
# uncomment and use the next line if there are no nan or 0 length sections; just use fmt to add a % (the previous two lines of code are not needed, in this case)
# ax.bar_label(c, fmt='%.2f%%', label_type='center')
# move the legend
ax.legend(bbox_to_anchor=(1.025, 1), loc='upper left', borderaxespad=0.)
# add labels
ax.set_ylabel("People", fontsize=18)
ax.set_xlabel("Percent", fontsize=18)
plt.show()
Using seaborn
sns.barplot does not have an option for stacked bar plots, however, sns.histplot and sns.displot can be used to create horizontal stacked bars.
seaborn typically requires the dataframe to be in a long, instead of wide, format, so use pandas.DataFrame.melt to reshape the dataframe.
Reshape dataframe
# convert the dataframe to a long form
df = df.reset_index()
df = df.rename(columns={'index': 'People'})
dfm = df.melt(id_vars='People', var_name='Gender', value_name='Percent')
# display(dfm)
People Gender Percent
0 A Female 12.414557
1 B Female 9.416027
2 C Female 9.846105
3 D Female 8.885621
4 E Female 8.438872
5 F Female 6.680709
6 G Female 10.666258
7 H Female 6.050124
8 A Male 7.420860
9 B Male 4.104433
10 C Male 7.383738
11 D Male 10.526158
12 E Male 5.916262
13 F Male 11.857227
14 G Male 12.966913
15 H Male 7.865684
sns.histplot: axes-level plot
fig, axe = plt.subplots(figsize=(8, 6))
sns.histplot(data=dfm, y='People', hue='Gender', discrete=True, weights='Percent', multiple='stack', ax=axe)
# iterate through each set of containers
for c in axe.containers:
# add bar annotations
axe.bar_label(c, fmt='%.2f%%', label_type='center')
axe.set_xlabel('Percent')
plt.show()
sns.displot: figure-level plot
g = sns.displot(data=dfm, y='People', hue='Gender', discrete=True, weights='Percent', multiple='stack', height=6)
# iterate through each facet / supbplot
for axe in g.axes.flat:
# iteate through each set of containers
for c in axe.containers:
# add the bar annotations
axe.bar_label(c, fmt='%.2f%%', label_type='center')
axe.set_xlabel('Percent')
plt.show()
Original Answer - before matplotlib v3.4.2
The easiest way to plot a horizontal or vertical stacked bar, is to load the data into a pandas.DataFrame
This will plot, and annotate correctly, even when all categories ('People'), don't have all segments (e.g. some value is 0 or NaN)
Once the data is in the dataframe:
It's easier to manipulate and analyze
It can be plotted with the matplotlib engine, using:
pandas.DataFrame.plot.barh
label_text = f'{width}' for annotations
pandas.DataFrame.plot.bar
label_text = f'{height}' for annotations
SO: Vertical Stacked Bar Chart with Centered Labels
These methods return a matplotlib.axes.Axes or a numpy.ndarray of them.
Using the .patches method unpacks a list of matplotlib.patches.Rectangle objects, one for each of the sections of the stacked bar.
Each .Rectangle has methods for extracting the various values that define the rectangle.
Each .Rectangle is in order from left the right, and bottom to top, so all the .Rectangle objects, for each level, appear in order, when iterating through .patches.
The labels are made using an f-string, label_text = f'{width:.2f}%', so any additional text can be added as needed.
Plot and Annotate
Plotting the bar, is 1 line, the remainder is annotating the rectangles
# plot the dataframe with 1 line
ax = df.plot.barh(stacked=True, figsize=(8, 6))
# .patches is everything inside of the chart
for rect in ax.patches:
# Find where everything is located
height = rect.get_height()
width = rect.get_width()
x = rect.get_x()
y = rect.get_y()
# The height of the bar is the data value and can be used as the label
label_text = f'{width:.2f}%' # f'{width:.2f}' to format decimal values
# ax.text(x, y, text)
label_x = x + width / 2
label_y = y + height / 2
# only plot labels greater than given width
if width > 0:
ax.text(label_x, label_y, label_text, ha='center', va='center', fontsize=8)
# move the legend
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# add labels
ax.set_ylabel("People", fontsize=18)
ax.set_xlabel("Percent", fontsize=18)
plt.show()
Example with Missing Segment
# set one of the dataframe values to 0
df.iloc[4, 1] = 0
Note the annotations are all in the correct location from df.
For this case, the above answers work perfectly. The issue I had, and didn't find a plug-and-play solution online, was that I often have to plot stacked bars in multi-subplot figures, with many values, which tend to have very non-homogenous amplitudes.
(Note: I work usually with pandas dataframes, and matplotlib. I couldn't make the bar_label() method of matplotlib to work all the times.)
So, I just give a kind of ad-hoc, but easily generalizable solution. In this example, I was working with single-row dataframes (for power-exchange monitoring purposes per hour), so, my dataframe (df) had just one row.
(I provide an example figure to show how this can be useful in very densely-packed plots)
[enter image description here][1]
[1]: https://i.stack.imgur.com/9akd8.png
'''
This implementation produces a stacked, horizontal bar plot.
df --> pandas dataframe. Columns are used as the iterator, and only the firs value of each column is used.
waterfall--> bool: if True, apart from the stack-direction, also a perpendicular offset is added.
cyclic_offset_x --> list (of any length) or None: loop through these values to use as x-offset pixels.
cyclic_offset_y --> list (of any length) or None: loop through these values to use as y-offset pixels.
ax --> matplotlib Axes, or None: if None, creates a new axis and figure.
'''
def magic_stacked_bar(df, waterfall=False, cyclic_offset_x=None, cyclic_offset_y=None, ax=None):
if isinstance(cyclic_offset_x, type(None)):
cyclic_offset_x = [0, 0]
if isinstance(cyclic_offset_y, type(None)):
cyclic_offset_y = [0, 0]
ax0 = ax
if isinstance(ax, type(None)):
fig, ax = plt.subplots()
fig.set_size_inches(19, 10)
cycler = 0;
prev = 0 # summation variable to make it stacked
for c in df.columns:
if waterfall:
y = c ; label = "" # bidirectional stack
else:
y = 0; label = c # unidirectional stack
ax.barh(y=y, width=df[c].values[0], height=1, left=prev, label = label)
prev += df[c].values[0] # add to sum-stack
offset_x = cyclic_offset_x[divmod(cycler, len(cyclic_offset_x))[1]]
offset_y = cyclic_offset_y[divmod(cycler, len(cyclic_offset_y))[1]]
ax.annotate(text="{}".format(int(df[c].values[0])), xy=(prev - df[c].values / 2, y),
xytext=(offset_x, offset_y), textcoords='offset pixels',
ha='center', va='top', fontsize=8,
arrowprops=dict(facecolor='black', shrink=0.01, width=0.3, headwidth=0.3),
bbox=dict(boxstyle='round', facecolor='grey', alpha=0.5))
cycler += 1
if not waterfall:
ax.legend() # if waterfall, the index annotates the columns. If
# waterfall ==False, the legend annotates the columns
if isinstance(ax0, type(None)):
ax.set_title("Voi la")
ax.set_xlabel("UltraWatts")
plt.show()
else:
return ax
''' (Sometimes, it is more tedious and requires some custom functions to make the labels look alright.
'''
A, B = 80,80
n_units = df.shape[1]
cyclic_offset_x = -A*np.cos(2*np.pi / (2*n_units) *np.arange(n_units))
cyclic_offset_y = B*np.sin(2*np.pi / (2*n_units) * np.arange(n_units)) + B/2

Categories

Resources