Plotting and color coding multiple y-axes - python

This is my first attempt using Matplotlib and I am in need of some guidance. I am trying to generate plot with 4 y-axes, two on the left and two on the right with shared x axis. Here's my dataset on shared dropbox folder
import pandas as pd
%matplotlib inline
url ='http://dropproxy.com/f/D34'
df= pd.read_csv(url, index_col=0, parse_dates=[0])
df.plot()
This is what the simple pandas plot looks like:
I would like to plot this similar to the example below, with TMAX and TMIN on primary y-axis (on same scale).
My attempt:
There's one example I found on the the matplotlib listserv..I am trying to adapt it to my data but something is not working right...Here's the script.
# multiple_yaxes_with_spines.py
# This is a template Python program for creating plots (line graphs) with 2, 3,
# or 4 y-axes. (A template program is one that you can readily modify to meet
# your needs). Almost all user-modifiable code is in Section 2. For most
# purposes, it should not be necessary to modify anything else.
# Dr. Phillip M. Feldman, 27 Oct, 2009
# Acknowledgment: This program is based on code written by Jae-Joon Lee,
# URL= http://matplotlib.svn.sourceforge.net/viewvc/matplotlib/trunk/matplotlib/
# examples/pylab_examples/multiple_yaxis_with_spines.py?revision=7908&view=markup
# Section 1: Import modules, define functions, and allocate storage.
import matplotlib.pyplot as plt
from numpy import *
def make_patch_spines_invisible(ax):
ax.set_frame_on(True)
ax.patch.set_visible(False)
for sp in ax.spines.itervalues():
sp.set_visible(False)
def make_spine_invisible(ax, direction):
if direction in ["right", "left"]:
ax.yaxis.set_ticks_position(direction)
ax.yaxis.set_label_position(direction)
elif direction in ["top", "bottom"]:
ax.xaxis.set_ticks_position(direction)
ax.xaxis.set_label_position(direction)
else:
raise ValueError("Unknown Direction : %s" % (direction,))
ax.spines[direction].set_visible(True)
# Create list to store dependent variable data:
y= [0, 0, 0, 0, 0]
# Section 2: Define names of variables and the data to be plotted.
# `labels` stores the names of the independent and dependent variables). The
# first (zeroth) item in the list is the x-axis label; remaining labels are the
# first y-axis label, second y-axis label, and so on. There must be at least
# two dependent variables and not more than four.
labels= ['Date', 'Maximum Temperature', 'Solar Radiation',
'Rainfall', 'Minimum Temperature']
# Plug in your data here, or code equations to generate the data if you wish to
# plot mathematical functions. x stores values of the independent variable;
# y[1], y[2], ... store values of the dependent variable. (y[0] is not used).
# All of these objects should be NumPy arrays.
# If you are plotting mathematical functions, you will probably want an array of
# uniformly spaced values of x; such an array can be created using the
# `linspace` function. For example, to define x as an array of 51 values
# uniformly spaced between 0 and 2, use the following command:
# x= linspace(0., 2., 51)
# Here is an example of 6 experimentally measured y1-values:
# y[1]= array( [3, 2.5, 7.3e4, 4, 8, 3] )
# Note that the above statement requires both parentheses and square brackets.
# With a bit of work, one could make this program read the data from a text file
# or Excel worksheet.
# Independent variable:
x = df.index
# First dependent variable:
y[1]= df['TMAX']
# Second dependent variable:
y[2]= df['RAD']
y[3]= df['RAIN']
y[4]= df['TMIN']
# Set line colors here; each color can be specified using a single-letter color
# identifier ('b'= blue, 'r'= red, 'g'= green, 'k'= black, 'y'= yellow,
# 'm'= magenta, 'y'= yellow), an RGB tuple, or almost any standard English color
# name written without spaces, e.g., 'darkred'. The first element of this list
# is not used.
colors= [' ', '#C82121', '#E48E3C', '#4F88BE', '#CF5ADC']
# Set the line width here. linewidth=2 is recommended.
linewidth= 2
# Section 3: Generate the plot.
N_dependents= len(labels) - 1
if N_dependents > 4: raise Exception, \
'This code currently handles a maximum of four independent variables.'
# Open a new figure window, setting the size to 10-by-7 inches and the facecolor
# to white:
fig= plt.figure(figsize=(16,9), dpi=120, facecolor=[1,1,1])
host= fig.add_subplot(111)
host.set_xlabel(labels[0])
# Use twinx() to create extra axes for all dependent variables except the first
# (we get the first as part of the host axes). The first element of y_axis is
# not used.
y_axis= (N_dependents+2) * [0]
y_axis[1]= host
for i in range(2,len(labels)+1): y_axis[i]= host.twinx()
if N_dependents >= 3:
# The following statement positions the third y-axis to the right of the
# frame, with the space between the frame and the axis controlled by the
# numerical argument to set_position; this value should be between 1.10 and
# 1.2.
y_axis[3].spines["right"].set_position(("axes", 1.15))
make_patch_spines_invisible(y_axis[3])
make_spine_invisible(y_axis[3], "right")
plt.subplots_adjust(left=0.0, right=0.8)
if N_dependents >= 4:
# The following statement positions the fourth y-axis to the left of the
# frame, with the space between the frame and the axis controlled by the
# numerical argument to set_position; this value should be between 1.10 and
# 1.2.
y_axis[4].spines["left"].set_position(("axes", -0.15))
make_patch_spines_invisible(y_axis[4])
make_spine_invisible(y_axis[4], "left")
plt.subplots_adjust(left=0.2, right=0.8)
p= (N_dependents+1) * [0]
# Plot the curves:
for i in range(1,N_dependents+1):
p[i], = y_axis[i].plot(x, y[i], colors[i],
linewidth=linewidth, label=labels[i])
# Set axis limits. Use ceil() to force upper y-axis limits to be round numbers.
host.set_xlim(x.min(), x.max())
host.set_xlabel(labels[0], size=16)
for i in range(1,N_dependents+1):
y_axis[i].set_ylim(0.0, ceil(y[i].max()))
y_axis[i].set_ylabel(labels[i], size=16)
y_axis[i].yaxis.label.set_color(colors[i])
for sp in y_axis[i].spines.itervalues():
sp.set_color(colors[i])
for obj in y_axis[i].yaxis.get_ticklines():
# `obj` is a matplotlib.lines.Line2D instance
obj.set_color(colors[i])
obj.set_markeredgewidth(3)
for obj in y_axis[i].yaxis.get_ticklabels():
obj.set_color(colors[i])
obj.set_size(12)
obj.set_weight(600)
# To enable the legend, uncomment the following two lines:
lines= p[1:]
host.legend(lines, [l.get_label() for l in lines])
plt.draw(); plt.show()
And the output
How can I put the scale on max and min temp on a same scale? Also, how can I get rid of second y-axis with black color, scaled from 0 to 10?
Is there a simpler way to achieve this?

How can I put the scale on max and min temp on a same scale?
Plot them in the same axes.
Also, how can I get rid of second y-axis with black color, scaled from 0 to 10?
Do not create that axes.
You want to plot four variables, two of them can go in the same subplot so you only need three subplots. But you are creating five of them?
Step by step
Keep in mind: different y scales <-> different subplots sharing x-axis.
Two variables with a common scale (left), two variables with independent scales (right).
Create the primary subplot, let's call it ax1. Plot everything you want in it, in this case TMIN and TMAX as stated in your question.
Create a twin subplot sharing x axis twinx(ax=ax1). Plot the third variable, say RAIN.
Create another twin subplot twinx(ax=ax1). Plot the fourth variable 'RAD'.
Adjust colors, labels, spine positions... to your heart's content.
Unsolicited advice: do not try to fix code you don't understand.

Variation of the original plot showing how you can plot variables on multiple axes
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
url ='http://dropproxy.com/f/D34'
df= pd.read_csv(url, index_col=0, parse_dates=[0])
fig = plt.figure()
ax = fig.add_subplot(111) # Primary y
ax2 = ax.twinx() # Secondary y
# Plot variables
ax.plot(df.index, df['TMAX'], color='red')
ax.plot(df.index, df['TMIN'], color='green')
ax2.plot(df.index, df['RAIN'], color='orange')
ax2.plot(df.index, df['RAD'], color='yellow')
# Custom ylimit
ax.set_ylim(0,50)
# Custom x axis date formats
import matplotlib.dates as mdates
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

I modified #bishopo's suggestions to generate what I wanted, however, the plot still needs some tweaking with font sizes for axes label.
Here's what I have done so far.
import pandas as pd
%matplotlib inline
url ='http://dropproxy.com/f/D34'
df= pd.read_csv(url, index_col=0, parse_dates=[0])
from mpl_toolkits.axes_grid1 import host_subplot
import mpl_toolkits.axisartist as AA
import matplotlib.pyplot as plt
if 1:
# Set the figure size, dpi, and background color
fig = plt.figure(1, (16,9),dpi =300, facecolor = 'W',edgecolor ='k')
# Update the tick label size to 12
plt.rcParams.update({'font.size': 12})
host = host_subplot(111, axes_class=AA.Axes)
plt.subplots_adjust(right=0.75)
par1 = host.twinx()
par2 = host.twinx()
par3 = host.twinx()
offset = 60
new_fixed_axis = par2.get_grid_helper().new_fixed_axis
new_fixed_axis1 = host.get_grid_helper().new_fixed_axis
par2.axis["right"] = new_fixed_axis(loc="right",
axes=par2,
offset=(offset, 0))
par3.axis["left"] = new_fixed_axis1(loc="left",
axes=par3,
offset=(-offset, 0))
par2.axis["right"].toggle(all=True)
par3.axis["left"].toggle(all=True)
par3.axis["right"].set_visible(False)
# Set limit on both y-axes
host.set_ylim(-30, 50)
par3.set_ylim(-30,50)
host.set_xlabel("Date")
host.set_ylabel("Minimum Temperature ($^\circ$C)")
par1.set_ylabel("Solar Radiation (W$m^{-2}$)")
par2.set_ylabel("Rainfall (mm)")
par3.set_ylabel('Maximum Temperature ($^\circ$C)')
p1, = host.plot(df.index,df['TMIN'], 'm,')
p2, = par1.plot(df.index, df.RAD, color ='#EF9600', linestyle ='--')
p3, = par2.plot(df.index, df.RAIN, '#09BEEF')
p4, = par3.plot(df.index, df['TMAX'], '#FF8284')
par1.set_ylim(0, 36)
par2.set_ylim(0, 360)
host.legend()
host.axis["left"].label.set_color(p1.get_color())
par1.axis["right"].label.set_color(p2.get_color())
par2.axis["right"].label.set_color(p3.get_color())
par3.axis["left"].label.set_color(p4.get_color())
tkw = dict(size=5, width=1.5)
host.tick_params(axis='y', colors=p1.get_color(), **tkw)
par1.tick_params(axis='y', colors=p2.get_color(), **tkw)
par2.tick_params(axis='y', colors=p3.get_color(), **tkw)
par3.tick_params(axis='y', colors=p4.get_color(), **tkw)
host.tick_params(axis='x', **tkw)
par1.axis["right"].label.set_fontsize(16)
par2.axis["right"].label.set_fontsize(16)
par3.axis["left"].label.set_fontsize(16)
host.axis["bottom"].label.set_fontsize(16)
host.axis["left"].label.set_fontsize(16)
plt.figtext(.5,.92,'Weather Data', fontsize=22, ha='center')
plt.draw()
plt.show()
fig.savefig("Test1.png")
The output

Related

matplotlib visualization- positive negative proportion chart

I'm trying to make the same chart as below and wonder if matplotlib has a similar chart to make that.
The chart below is the result of the STM topic model in the R package
I have probs values using DMR in Python:
array([[0.07204196, 0.04238116],
[0.04518877, 0.30546978],
[0.0587892 , 0.19870868],
[0.16710107, 0.07182639],
[0.128209 , 0.02422131],
[0.15264449, 0.07237352],
[0.2250081 , 0.06986096],
[0.1337716 , 0.10750801],
[0.01197221, 0.06736039],
[0.00527367, 0.04028973]], dtype=float32)
These are the results and left is Negative words and right is Positive
Example of negative positive proportion chart:
It is possible to create something quite close to the image you included. I understood that the right column should be negative while the right column should be positive?
First make the data negative:
import numpy as np
arr = np.array([[0.07204196, 0.04238116],
[0.04518877, 0.30546978],
[0.0587892 , 0.19870868],
[0.16710107, 0.07182639],
[0.128209 , 0.02422131],
[0.15264449, 0.07237352],
[0.2250081 , 0.06986096],
[0.1337716 , 0.10750801],
[0.01197221, 0.06736039],
[0.00527367, 0.04028973]], dtype="float32")
# Make the right col negative
arr[:, 0] *= -1
Then we can plot like so:
from string import ascii_lowercase
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
for y, x in enumerate(arr.flatten()):
# Get a label from the alphabet
label = ascii_lowercase[y]
# Plot the point
ax.plot(x, y, "o", color="black")
# Annotate the point with the label
ax.annotate(label, xy=(x, y), xytext=(x - 0.036, y), verticalalignment="center")
# Add the vertical line at zero
ax.axvline(0, ls="--", color="black", lw=1.25)
# Make the x axis equal
xlim = abs(max(ax.get_xlim(), key=abs))
ax.set_xlim((-xlim, xlim))
# Remove y axis
ax.yaxis.set_visible(False)
# Add two text labels for the x axis
for text, x in zip(["Negative", "Positive"], ax.get_xlim()):
ax.text(x / 2, -3.75, f"{text} Reviews", horizontalalignment="center")
Which outputs:
You can tweak the values in the calls to ax.annotate and ax.text if you need to change the locations of the text on the plot or x-axis.
I'm not sure what the key part of the question is. That is, are you more interested in labeling the individual points based on the category, or if you're more concerned with the unique circle with a line through it. With the array provided it's a little confusing about what the data represents.
What I've assumed is each sublist represents a single category. With that in mind, what I did was make a separate column (delta) for the differences in values and then plotted them vs the index.
# New column (delta) with styling
df['delta'] = df[0]-df[1]
col = np.where(df.delta>0,'g',np.where(df.index<0,'b','r'))
fig, ax = plt.subplots(figsize =(10,7))
# Style it up a bit
plt.title('Differnece in Topic Proportion (Negative vs Positive)')
plt.xlabel('Net Review Score')
plt.ylabel('Index Number')
plt.tight_layout()
plt.savefig("Evolution of rapport of polarisation - (Aluminium).png")
plt.scatter(df['delta'], df.index, s=None, c=col, marker=None, linewidth=2)
plt.axvline(x = 0, color = 'b', label = 'axvline - full height', linestyle="--" )
That gives an out of this:

how to plot horizontal bar plot in loop to change the color on the bar based on the value in another column

I need to plot time(timestamp) vs space(intersectionId) single horizontal bar chart in matplotlib. The color of the bar will be changed at time intervals based on another column which will the currState. The colors will be
red,green,yellow. I have tried to create a dictionary of colors and values but unsure of how to use them in loop to change color based on the value. I have attached a sample csv below along with a code and what I try to achieve and what I have written till now.
category_colors = { 'red' : [2,3] , 'yellow' : [5,6] , 'green' : [7,8]}
date_test = df_sample['timestamp']
y_test = ['123456']
data = np.array(list(df_sample.currState))
fig, ax = plt.subplots(figsize=(10, 1))
ax = plt.barh(y_test,date_test,label="trafficsignal")
data_cum = data.cumsum
plt.xlabel('timestamp')
plt.ylabel('space')
plt.title('TimeSpace')
plt.legend()
plt.show()
timestamp currState IntersectionId
2020-02-26 16:12:13.131484 3 12345
2020-02-26 16:12:14.131484 3 12345
2020-02-26 16:12:15.131484 3 12345
2020-02-26 16:12:16.131484 5 12345
2020-02-26 16:12:17.131484 5 12345
2020-02-26 16:12:18.131484 5 12345
2020-02-26 16:12:19.131484 6 12345
2020-02-26 16:12:20.131484 6 12345
2020-02-26 16:12:21.131484 6 12345
Current plot:
Desired plot:
I am not aware of any plotting package that lets you create this plot in a straightforward way based on how your sample table is structured. One option could be to compute a start and an end variable and then create the plot like in the answers to this question, for example using the Altair Gantt chart like in this answer.
Here, I offer two solutions using matplotlib. By taking a look at the matplotlib gallery, I stumbled on the broken_barh plotting function which provides a way to create a plot like the one you want. There are two main hurdles to overcome when using it:
Deciding what unit to use for the x-axis and computing the xranges argument accordingly;
Creating and formatting the x ticks and tick labels.
Let me first create a sample dataset that resembles yours, note that you will need to adjust the color_dict to your codes:
import numpy as np # v 1.19.2
import pandas as pd # v 1.1.3
import matplotlib.pyplot as plt # v 3.3.2
import matplotlib.dates as mdates
## Create sample dataset
# Light color codes
gre = 1
yel_to_red = 2
red = 3
yel_to_gre = 4
color_dict = {1: 'green', 2: 'yellow', 3: 'red', 4: 'yellow'}
# Light color duration in seconds
sec_g = 45
sec_yr = 3
sec_r = 90
sec_yg = 1
# Light cycle
light_cycle = [gre, yel_to_red, red, yel_to_gre]
sec_cycle = [sec_g, sec_yr, sec_r, sec_yg]
ncycles = 3
sec_total = ncycles*sum(sec_cycle)
# Create variables and store them in a pandas dataframe with the datetime as index
IntersectionId = 12345
currState = np.repeat(ncycles*light_cycle, repeats=ncycles*sec_cycle)
time_sec = pd.date_range(start='2021-01-04 08:00:00', freq='S', periods=sec_total)
df = pd.DataFrame(dict(IntersectionId = np.repeat(12345, repeats=ncycles*sum(sec_cycle)),
currState = currState),
index = time_sec)
The broken_barh function takes the data in the format of tuples where for each colored rectangle that makes up the horizontal bar you need to provide the xy coordinates of the bottom-left corner as well as the length along each axis, like so:
xranges=[(x1_start, x1_length), (x2_start, x2_length), ... ], yranges=(y_all_start, y_all_width)
Note that yranges applies to all rectangles. The unit that is chosen for the x-axis determines how the data must be processed and how the x ticks and tick labels can be created. Here are two alternatives.
Matplotlib broken_barh with matplotlib date number as x-axis scale
In this approach, the timestamps of the rows where the light changes are extracted and then converted to matplotlib date numbers. This makes it possible to use a matplotlib date tick locator and formatter. This approach of using the matplotlib date for the x-axis values to simplify tick formatting was inspired by this answer by ImportanceOfBeingErnest.
For both this solution and the next one, the code for getting the indices of light changes and computing the lengths of the periods is based on this answer by Jaime, thanks to the general idea provided by this Gist by alimanfoo.
## Compute variables needed to define the plotting function arguments
states = np.array(df['currState'])
# Create a list of indices of the rows where the light changes
# (i.e. where a new currState code section starts)
starts_indices = np.where(np.concatenate(([True], states[:-1] != states[1:])))
# Append the last index to be able to compute the duration of the last
# light color period recorded in the dataset
starts_end_indices = np.append(starts_indices, states.size-1)
# Get the timestamps of those rows and convert them to python datetime format
starts_end_pydt = df.index[starts_end_indices].to_pydatetime()
# Convert the python timestamps to matplotlib date number that is used as the
# x-axis unit, this makes it easier to format the tick labels
starts_end_x = mdates.date2num(starts_end_pydt)
# Get the duration of each light color in matplotlib date number units
lengths = np.diff(starts_end_x)
# Add one second (computed in python datetime units) to the duration of
# the last light to make the bar chart left and right inclusive instead
# of just left inclusive
pydt_second = (max(starts_end_x) - min(starts_end_x))/starts_end_indices[-1]
lengths[-1] = lengths[-1] + pydt_second
# Compute the arguments for the broken_barh plotting function
xranges = [(start, length) for start, length in zip(starts_end_x, lengths)]
yranges = (0.75, 0.5)
colors = df['currState'][starts_end_indices[:-1]].map(color_dict)
## Create horizontal bar with colors by using the broken_barh function
## and format ticks and tick labels
fig, ax = plt.subplots(figsize=(10,2))
ax.broken_barh(xranges, yranges, facecolors=colors, zorder=2)
# Create and format x ticks and tick labels
loc = mdates.AutoDateLocator()
ax.xaxis.set_major_locator(loc)
formatter = mdates.AutoDateFormatter(loc)
formatter.scaled[1/(24.*60.)] = '%H:%M:%S' # adjust this according to time range
ax.xaxis.set_major_formatter(formatter)
# Format y-axis and create y tick and tick label
ax.set_ylim(0, 2)
ax.set_yticks([1])
ax.set_yticklabels([df['IntersectionId'][0]])
plt.grid(axis='x', alpha=0.5, zorder=1)
plt.show()
Matplotlib broken_barh with seconds as x-axis scale
This approach takes advantage of the fact that the indices of the table can be used to compute the lights' durations in seconds. The downside is that this time the x ticks and tick labels must be created from scratch. The code is written so that labels automatically have a nice format depending on the total duration covered by the dataset. The only thing that needs adjusting is the number of ticks, as this depends on how wide the figure is.
The code used to automatically select an appropriate time step between ticks is based on this answer by kennytm. The datetime string format codes are listed here.
## Compute the variables needed for the plotting function arguments
## using the currState variable
states = np.array(df['currState'])
# Create list of indices indicating the rows where the currState code
# changes: note the comma to unpack the tuple
starts_indices, = np.where(np.concatenate(([True], states[:-1] != states[1:])))
# Compute durations of each light in seconds
lengths = np.diff(starts_indices, append=states.size)
## Compute the arguments for the plotting function
xranges = [(start, length) for start, length in zip(starts_indices, lengths)]
yranges = (0.75, 0.5)
colors = df['currState'][starts_indices].map(color_dict)
## Create horizontal bar with colors using the broken_barh function
fig, ax = plt.subplots(figsize=(10,2))
ax.broken_barh(xranges, yranges, facecolors=colors, zorder=2)
## Create appropriate x ticks and tick labels
# Define time variable and parameters needed for computations
time = pd.DatetimeIndex(df.index).asi8 // 10**9 # time is in seconds
tmin = min(time)
tmax = max(time)
trange = tmax-tmin
# Choose the approximate number of ticks, the exact number depends on
# the automatically selected time step
approx_nticks = 6 # low number selected because figure width is only 10 inches
round_time_steps = [15, 30, 60, 120, 180, 240, 300, 600, 900, 1800, 3600, 7200, 14400]
time_step = min(round_time_steps, key=lambda x: abs(x - trange//approx_nticks))
# Create list of x ticks including the right boundary of the last time point
# in the dataset regardless of whether not it is aligned with the time step
timestamps = np.append(np.arange(tmin, tmax, time_step), tmax+1)
xticks = timestamps-tmin
ax.set_xticks(xticks)
# Create x tick labels with format depending on time step
fmt_time = '%H:%M:%S' if time_step <= 60 else '%H:%M'
xticklabels = [pd.to_datetime(ts, unit='s').strftime(fmt_time) for ts in timestamps]
ax.set_xticklabels(xticklabels)
## Format y-axis limits, tick and tick label
ax.set_ylim(0, 2)
ax.set_yticks([1])
ax.set_yticklabels([df['IntersectionId'][0]])
plt.grid(axis='x', alpha=0.5, zorder=1)
plt.show()
Further documentation: to_datetime, to_pydatetime, strftime

Plot bar chart using color to represent third dimension

My aim is to show a bar chart with 3-dim data, x, categorical and y1, y2 as continuous series; the bars should have heights from y1 and color to indicate y2.
This does not seem to be particularly obscure to me, but I didn't find a simple / built-in way to use a bar chart to visualise three dimensions -- I'm thinking mostly for exploratory purposes, before investigating relationships more formally.
Am I missing a type of plot in the libraries? Is there a good alternative to showing 3d data?
Anyway here are some things that I've tried that aren't particularly satisfying:
Some data for these attempts
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Example data with explicit (-ve) correlation in the two series
n = 10; sd = 2.5
fruits = [ 'Lemon', 'Cantaloupe', 'Redcurrant', 'Raspberry', 'Papaya',
'Apricot', 'Cherry', 'Durian', 'Guava', 'Jujube']
np.random.seed(101)
cost = np.random.uniform(3, 15, n)
harvest = 50 - (np.random.randn(n) * sd + cost)
df = pd.DataFrame(data={'fruit':fruits, 'cost':cost, 'harvest':harvest})
df.sort_values(by="cost", inplace=True) # preferrable to sort during plot only
# set up several subplots to show progress.
n_colors = 5; cmap_base = "coolwarm" # a diverging map
fig, axs = plt.subplots(3,2)
ax = axs.flat
Attempt 1 uses hue for the 3rd dim data in barplot. However, this produces a single color for each value in the series, and also seems to do odd things with the bar width & spacing.
import seaborn as sns
sns.barplot(ax=ax[0], x='fruit', y='cost', hue='harvest',
data=df, palette=cmap_base)
# fix the sns barplot label orientation
ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=90)
Attempt 2 uses the pandas DataFrame.plot.bar, with a continuous color range, then adds a colorbar (need scalar mappable). I borrowed some techniques from medium post among others.
import matplotlib as mpl
norm = mpl.colors.Normalize(vmin=min(df.harvest), vmax=max(df.harvest), clip=True)
mapper1 = mpl.cm.ScalarMappable(norm=norm, cmap=cmap_base)
colors1 = [mapper1.to_rgba(x) for x in df.harvest]
df.plot.bar(ax=ax[1], x='fruit', y='cost', color=colors1, legend=False)
mapper1._A = []
plt.colorbar(mapper1, ax=ax[1], label='havest')
Attempt 3 builds on this, borrowing from https://gist.github.com/jakevdp/91077b0cae40f8f8244a to facilitate a discrete colormap.
def discrete_cmap(N, base_cmap=None):
"""Create an N-bin discrete colormap from the specified input map"""
# from https://gist.github.com/jakevdp/91077b0cae40f8f8244a
base = plt.cm.get_cmap(base_cmap)
color_list = base(np.linspace(0, 1, N))
cmap_name = base.name + str(N)
return base.from_list(cmap_name, color_list, N)
cmap_disc = discrete_cmap(n_colors, cmap_base)
mapper2 = mpl.cm.ScalarMappable(norm=norm, cmap=cmap_disc)
colors2 = [mapper2.to_rgba(x) for x in df.harvest]
df.plot.bar(ax=ax[2], x='fruit', y='cost', color=colors2, legend=False)
mapper2._A = []
cb = plt.colorbar(mapper2, ax=ax[2], label='havest')
cb.set_ticks(np.linspace(*cb.get_clim(), num=n_colors+1)) # indicate color boundaries
cb.set_ticklabels(["{:.0f}".format(t) for t in cb.get_ticks()]) # without too much precision
Finally, attempt 4 gives in to trying 3d in one plot and present in 2 parts.
sns.barplot(ax=ax[4], x='fruit', y='cost', data=df, color='C0')
ax[4].set_xticklabels(ax[4].get_xticklabels(), rotation=90)
sns.regplot(x='harvest', y='cost', data=df, ax=ax[5])
(1) is unusable - I'm clearly not using as intended. (2) is ok with 10 series but with more series is harder to tell whether a given sample is above/below average, for instance. (3) is quite nice and scales to 50 bars ok, but it is far from "out-of-the-box", too involved for a quick analysis. Moreover, the sm._A = [] seems like a hack but the code fails without it. Perhaps the solution in a couple of lines in (4) is a better way to go.
To come back to the question again: Is it possible easily produce a bar chart that displays 3d data? I've focused on using a small number of colors for the 3rd dimension for easier identification of trends, but I'm open to other suggestions.
I've posted a solution as well, which uses a lot of custom code to achieve what I can't really believe is not built in some graphing library of python.
edit:
the following code, using R's ggplot gives a reasonable approximation to (2) with built-in commands.
ggplot(data = df, aes(x =reorder(fruit, +cost), y = cost, fill=harvest)) +
geom_bar(data=df, aes(fill=harvest), stat='identity') +
scale_fill_gradientn(colours=rev(brewer.pal(7,"RdBu")))
The first 2 lines are more or less the minimal code for barplot, and the third changes the color palette.
So if this ease were available in python I'd love to know about it!
I'm posting an answer that does solve my aims of being simple at the point of use, still being useful with ~100 bars, and by leveraging the Fisher-Jenks 1d classifier from PySAL ends up handling outliers quite well (post about d3 coloring)
-- but overall is quite involved (50+ lines in the BinnedColorScaler class, posted at the bottom).
# set up the color binner
quantizer = BinnedColorScaler(df.harvest, k=5, cmap='coolwarm' )
# and plot dataframe with it.
df.plot.bar(ax=ax, x='fruit', y='cost',
color=df.harvest.map(quantizer.map_by_class))
quantizer.add_legend(ax, title='harvest') # show meaning of bins in legend
Using the following class that uses a nice 1d classifier from PySAL and borrows ideas from geoplot/geopandas libraries.
from pysal.esda.mapclassify import Fisher_Jenks
class BinnedColorScaler(object):
'''
give this an array-like data set, a bin count, and a colormap name, and it
- quantizes the data
- provides a bin lookup and a color mapper that can be used by pandas for selecting artist colors
- provides a method for a legend to display the colors and bin ranges
'''
def __init__(self, values, k=5, cmap='coolwarm'):
self.base_cmap = plt.cm.get_cmap(cmap) # can be None, text, or a cmap instane
self.bin_colors = self.base_cmap(np.linspace(0, 1, k)) # evenly-spaced colors
# produce bins - see _discrete_colorize in geoplot.geoplot.py:2372
self.binning = Fisher_Jenks(np.array(values), k)
self.bin_edges = np.array([self.binning.yb.min()] + self.binning.bins.tolist())
# some text for the legend (as per geopandas approx)
self.categories = [
'{0:.2f} - {1:.2f}'.format(self.bin_edges[i], self.bin_edges[i + 1])
for i in xrange(len(self.bin_edges) - 1)]
def map_by_class(self, val):
''' return a color for a given data value '''
#bin_id = self.binning.find_bin(val)
bin_id = self.find_bin(val)
return self.bin_colors[bin_id]
def find_bin(self, x):
''' unfortunately the pysal implementation seems to fail on bin edge
cases :(. So reimplement with the way we expect here.
'''
# wow, subtle. just <= instead of < in the uptos
x = np.asarray(x).flatten()
uptos = [np.where(value <= self.binning.bins)[0] for value in x]
bins = [v.min() if v.size > 0 else len(self.bins)-1 for v in uptos] #bail upwards
bins = np.asarray(bins)
if len(bins) == 1:
return bins[0]
else:
return bins
def add_legend(self, ax, title=None, **kwargs):
''' add legend showing the discrete colors and the corresponding data range '''
# following the geoplot._paint_hue_legend functionality, approx.
# generate a patch for each color in the set
artists, labels = [], []
for i in xrange(len(self.bin_colors)):
labels.append(self.categories[i])
artists.append(mpl.lines.Line2D(
(0,0), (1,0), mfc='none', marker='None', ls='-', lw=10,
color=self.bin_colors[i]))
return ax.legend(artists, labels, fancybox=True, title=title, **kwargs)

Setting xaxis values to %.2f on a bar graph

I have a bar graph with multiple data series and i want to set the xaxis values to a significant value of %.2f I already tried using the set_major formatter for the first graph, but it resets the values to 0, while the values should be like the second graph.
How can I fix this?
My code look like this:
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.ticker as mtick
# select the measurement location
MATH = "import/data/place"
SAVE = "save/location"
fig, axes = plt.subplots(figsize=(12,15),nrows=2, ncols=1) # size of the plots and the placing
fig.subplots_adjust(hspace=0.5) # set space between plots
DATA = pd.read_csv(MATH,delimiter=',',usecols = [2,3,4,5,6,7,8,9,10,11,12],names = ['set_t','set_rh',
'type','math','ref','LUFFT','VPL','VPR','VVL','VVR','PRO'], parse_dates=True)
# select the data
temp = DATA.loc[(DATA['type']=='T')&(DATA['math']=='dif')] # dif temperature data
rh = DATA.loc[((DATA['type']=='RH')&(DATA['math']=='dif'))] # dif relative humidity data
# plot temperature
fg = temp.plot.bar(x='set_t',y = ['LUFFT','VPL','VPR','VVL','VVR','PRO'],
color = ['b','firebrick','orange','forestgreen','darkturquoise','indigo'],
ax=axes[0])
fg.grid(True)
fg.set_ylabel('$ΔT$(°C)',fontsize = 12)
fg.set_xlabel('ref $T$ (°C)',fontsize = 12)
fg.set_title('Difference in T from reference at constant relative humidity 50%',fontsize = 15)
fg.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.2f'))
fg.xaxis.set_major_formatter(mtick.FormatStrFormatter('%.2f'))
# plot relative humidity
df = rh.plot.bar(x='set_t',y = ['LUFFT','VPL','VPR','VVL','VVR','PRO'],
color = ['b','firebrick','orange','forestgreen','darkturquoise','indigo'],
ax=axes[1])
df.grid(True)
df.set_ylabel('$ΔU$(%)',fontsize = 12)
df.set_xlabel('ref $T$ (°C)',fontsize = 12)
df.set_title('Difference in U from reference at constant relative humidity 50%',fontsize = 15)
plt.tight_layout()
plt.savefig(SAVE + "_example.jpg")
plt.show()
A sample of my data:
07:40:00,07:50:00,39.85716354999982,51.00504745588235,T,dif,,0.14283645000018197,-0.07502069285698099,-0.15716354999978677,0.0020201234696060055,-0.07111703837193772,-0.0620802166664447,
07:40:00,07:50:00,39.85716354999982,51.00504745588235,RH,dif,,-0.40504745588239643,3.994952544117652,2.994952544117652,4.994952544117652,,6.994952544117652,
08:40:00,08:50:00,34.861160704969016,51.1297401832298,T,dif,,0.22883929503095857,0.2509082605481865,-0.2575243413326831,0.24864321659958222,0.14092262836431502,-0.04441070496899613,
08:40:00,08:50:00,34.861160704969016,51.1297401832298,RH,dif,,-0.32974018322978793,3.8702598167702007,2.8702598167702007,4.870259816770201,,6.870259816770201,
This is due to the fact that with a grouped barplot like this, made by Pandas, the x-axes loses its actual 'range', and the values associated with the tick position become the position itself. That's a bit cryptic, but you can see with fg.get_xlim() that the values have lost 'touch' with the original data, and are simply increasing integers. You can explore/debug the 'values' and 'positions' Matplotlib uses if you provide a FuncFormatter with a function like this:
def check_pos(val, pos):
print(val, pos)
return '%.2f' % val
This basically shows that no formatter is going to work for your case.
Luckily the ticklabels are set correctly (as text), so you could parse these to float, and format them as you wish.
Remove your formatter altogether, and set the xticklabels with:
fg.set_xticklabels(['%.2f' % float(x.get_text()) for x in fg.get_xticklabels()])
Note that Matplotlib itself is perfectly capable of preserving the correct tickvalues in combination with a bar plot, but you would have to do the 'grouping' etc yourself, so that's not very convenient as well.

Add Legend to Seaborn point plot

I am plotting multiple dataframes as point plot using seaborn. Also I am plotting all the dataframes on the same axis.
How would I add legend to the plot ?
My code takes each of the dataframe and plots it one after another on the same figure.
Each dataframe has same columns
date count
2017-01-01 35
2017-01-02 43
2017-01-03 12
2017-01-04 27
My code :
f, ax = plt.subplots(1, 1, figsize=figsize)
x_col='date'
y_col = 'count'
sns.pointplot(ax=ax,x=x_col,y=y_col,data=df_1,color='blue')
sns.pointplot(ax=ax,x=x_col,y=y_col,data=df_2,color='green')
sns.pointplot(ax=ax,x=x_col,y=y_col,data=df_3,color='red')
This plots 3 lines on the same plot. However the legend is missing. The documentation does not accept label argument .
One workaround that worked was creating a new dataframe and using hue argument.
df_1['region'] = 'A'
df_2['region'] = 'B'
df_3['region'] = 'C'
df = pd.concat([df_1,df_2,df_3])
sns.pointplot(ax=ax,x=x_col,y=y_col,data=df,hue='region')
But I would like to know if there is a way to create a legend for the code that first adds sequentially point plot to the figure and then add a legend.
Sample output :
I would suggest not to use seaborn pointplot for plotting. This makes things unnecessarily complicated.
Instead use matplotlib plot_date. This allows to set labels to the plots and have them automatically put into a legend with ax.legend().
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
date = pd.date_range("2017-03", freq="M", periods=15)
count = np.random.rand(15,4)
df1 = pd.DataFrame({"date":date, "count" : count[:,0]})
df2 = pd.DataFrame({"date":date, "count" : count[:,1]+0.7})
df3 = pd.DataFrame({"date":date, "count" : count[:,2]+2})
f, ax = plt.subplots(1, 1)
x_col='date'
y_col = 'count'
ax.plot_date(df1.date, df1["count"], color="blue", label="A", linestyle="-")
ax.plot_date(df2.date, df2["count"], color="red", label="B", linestyle="-")
ax.plot_date(df3.date, df3["count"], color="green", label="C", linestyle="-")
ax.legend()
plt.gcf().autofmt_xdate()
plt.show()
In case one is still interested in obtaining the legend for pointplots, here a way to go:
sns.pointplot(ax=ax,x=x_col,y=y_col,data=df1,color='blue')
sns.pointplot(ax=ax,x=x_col,y=y_col,data=df2,color='green')
sns.pointplot(ax=ax,x=x_col,y=y_col,data=df3,color='red')
ax.legend(handles=ax.lines[::len(df1)+1], labels=["A","B","C"])
ax.set_xticklabels([t.get_text().split("T")[0] for t in ax.get_xticklabels()])
plt.gcf().autofmt_xdate()
plt.show()
Old question, but there's an easier way.
sns.pointplot(x=x_col,y=y_col,data=df_1,color='blue')
sns.pointplot(x=x_col,y=y_col,data=df_2,color='green')
sns.pointplot(x=x_col,y=y_col,data=df_3,color='red')
plt.legend(labels=['legendEntry1', 'legendEntry2', 'legendEntry3'])
This lets you add the plots sequentially, and not have to worry about any of the matplotlib crap besides defining the legend items.
I tried using Adam B's answer, however, it didn't work for me. Instead, I found the following workaround for adding legends to pointplots.
import matplotlib.patches as mpatches
red_patch = mpatches.Patch(color='#bb3f3f', label='Label1')
black_patch = mpatches.Patch(color='#000000', label='Label2')
In the pointplots, the color can be specified as mentioned in previous answers. Once these patches corresponding to the different plots are set up,
plt.legend(handles=[red_patch, black_patch])
And the legend ought to appear in the pointplot.
This goes a bit beyond the original question, but also builds on #PSub's response to something more general---I do know some of this is easier in Matplotlib directly, but many of the default styling options for Seaborn are quite nice, so I wanted to work out how you could have more than one legend for a point plot (or other Seaborn plot) without dropping into Matplotlib right at the start.
Here's one solution:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# We will need to access some of these matplotlib classes directly
from matplotlib.lines import Line2D # For points and lines
from matplotlib.patches import Patch # For KDE and other plots
from matplotlib.legend import Legend
from matplotlib import cm
# Initialise random number generator
rng = np.random.default_rng(seed=42)
# Generate sample of 25 numbers
n = 25
clusters = []
for c in range(0,3):
# Crude way to get different distributions
# for each cluster
p = rng.integers(low=1, high=6, size=4)
df = pd.DataFrame({
'x': rng.normal(p[0], p[1], n),
'y': rng.normal(p[2], p[3], n),
'name': f"Cluster {c+1}"
})
clusters.append(df)
# Flatten to a single data frame
clusters = pd.concat(clusters)
# Now do the same for data to feed into
# the second (scatter) plot...
n = 8
points = []
for c in range(0,2):
p = rng.integers(low=1, high=6, size=4)
df = pd.DataFrame({
'x': rng.normal(p[0], p[1], n),
'y': rng.normal(p[2], p[3], n),
'name': f"Group {c+1}"
})
points.append(df)
points = pd.concat(points)
# And create the figure
f, ax = plt.subplots(figsize=(8,8))
# The KDE-plot generates a Legend 'as usual'
k = sns.kdeplot(
data=clusters,
x='x', y='y',
hue='name',
shade=True,
thresh=0.05,
n_levels=2,
alpha=0.2,
ax=ax,
)
# Notice that we access this legend via the
# axis to turn off the frame, set the title,
# and adjust the patch alpha level so that
# it closely matches the alpha of the KDE-plot
ax.get_legend().set_frame_on(False)
ax.get_legend().set_title("Clusters")
for lh in ax.get_legend().get_patches():
lh.set_alpha(0.2)
# You would probably want to sort your data
# frame or set the hue and style order in order
# to ensure consistency for your own application
# but this works for demonstration purposes
groups = points.name.unique()
markers = ['o', 'v', 's', 'X', 'D', '<', '>']
colors = cm.get_cmap('Dark2').colors
# Generate the scatterplot: notice that Legend is
# off (otherwise this legend would overwrite the
# first one) and that we're setting the hue, style,
# markers, and palette using the 'name' parameter
# from the data frame and the number of groups in
# the data.
p = sns.scatterplot(
data=points,
x="x",
y="y",
hue='name',
style='name',
markers=markers[:len(groups)],
palette=colors[:len(groups)],
legend=False,
s=30,
alpha=1.0
)
# Here's the 'magic' -- we use zip to link together
# the group name, the color, and the marker style. You
# *cannot* retreive the marker style from the scatterplot
# since that information is lost when rendered as a
# PathCollection (as far as I can tell). Anyway, this allows
# us to loop over each group in the second data frame and
# generate a 'fake' Line2D plot (with zero elements and no
# line-width in our case) that we can add to the legend. If
# you were overlaying a line plot or a second plot that uses
# patches you'd have to tweak this accordingly.
patches = []
for x in zip(groups, colors[:len(groups)], markers[:len(groups)]):
patches.append(Line2D([0],[0], linewidth=0.0, linestyle='',
color=x[1], markerfacecolor=x[1],
marker=x[2], label=x[0], alpha=1.0))
# And add these patches (with their group labels) to the new
# legend item and place it on the plot.
leg = Legend(ax, patches, labels=groups,
loc='upper left', frameon=False, title='Groups')
ax.add_artist(leg);
# Done
plt.show();
Here's the output:

Categories

Resources