I have a dataset with a lot of categorical variables and a binary target variable. What package is available in Python or other opensource GUI-based software where I can scatterplot two categorical variables on the X and Y axis and use the target variable as hue?
I have looked at Seaborn's catplot, but for that, one axis has to be numerical while the other categorical. So it doesn't serve this case.
For example, you can use the following:
import seaborn as sns
data = sns.load_dataset('titanic')
Here are the plot features I want
X-axis - 'embark_town'
Y-axis - 'class'
hue - 'alive'
I am of the opinion that if you have to rearrange a seaborn graph substantially, you can also create this graph from scratch with matplotlib. This gives us the opportunity to have a different approach to display this categorical vs categorical plot:
import matplotlib.pyplot as plt
from matplotlib.markers import MarkerStyle
import numpy as np
#dataframe and categories
import seaborn as sns
df = sns.load_dataset('titanic')
X = "embark_town"
Y = "class"
H = "alive"
bin_dic = {0: "yes", 1: "no"}
#counting the X-Y-H category entries
plt_df = df.groupby([X, Y, H]).size().to_frame(name="vals").reset_index()
#figure preparation with grid and scaling
fig, ax = plt.subplots(figsize=(9, 6))
ax.set_ylim(plt_df[Y].unique().size-0.5, -0.5)
ax.set_xlim(-0.5, plt_df[X].unique().size+1.0)
ax.grid(ls="--")
#upscale factor for scatter marker size
scale=10000/plt_df.vals.max()
#left marker for category 0
ax.scatter(plt_df[plt_df[H]==bin_dic[0]][X],
plt_df[plt_df[H]==bin_dic[0]][Y],
s=plt_df[plt_df[H]==bin_dic[0]].vals*scale,
c=[(0, 0, 1, 0.5)], edgecolor="black", marker=MarkerStyle("o", fillstyle="left"),
label=bin_dic[0])
#right marker for category 1
ax.scatter(plt_df[plt_df[H]==bin_dic[1]][X],
plt_df[plt_df[H]==bin_dic[1]][Y],
s=plt_df[plt_df[H]==bin_dic[1]].vals*scale,
c=[(1, 0, 0, 0.5)], edgecolor="black", marker=MarkerStyle("o", fillstyle="right"),
label=bin_dic[1])
#legend entries for the two categories
l = ax.legend(title="Survived the catastrophe", ncol=2, framealpha=0, loc="upper right", columnspacing=0.1,labelspacing=1.5)
l.legendHandles[0]._sizes = l.legendHandles[1]._sizes = [800]
#legend entries representing sizes
bubbles_n=5
bubbles_min = 50*(1+plt_df.vals.min()//50)
bubbles_step = 10*((plt_df.vals.max()-bubbles_min)//(10*(bubbles_n-1)))
bubbles_x = plt_df[X].unique().size+0.5
for i, bubbles_y in enumerate(np.linspace(0.5, plt_df[Y].unique().size-1, bubbles_n)):
#plot each legend bubble to indicate different marker sizes
ax.scatter(bubbles_x,
bubbles_y,
s=(bubbles_min + i*bubbles_step) * scale,
c=[(1, 0, 1, 0.6)], edgecolor="black")
#and label it with a value
ax.annotate(bubbles_min+i*bubbles_step, xy=(bubbles_x, bubbles_y),
ha="center", va="center",
fontsize="large", fontweight="bold", color="white")
plt.show()
Seaborn supports, just like matplotlib, the plotting of categorical vs categorical variables. One can create semitransparent markers that allow to see both categories, although this might be difficult to distinguish from one marker if both are of similar size. The essential plot is rather easy - we transform the dataframe with groupby and size to count the entries per triplet embarking town - class - alive category, then create a scatterplot with count value as markersize. However, the legend entry is the complicated part here. Either the markersize is tiny in the plot or massive in the legend. I tried to balance this but I am not happy with the result. A lot of manual adjusting necessary here, so seaborn is no real advantage here. Any suggestions on how to simplify this within seaborn are welcome.
import seaborn as sns
import matplotlib.pyplot as plt
#dataframe and categories
df = sns.load_dataset('titanic')
X = "embark_town"
Y = "class"
H = "alive"
#counting the X-Y-H category entries
plt_df = df.groupby([X, Y, H]).size().to_frame(name="people").reset_index()
#figure preparation with grid and scaling
fig, ax = plt.subplots(figsize=(6,4))
ax.set_ylim(plt_df[Y].unique().size-0.5, -0.5)
ax.set_xlim(-0.5, plt_df[X].unique().size+1.0)
ax.grid(ls="--")
#the actual scatterplot with markersize representing the counted values
sns.scatterplot(x=X,
y=Y,
size="people",
sizes=(100, 10000),
alpha=0.5,
edgecolor="black",
hue=H,
data=plt_df,
ax=ax)
#creating two legends because the hue markers differ in size from the others
handles, labels = ax.get_legend_handles_labels()
l = ax.legend(handles[:3], labels[:3], title="The poor die first", markerscale=2, loc="upper right")
ax.add_artist(l)
#and seaborn plots the size markers in black, so you would get massive black blobs in the legend
#we change the color and make them transparent
for handle in handles:
handle.set_facecolors((0, 1, 1, 0.5))
ax.legend(handles[4::2], labels[4::2], title="N° of people", loc="lower right", handletextpad=4, labelspacing=3, markerfirst=False)
plt.tight_layout()
plt.show()
Sample output:
Related
I am having an issue trying to superimpose plots with seaborn. I am able to generate the two plots separetly as
fig, (ax1,ax2) = plt.subplots(ncols=2,figsize=(30, 7))
sns.lineplot(data=data1, y='MSE',x='pct_gc',ax=ax1)
sns.boxplot(x="pct_gc", y="MSE", data=data2,ax=ax2,width=0.4)
The output looks like this:
But when i try to put both plots superimposed, but assiging both to the same ax object.
fig, (ax1,ax2) = plt.subplots(ncols=2,figsize=(30, 7))
sns.lineplot(data=data1, y='MSE',x='pct_gc',ax=ax1)
sns.boxplot(x="pct_gc", y="MSE", data=data2,ax=ax2,width=0.4)
I am not able to identify with the X axis in the Lineplot changes when superimposing both plots (both plots X axis go from 0 to 0.069).
My goal is for both plots to be superimposed, while keeping the same X axis range.
Seaborn's boxplot creates categorical x-axis, with all boxes nicely with the same distance. Internally the x-axis is numbered as 0, 1, 2, ... but externally it gets the labels from 0 to 0.069.
To combine a line plot with a boxplot, matplotlib's boxplot can be addressed directly, so that positions and widths can be set explicitly. When patch_artist=True, a rectangle is created (instead of just lines), for which a facecolor can be given. manage_ticks=False prevents that boxplot changes the x ticks and their limits. Optionally notch=True would accentuate the median a bit more, but depending on the data, the confidence interval might be too large and look weird.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
data1 = pd.DataFrame({'pct_gc': np.linspace(0, 0.069, 200), 'MSE': np.random.normal(0.02, 0.1, 200).cumsum()})
data1['pct_range'] = pd.cut(data1['pct_gc'], 10)
fig, ax1 = plt.subplots(ncols=1, figsize=(20, 7))
sns.lineplot(data=data1, y='MSE', x='pct_gc', ax=ax1)
for interval, color in zip(np.unique(data1['pct_range']), plt.cm.tab10.colors):
ax1.boxplot(data1[data1['pct_range'] == interval]['MSE'],
positions=[interval.mid], widths=0.4 * interval.length,
patch_artist=True, boxprops={'facecolor': color},
notch=False, medianprops={'color':'yellow', 'linewidth':2},
manage_ticks=False)
plt.show()
I have created the following dataframe based on a range of data.
df['data_classification'] = df.myDatarange.apply(lambda a:'Very good' if a>=-90
else ('Good' if (a>= -100 or a<=-91)
else ('Moderate' if (a>= -110 or a<=-101)
else ('Poor' if (a>= -123 or a<=-111)
else ('Bad' if (a>= -140 or a<=-124)
else 'Off' )))))
I am planning to plot myDatarange with data_classification and somehow show the relation with different colour. I am very confused how to plot this.
I can plot myDatarange as a single lineplot, but how to relate the two data?
So far, I have tried the following:
x1 = df1.index
y1 = df1.myDatarange
f, (ax1,ax2) = plt.subplots(2,figsize=(5, 5))
ax1.plot(x1,y1,color='red', linewidth=1.9, alpha=0.9, label="myDataRange")
plt.show()
How can I plot the above range of data based on classification as area plot? Is there a better way than area plot to express my data? There are examples on the net, but not very clear on conditional side of it.
Seaborn's barplot can take a hue parameter to color each bar corresponding to the 'data_classification'. The new 'data_classification' column can be created quicker and easier to modify via pd.cut.
The barplot can be used as background for the lineplot to show the classification of each value.
Here is an example to get you started:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
df = pd.DataFrame({'myDatarange': np.random.randint(-150, -50, size=50)})
ranges = [-10**6, -140, -123, -110, -100, -90, 10**6]
df['data_classification'] = pd.cut(df['myDatarange'], ranges, right=False,
labels=['Off', 'Bad', 'Poor', 'Moderate', 'Good', 'Very Good'])
fig, ax1 = plt.subplots(figsize=(12, 4))
ax1.plot(df.index, df['myDatarange'], color='blue', linewidth=2, alpha=0.9, label="myDataRange")
sns.barplot(x=df.index, y=[df['myDatarange'].min()] * len(df),
hue='data_classification', alpha=0.5, palette='inferno', dodge=False, data=df, ax=ax1)
for bar in ax1.patches: # optionally set the bars to fill the complete background, default seaborn sets the width to about 80%
bar.set_width(1)
plt.legend(bbox_to_anchor=(1.02, 1.05) , loc='upper left')
plt.tight_layout()
plt.show()
PS: If you want to the 0 at the bottom (now at the top due to the negative y-values), you could call ax.invert_yaxis().
The inspiration for this question comes from breaks in pheatmap in R. The question is if I can define how "rough", how continuous/discrete my colouring and binning is in seaborn's heatmap. I have found a way how to do it using cmap and the number of colours used (eg. Discrete legend in seaborn heatmap plot). However, I have no idea how the assignment to those colour groups is done.
So the questions are, how are the data binned if I use cmap and force seaborn to use only a discrete set of colours=bins? How can I set it manually? Eg. in the case of R, I can set the breaks to be from 0 to 800 by step of 100 and pass it to the "breaks" argument.
breaksList = seq(0, 800, by = 100)
It is quite simple to use cmap and number of colours if my scale is linear but if I would like to have my bins=colorbar logarithmic or just not equally spaced, how would I do that?
To give a concrete example, let me present the example of flights data set. On the left is the original default plot, on the right, I choose 5 colours to have 5 bins. So how are defined the edges of those bins? Can I reset them so that I have eg. bins 0-200, 200-300, 300-400, 400-600, above 600?
(I'm intentionally using unequal bins to show what I mean.)
# choose 5 colours to create 5 bins
cmap = sns.color_palette('rocket', n_colors=5)
# run this without the cmap argument to get the first image
flights = sns.load_dataset("flights")
flights = flights.pivot("month", "year", "passengers")
ax = sns.heatmap(flights, cmap = cmap)
Thanks.
It seems there is a bug in seaborn that prevents the below idea from working correctly, so falling back to pure matplotlib:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn as sns
# choose 5 colours to create 5 bins
colors = sns.color_palette('rocket', 5)
levels = [0, 200, 300, 400, 600]
cmap, norm = matplotlib.colors.from_levels_and_colors(levels, colors, extend="max")
flights = sns.load_dataset("flights")
flights = flights.pivot("month", "year", "passengers")
fig, ax = plt.subplots()
im = ax.imshow(flights.values, cmap = cmap, norm=norm)
ax.set(xticks=range(flights.shape[1]), yticks=range(flights.shape[0]),
xticklabels=flights.columns, yticklabels=flights.index)
ax.tick_params(axis="x", rotation=90)
fig.colorbar(im, ax=ax, spacing="propotional")
plt.show()
I have written the following minimal Python code in order to plot various functions of x on the same X-axis.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from cycler import cycler
cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
xlabel='$X$'; ylabel='$Y$'
### Set tick features
plt.tick_params(axis='both',which='major',width=2,length=10,labelsize=18)
plt.tick_params(axis='both',which='minor',width=2,length=5)
#plt.set_axis_bgcolor('grey') # Doesn't work if I uncomment!
lines = ["-","--","-.",":"]
Nlayer=4
f, axarr = plt.subplots(Nlayer, sharex=True)
for a in range(1,Nlayer+1):
X = np.linspace(0,10,100)
Y = X**a
index = a-1 + np.int((a-1)/Nlayer)
axarr[a-1].plot(X, Y, linewidth=2.0+index, color=cycle[a], linestyle = lines[index], label='Layer = {}'.format(a))
axarr[a-1].legend(loc='upper right', prop={'size':6})
#plt.legend()
# Axes labels
plt.xlabel(xlabel, fontsize=20)
plt.ylabel(ylabel, fontsize=20)
plt.show()
However, the plots don't join together on the X-axis and I failed to get a common Y-axis label. It actually labels for the last plot (see attached figure). I also get a blank plot additionally which I couldn't get rid of.
I am using Python3.
The following code will produce the expected output :
without blank plot which was created because of the two plt.tick_params calls before creating the actual fig
with the gridspec_kw argument of subplots that allows you to control the space between rows and cols of subplots environment in order to join the different layer plots
with unique and centered common ylabel using fig.text with relative positioning and rotation argument (same thing is done to xlabel to get an homogeneous final result). One may note that, it can also be done by repositioning the ylabel with ax.yaxis.set_label_coords() after an usual call like ax.set_ylabel().
import numpy as np
import matplotlib.pyplot as plt
cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
xlabel='$X$'; ylabel='$Y$'
lines = ["-","--","-.",":"]
Nlayer = 4
fig, axarr = plt.subplots(Nlayer, sharex='col',gridspec_kw={'hspace': 0, 'wspace': 0})
X = np.linspace(0,10,100)
for i,ax in enumerate(axarr):
Y = X**(i+1)
ax.plot(X, Y, linewidth=2.0+i, color=cycle[i], linestyle = lines[i], label='Layer = {}'.format(i+1))
ax.legend(loc='upper right', prop={'size':6})
with axes labels, first option :
fig.text(0.5, 0.01, xlabel, va='center')
fig.text(0.01, 0.5, ylabel, va='center', rotation='vertical')
or alternatively :
# ax is here, the one of the last Nlayer plotted, i.e. Nlayer=4
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
# change y positioning to be in the horizontal center of all Nlayer, i.e. dynamically Nlayer/2
ax.yaxis.set_label_coords(-0.1,Nlayer/2)
which gives :
I also simplified your for loop by using enumerate to have an automatic counter i when looping over axarr.
I have a Pandas DataFrame with a column called "AXLES", which can take an integer value between 3-12. I am trying to use Seaborn's countplot() option to achieve the following plot:
left y axis shows the frequencies of these values occurring in the data. The axis extends are [0%-100%], tick marks at every 10%.
right y axis shows the actual counts, values correspond to tick marks determined by the left y axis (marked at every 10%.)
x axis shows the categories for the bar plots [3, 4, 5, 6, 7, 8, 9, 10, 11, 12].
Annotation on top of the bars show the actual percentage of that category.
The following code gives me the plot below, with actual counts, but I could not find a way to convert them into frequencies. I can get the frequencies using df.AXLES.value_counts()/len(df.index) but I am not sure about how to plug this information into Seaborn's countplot().
I also found a workaround for the annotations, but I am not sure if that is the best implementation.
Any help would be appreciated!
Thanks
plt.figure(figsize=(12,8))
ax = sns.countplot(x="AXLES", data=dfWIM, order=[3,4,5,6,7,8,9,10,11,12])
plt.title('Distribution of Truck Configurations')
plt.xlabel('Number of Axles')
plt.ylabel('Frequency [%]')
for p in ax.patches:
ax.annotate('%{:.1f}'.format(p.get_height()), (p.get_x()+0.1, p.get_height()+50))
EDIT:
I got closer to what I need with the following code, using Pandas' bar plot, ditching Seaborn. Feels like I'm using so many workarounds, and there has to be an easier way to do it. The issues with this approach:
There is no order keyword in Pandas' bar plot function as Seaborn's countplot() has, so I cannot plot all categories from 3-12 as I did in the countplot(). I need to have them shown even if there is no data in that category.
The secondary y-axis messes up the bars and the annotation for some reason (see the white gridlines drawn over the text and bars).
plt.figure(figsize=(12,8))
plt.title('Distribution of Truck Configurations')
plt.xlabel('Number of Axles')
plt.ylabel('Frequency [%]')
ax = (dfWIM.AXLES.value_counts()/len(df)*100).sort_index().plot(kind="bar", rot=0)
ax.set_yticks(np.arange(0, 110, 10))
ax2 = ax.twinx()
ax2.set_yticks(np.arange(0, 110, 10)*len(df)/100)
for p in ax.patches:
ax.annotate('{:.2f}%'.format(p.get_height()), (p.get_x()+0.15, p.get_height()+1))
You can do this by making a twinx axes for the frequencies. You can switch the two y axes around so the frequencies stay on the left and the counts on the right, but without having to recalculate the counts axis (here we use tick_left() and tick_right() to move the ticks and set_label_position to move the axis labels
You can then set the ticks using the matplotlib.ticker module, specifically ticker.MultipleLocator and ticker.LinearLocator.
As for your annotations, you can get the x and y locations for all 4 corners of the bar with patch.get_bbox().get_points(). This, along with setting the horizontal and vertical alignment correctly, means you don't need to add any arbitrary offsets to the annotation location.
Finally, you need to turn the grid off for the twinned axis, to prevent grid lines showing up on top of the bars (ax2.grid(None))
Here is a working script:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.ticker as ticker
# Some random data
dfWIM = pd.DataFrame({'AXLES': np.random.normal(8, 2, 5000).astype(int)})
ncount = len(dfWIM)
plt.figure(figsize=(12,8))
ax = sns.countplot(x="AXLES", data=dfWIM, order=[3,4,5,6,7,8,9,10,11,12])
plt.title('Distribution of Truck Configurations')
plt.xlabel('Number of Axles')
# Make twin axis
ax2=ax.twinx()
# Switch so count axis is on right, frequency on left
ax2.yaxis.tick_left()
ax.yaxis.tick_right()
# Also switch the labels over
ax.yaxis.set_label_position('right')
ax2.yaxis.set_label_position('left')
ax2.set_ylabel('Frequency [%]')
for p in ax.patches:
x=p.get_bbox().get_points()[:,0]
y=p.get_bbox().get_points()[1,1]
ax.annotate('{:.1f}%'.format(100.*y/ncount), (x.mean(), y),
ha='center', va='bottom') # set the alignment of the text
# Use a LinearLocator to ensure the correct number of ticks
ax.yaxis.set_major_locator(ticker.LinearLocator(11))
# Fix the frequency range to 0-100
ax2.set_ylim(0,100)
ax.set_ylim(0,ncount)
# And use a MultipleLocator to ensure a tick spacing of 10
ax2.yaxis.set_major_locator(ticker.MultipleLocator(10))
# Need to turn the grid on ax2 off, otherwise the gridlines end up on top of the bars
ax2.grid(None)
plt.savefig('snscounter.pdf')
I got it to work using core matplotlib's bar plot. I didn't have your data obviously, but adapting it to yours should be straight forward.
Approach
I used matplotlib's twin axis and plotted the data as bars on the second Axes object. The rest ist just some fiddeling around to get the ticks right and make annotations.
Hope this helps.
Code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
tot = np.random.rand( 1 ) * 100
data = np.random.rand( 1, 12 )
data = data / sum(data,1) * tot
df = pd.DataFrame( data )
palette = sns.husl_palette(9, s=0.7 )
### Left Axis
# Plot nothing here, autmatically scales to second axis.
fig, ax1 = plt.subplots()
ax1.set_ylim( [0,100] )
# Remove grid lines.
ax1.grid( False )
# Set ticks and add percentage sign.
ax1.yaxis.set_ticks( np.arange(0,101,10) )
fmt = '%.0f%%'
yticks = matplotlib.ticker.FormatStrFormatter( fmt )
ax1.yaxis.set_major_formatter( yticks )
### Right Axis
# Plot data as bars.
x = np.arange(0,9,1)
ax2 = ax1.twinx()
rects = ax2.bar( x-0.4, np.asarray(df.loc[0,3:]), width=0.8 )
# Set ticks on x-axis and remove grid lines.
ax2.set_xlim( [-0.5,8.5] )
ax2.xaxis.set_ticks( x )
ax2.xaxis.grid( False )
# Set ticks on y-axis in 10% steps.
ax2.set_ylim( [0,tot] )
ax2.yaxis.set_ticks( np.linspace( 0, tot, 11 ) )
# Add labels and change colors.
for i,r in enumerate(rects):
h = r.get_height()
r.set_color( palette[ i % len(palette) ] )
ax2.text( r.get_x() + r.get_width()/2.0, \
h + 0.01*tot, \
r'%d%%'%int(100*h/tot), ha = 'center' )
I think you can first set the y major ticks manually and then modify each label
dfWIM = pd.DataFrame({'AXLES': np.random.randint(3, 10, 1000)})
total = len(dfWIM)*1.
plt.figure(figsize=(12,8))
ax = sns.countplot(x="AXLES", data=dfWIM, order=[3,4,5,6,7,8,9,10,11,12])
plt.title('Distribution of Truck Configurations')
plt.xlabel('Number of Axles')
plt.ylabel('Frequency [%]')
for p in ax.patches:
ax.annotate('{:.1f}%'.format(100*p.get_height()/total), (p.get_x()+0.1, p.get_height()+5))
#put 11 ticks (therefore 10 steps), from 0 to the total number of rows in the dataframe
ax.yaxis.set_ticks(np.linspace(0, total, 11))
#adjust the ticklabel to the desired format, without changing the position of the ticks.
_ = ax.set_yticklabels(map('{:.1f}%'.format, 100*ax.yaxis.get_majorticklocs()/total))