No whitespace between Seaborn barplot bars - python

I created a Seaborn barplot using the code below (it comes from https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/)
I would like all the bars to stack up without whitespace, but have been unable to do so. If I add width it complains about multiple values for width in barh. This is probably as seaborn has its own algo to determine the width. Is there anyway around it?
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read data
df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/email_campaign_funnel.csv")
# Draw Plot
plt.figure(figsize=(13, 10), dpi=80)
group_col = 'Gender'
order_of_bars = df.Stage.unique()[::-1]
colors = [plt.cm.Spectral(i/float(len(df[group_col].unique())-1)) for i in
range(len(df[group_col].unique()))]
for c, group in zip(colors, df[group_col].unique()):
sns.barplot(x='Users', y='Stage', data=df.loc[df[group_col]==group, :],
order=order_of_bars, color=c, label=group)
# Decorations
plt.xlabel("$Users$")
plt.ylabel("Stage of Purchase")
plt.yticks(fontsize=12)
plt.title("Population Pyramid of the Marketing Funnel", fontsize=22)
plt.legend()
plt.show()

Not a matplotlib expert by any means, so there may be a better way to do this. Perhaps you can do something like the following, which is similar to the approach in this answer:
# Draw Plot
fig, ax = plt.subplots(figsize=(13, 10), dpi=80)
...
for c, group in zip(colors, df[group_col].unique()):
sns.barplot(x='Users', y='Stage', data=df.loc[df[group_col]==group, :],
order=order_of_bars, color=c, label=group, ax=ax)
# Adjust height
for patch in ax.patches:
current_height = patch.get_height()
patch.set_height(1)
patch.set_y(patch.get_y() + current_height - 1)

Related

Custom Chart Formatting in Seaborn

I have a straightforward countplot in seaborn.
Code is:
ax = sns.countplot(x="days", data=df,color ='cornflowerblue')
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
ax.set(xlabel='days', ylabel='Conversions')
ax.set_title("Days to Conversion")
for p in ax.patches:
count = p.get_height()
x = p.get_x() + p.get_width()/1.25
y = p.get_height()*1.01
ax.annotate(count, (x, y),ha='right')
Which produces:
I'm trying to make the chart a bit 'prettier'. Specifically I want to raise the height of the outline so it wont cross the count numbers on first bar, and make the count numbers centered with a small space about the bar. Can't get it to work.
Guidance please.
To set the labels, in the latest matplotlib version (3.4.2), there is a new function bar_label() which takes care of positioning. In older versions you could use your code, but use x = p.get_x() + p.get_width()/2 and set ax.text(..., ha='center').
To make room for the labels, an extra margin can be added via ax.margins(y=0.1).
import matplotlib.pyplot as plt
import seaborn as sns
df = sns.load_dataset('tips')
ax = sns.countplot(x="day", data=df, color='cornflowerblue')
ax.tick_params(axis='x', labelrotation=90)
ax.bar_label(ax.containers[-1])
ax.margins(y=0.1)
plt.tight_layout()
plt.show()

Categorical bubble plot in Python

I have a dataset with a lot of categorical variables and a binary target variable. What package is available in Python or other opensource GUI-based software where I can scatterplot two categorical variables on the X and Y axis and use the target variable as hue?
I have looked at Seaborn's catplot, but for that, one axis has to be numerical while the other categorical. So it doesn't serve this case.
For example, you can use the following:
import seaborn as sns
data = sns.load_dataset('titanic')
Here are the plot features I want
X-axis - 'embark_town'
Y-axis - 'class'
hue - 'alive'
I am of the opinion that if you have to rearrange a seaborn graph substantially, you can also create this graph from scratch with matplotlib. This gives us the opportunity to have a different approach to display this categorical vs categorical plot:
import matplotlib.pyplot as plt
from matplotlib.markers import MarkerStyle
import numpy as np
#dataframe and categories
import seaborn as sns
df = sns.load_dataset('titanic')
X = "embark_town"
Y = "class"
H = "alive"
bin_dic = {0: "yes", 1: "no"}
#counting the X-Y-H category entries
plt_df = df.groupby([X, Y, H]).size().to_frame(name="vals").reset_index()
#figure preparation with grid and scaling
fig, ax = plt.subplots(figsize=(9, 6))
ax.set_ylim(plt_df[Y].unique().size-0.5, -0.5)
ax.set_xlim(-0.5, plt_df[X].unique().size+1.0)
ax.grid(ls="--")
#upscale factor for scatter marker size
scale=10000/plt_df.vals.max()
#left marker for category 0
ax.scatter(plt_df[plt_df[H]==bin_dic[0]][X],
plt_df[plt_df[H]==bin_dic[0]][Y],
s=plt_df[plt_df[H]==bin_dic[0]].vals*scale,
c=[(0, 0, 1, 0.5)], edgecolor="black", marker=MarkerStyle("o", fillstyle="left"),
label=bin_dic[0])
#right marker for category 1
ax.scatter(plt_df[plt_df[H]==bin_dic[1]][X],
plt_df[plt_df[H]==bin_dic[1]][Y],
s=plt_df[plt_df[H]==bin_dic[1]].vals*scale,
c=[(1, 0, 0, 0.5)], edgecolor="black", marker=MarkerStyle("o", fillstyle="right"),
label=bin_dic[1])
#legend entries for the two categories
l = ax.legend(title="Survived the catastrophe", ncol=2, framealpha=0, loc="upper right", columnspacing=0.1,labelspacing=1.5)
l.legendHandles[0]._sizes = l.legendHandles[1]._sizes = [800]
#legend entries representing sizes
bubbles_n=5
bubbles_min = 50*(1+plt_df.vals.min()//50)
bubbles_step = 10*((plt_df.vals.max()-bubbles_min)//(10*(bubbles_n-1)))
bubbles_x = plt_df[X].unique().size+0.5
for i, bubbles_y in enumerate(np.linspace(0.5, plt_df[Y].unique().size-1, bubbles_n)):
#plot each legend bubble to indicate different marker sizes
ax.scatter(bubbles_x,
bubbles_y,
s=(bubbles_min + i*bubbles_step) * scale,
c=[(1, 0, 1, 0.6)], edgecolor="black")
#and label it with a value
ax.annotate(bubbles_min+i*bubbles_step, xy=(bubbles_x, bubbles_y),
ha="center", va="center",
fontsize="large", fontweight="bold", color="white")
plt.show()
Seaborn supports, just like matplotlib, the plotting of categorical vs categorical variables. One can create semitransparent markers that allow to see both categories, although this might be difficult to distinguish from one marker if both are of similar size. The essential plot is rather easy - we transform the dataframe with groupby and size to count the entries per triplet embarking town - class - alive category, then create a scatterplot with count value as markersize. However, the legend entry is the complicated part here. Either the markersize is tiny in the plot or massive in the legend. I tried to balance this but I am not happy with the result. A lot of manual adjusting necessary here, so seaborn is no real advantage here. Any suggestions on how to simplify this within seaborn are welcome.
import seaborn as sns
import matplotlib.pyplot as plt
#dataframe and categories
df = sns.load_dataset('titanic')
X = "embark_town"
Y = "class"
H = "alive"
#counting the X-Y-H category entries
plt_df = df.groupby([X, Y, H]).size().to_frame(name="people").reset_index()
#figure preparation with grid and scaling
fig, ax = plt.subplots(figsize=(6,4))
ax.set_ylim(plt_df[Y].unique().size-0.5, -0.5)
ax.set_xlim(-0.5, plt_df[X].unique().size+1.0)
ax.grid(ls="--")
#the actual scatterplot with markersize representing the counted values
sns.scatterplot(x=X,
y=Y,
size="people",
sizes=(100, 10000),
alpha=0.5,
edgecolor="black",
hue=H,
data=plt_df,
ax=ax)
#creating two legends because the hue markers differ in size from the others
handles, labels = ax.get_legend_handles_labels()
l = ax.legend(handles[:3], labels[:3], title="The poor die first", markerscale=2, loc="upper right")
ax.add_artist(l)
#and seaborn plots the size markers in black, so you would get massive black blobs in the legend
#we change the color and make them transparent
for handle in handles:
handle.set_facecolors((0, 1, 1, 0.5))
ax.legend(handles[4::2], labels[4::2], title="N° of people", loc="lower right", handletextpad=4, labelspacing=3, markerfirst=False)
plt.tight_layout()
plt.show()
Sample output:

Superimposing plots in seaborn cause x-axis to misallign

I am having an issue trying to superimpose plots with seaborn. I am able to generate the two plots separetly as
fig, (ax1,ax2) = plt.subplots(ncols=2,figsize=(30, 7))
sns.lineplot(data=data1, y='MSE',x='pct_gc',ax=ax1)
sns.boxplot(x="pct_gc", y="MSE", data=data2,ax=ax2,width=0.4)
The output looks like this:
But when i try to put both plots superimposed, but assiging both to the same ax object.
fig, (ax1,ax2) = plt.subplots(ncols=2,figsize=(30, 7))
sns.lineplot(data=data1, y='MSE',x='pct_gc',ax=ax1)
sns.boxplot(x="pct_gc", y="MSE", data=data2,ax=ax2,width=0.4)
I am not able to identify with the X axis in the Lineplot changes when superimposing both plots (both plots X axis go from 0 to 0.069).
My goal is for both plots to be superimposed, while keeping the same X axis range.
Seaborn's boxplot creates categorical x-axis, with all boxes nicely with the same distance. Internally the x-axis is numbered as 0, 1, 2, ... but externally it gets the labels from 0 to 0.069.
To combine a line plot with a boxplot, matplotlib's boxplot can be addressed directly, so that positions and widths can be set explicitly. When patch_artist=True, a rectangle is created (instead of just lines), for which a facecolor can be given. manage_ticks=False prevents that boxplot changes the x ticks and their limits. Optionally notch=True would accentuate the median a bit more, but depending on the data, the confidence interval might be too large and look weird.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
data1 = pd.DataFrame({'pct_gc': np.linspace(0, 0.069, 200), 'MSE': np.random.normal(0.02, 0.1, 200).cumsum()})
data1['pct_range'] = pd.cut(data1['pct_gc'], 10)
fig, ax1 = plt.subplots(ncols=1, figsize=(20, 7))
sns.lineplot(data=data1, y='MSE', x='pct_gc', ax=ax1)
for interval, color in zip(np.unique(data1['pct_range']), plt.cm.tab10.colors):
ax1.boxplot(data1[data1['pct_range'] == interval]['MSE'],
positions=[interval.mid], widths=0.4 * interval.length,
patch_artist=True, boxprops={'facecolor': color},
notch=False, medianprops={'color':'yellow', 'linewidth':2},
manage_ticks=False)
plt.show()

how to perform conditional area plotting with matplotlib?

I have created the following dataframe based on a range of data.
df['data_classification'] = df.myDatarange.apply(lambda a:'Very good' if a>=-90
else ('Good' if (a>= -100 or a<=-91)
else ('Moderate' if (a>= -110 or a<=-101)
else ('Poor' if (a>= -123 or a<=-111)
else ('Bad' if (a>= -140 or a<=-124)
else 'Off' )))))
I am planning to plot myDatarange with data_classification and somehow show the relation with different colour. I am very confused how to plot this.
I can plot myDatarange as a single lineplot, but how to relate the two data?
So far, I have tried the following:
x1 = df1.index
y1 = df1.myDatarange
f, (ax1,ax2) = plt.subplots(2,figsize=(5, 5))
ax1.plot(x1,y1,color='red', linewidth=1.9, alpha=0.9, label="myDataRange")
plt.show()
How can I plot the above range of data based on classification as area plot? Is there a better way than area plot to express my data? There are examples on the net, but not very clear on conditional side of it.
Seaborn's barplot can take a hue parameter to color each bar corresponding to the 'data_classification'. The new 'data_classification' column can be created quicker and easier to modify via pd.cut.
The barplot can be used as background for the lineplot to show the classification of each value.
Here is an example to get you started:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
df = pd.DataFrame({'myDatarange': np.random.randint(-150, -50, size=50)})
ranges = [-10**6, -140, -123, -110, -100, -90, 10**6]
df['data_classification'] = pd.cut(df['myDatarange'], ranges, right=False,
labels=['Off', 'Bad', 'Poor', 'Moderate', 'Good', 'Very Good'])
fig, ax1 = plt.subplots(figsize=(12, 4))
ax1.plot(df.index, df['myDatarange'], color='blue', linewidth=2, alpha=0.9, label="myDataRange")
sns.barplot(x=df.index, y=[df['myDatarange'].min()] * len(df),
hue='data_classification', alpha=0.5, palette='inferno', dodge=False, data=df, ax=ax1)
for bar in ax1.patches: # optionally set the bars to fill the complete background, default seaborn sets the width to about 80%
bar.set_width(1)
plt.legend(bbox_to_anchor=(1.02, 1.05) , loc='upper left')
plt.tight_layout()
plt.show()
PS: If you want to the 0 at the bottom (now at the top due to the negative y-values), you could call ax.invert_yaxis().

Remove whitespace from matplotlib heatplot

I have a heatplot in matplotlib for which I want to remove the whitespace to the north and east of the plot, as shown in the image below.
here is the code I'm using to generate the plots:
# plotting
figsize=(50,20)
y,x = 1,2
fig, axarry = plt.subplots(y,x, figsize=figsize)
p = axarry[1].pcolormesh(copy_matrix.values)
# put the major ticks at the middle of each cell
axarry[1].set_xticks(np.arange(copy_matrix.shape[1])+0.5, minor=False)
axarry[1].set_yticks(np.arange(copy_matrix.shape[0])+0.5, minor=False)
axarry[1].set_title(file_name, fontweight='bold')
axarry[1].set_xticklabels(copy_matrix.columns, rotation=90)
axarry[1].set_yticklabels(copy_matrix.index)
fig.colorbar(p, ax=axarry[1])
Phylo.draw(tree, axes=axarry[0])
The easiest way to do this is to use ax.axis('tight').
By default, matplotlib tries to choose "even" numbers for the axes limits. If you want the plot to be scaled to the strict limits of your data, use ax.axis('tight'). ax.axis('image') is similar, but will also make the cells of your "heatmap" square.
For example:
import numpy as np
import matplotlib.pyplot as plt
# Note the non-"even" size... (not a multiple of 2, 5, or 10)
data = np.random.random((73, 78))
fig, axes = plt.subplots(ncols=3)
for ax, title in zip(axes, ['Default', 'axis("tight")', 'axis("image")']):
ax.pcolormesh(data)
ax.set(title=title)
axes[1].axis('tight')
axes[2].axis('image')
plt.show()

Categories

Resources