Pandas - stacked bar chart with column values for stacking - python

I have a data set with three sets of data: class type, neighborhood, and visibility.
I'm trying to create a bar chart that is both stacked and unstacked -- stacked by visibility, lined up by neighborhood. So basically, I'm looking for a combination of the unstacked-ness of this chart:
nbvis_gb = nbvis.sort_values(by=['visibility'],ascending=False).groupby(by='visibility',sort=False)
fig, ax = plt.subplots(nrows=1,ncols=2,figsize=(14,8),sharey=True)
for (i, j), ax,color in zip(nbvis_gb,ax.flatten(),colors_hood):
print(j['class'].values)
title = str(i)
j.plot.bar(ax=ax,colors=colors_hood)
ax.set_title(title, fontsize=20)
#ax.set_ylim(0,1.05)
ax.tick_params(labelsize=16)
ax.set_xticklabels(j['class'].values)
ax.legend_.remove()
ax.legend(loc=8,fontsize=20,ncol=4,bbox_to_anchor=(0,-.45))
fig.tight_layout(h_pad=2)
fig.suptitle('Visibility of containers by class and neighborhood',y=1.03,fontsize=24)
and the stacked-ness of this chart:
nbvis.unstack()['Neighborhood 1'].plot.bar(stacked=True)
Any help would be greatly appreciated!
Cheers,
Elizabeth

Consider melt and pivot_table of your dataframe to create a multi-index datafame aligned to your graph dimensions. Below outputs graph to screen and saves figure to png image in same folder using seaborn's color scheme. Of course adjust graph settings as needed.
Data
import numpy as np
import pandas as pd
from itertools import product
from matplotlib import pyplot as plt
import seaborn
np.random.seed(444)
df = pd.DataFrame(list(product(['bucket (1)', 'flower pot (2)', 'tarp (3)', 'trash (6)', 'toy (7)',
'piping/tubing (9)', 'other (10)'],
['visible containers', 'partial or not visible containers'])),
columns=['class', 'visibility']).assign(Neighborhood1 = abs(np.random.randn(14)),
Neighborhood2 = abs(np.random.randn(14)),
Neighborhood3 = abs(np.random.randn(14)),
Neighborhood4 = abs(np.random.randn(14)))
Graphing
seaborn.set()
def runplot(pvtdf):
fig, axes = plt.subplots(nrows=1, ncols=len(mdf['Neighborhood'].unique()))
for i, n in enumerate(mdf['Neighborhood'].unique()):
pvtdf.xs(n).plot(ax=axes[i], kind='bar', stacked=True, edgecolor='w',
figsize=(20,8), width=0.5, fontsize = 12,
title='{} - Visibility of containers \n by class and neighborhood'.format(n))
axes[i].title.set_size(16)
plt.tight_layout()
fig.savefig('Output.png')
plt.show()
plt.clf()
# MELT LONG
mdf = pd.melt(df, id_vars = ['class', 'visibility'], var_name='Neighborhood')
# PIVOT WIDE
pvtdf = mdf.pivot_table(index= ['Neighborhood', 'class'], columns='visibility', values='value')
runplot(pvtdf, n)
plt.close()
Output

here's one way you could do this. I used some dummy data:
df = pd.DataFrame({"class":['bucket', 'pot', 'tarp', 'trash', 'toy', 'tubing', 'other','bucket', 'pot', 'tarp', 'trash', 'toy', 'tubing', 'other',],
"visability":["visable", "visable","visable","visable","visable","visable","visable", "not visable","not visable","not visable","not visable","not visable","not visable","not visable",],
"n1":np.random.random(size=14),
"n2":np.random.random(size=14),
"n3":np.random.random(size=14),
"n4":np.random.random(size=14)})
I think the trick is to use bottom:
N=7
width = 0.095
w = 0
ind = np.arange(N) + .15
classes = ['bucket', 'pot', 'tarp', 'trash', 'toy', 'tubing', 'other']
neighborhoods = ['n1', 'n2', 'n3', 'n4']
fig, ax = plt.subplots()
top_colors = ['#ff9999', '#9999ff', '#e6b3ff', '#66ff66']
bottom_colors = ['#b30000', '#000066', '#7700b3', '#004d00']
for i, n in enumerate(neighborhoods):
vis = df[(df.visability == "visable")][n]
non_vis = df[df.visability == "not visable"][n]
rect1 = ax.bar(ind+w, vis, float(width), color=top_colors[i])
rect2 = ax.bar(ind+w, non_vis, width, color=bottom_colors[i], bottom=vis)
w += 0.15
extra_space = 0.05
ax.set_xticks(ind+width+xtra_space)
ax.set_xticklabels(('bucket', 'pot', 'tarp', 'trash', 'toy', 'tubing', 'other',))
ax.set_title('Visability of container types by class')
plt.show()

Related

How to change the order of subplots matplotlib

I have a data frame that contains average concentrations for 4 different sites based on season and year. The code I wrote produces a figure for each site, with four subplots for each season. Year is on the y-axis and concentration is on the x-axis.
Here's the link to my data: https://drive.google.com/file/d/1mVAsjRiFmMXaW0F8HBhadi1ZQPcUGIa7/view?usp=sharing
The issue is that the code automatically plots the subplots as
fall - spring
summer - winter
I want them to plot in chronological order, because that makes more sense that alphabetical:
spring - summer
fall - winter
Here is my code:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as smf
import scipy.stats
main_dataframe = pd.read_csv('NOx_sznl.csv')
main_dataframe.rename(columns={'NOx_3168':'Banning NOx', 'NOx_2199':'Palm Springs NOx', 'NOx_2551':'El Centro NOx', 'NOx_3135':'Calexico NOx'}, inplace=True)
col = list(main_dataframe.columns)
col.remove('Year')
col.remove('Season')
for ind,station in enumerate(col):
df_new = main_dataframe[['Season', 'Year', col[ind]]]
###here I tried to reorder the seasons in the dataframe
df_new = df_new.set_index('Season')
df_new = df_new.loc[['Spring', 'Summer', 'Fall', 'Winter'], :]
df_new = df_new.reset_index()
###but it didn't change the outcome
df_new = df_new.set_index('Year')
# df_new['Betty Jo Mcneece Receiving Home'].astype('float')
df_new[col[ind]] = df_new[col[ind]]
grouped = df_new.groupby('Season')
rowlength = grouped.ngroups/2 # fix up if odd number of groups
fig, axs = plt.subplots(figsize=(15,10),
nrows=2, ncols=int(rowlength), # fix as above
gridspec_kw=dict(hspace=0.4))#, sharex='col', sharey='row') # Much control of gridspec
targets = zip(grouped.groups.keys(), axs.flatten())
for i, (key, ax) in enumerate(targets):
ax.plot(grouped.get_group(key)[col[ind]], marker='o', color='orange')
ax.set_ylim(0,)
ax.set_yticks(ax.get_yticks(),size=12)
#ax.set_xlim(2009,2020)
ax.set_xticks(np.arange(2009,2020,1))
ax.set_xticklabels(ax.get_xticks(), rotation = 45, size=12)
fig.suptitle("%s"%col[ind], fontsize=30)
# ax.set_title('%s')
plt.subplot(221)
plt.gca().set_title('Fall', fontsize=20)
plt.subplot(222)
plt.gca().set_title('Spring', fontsize=20)
plt.subplot(223)
plt.gca().set_title('Summer', fontsize=20)
plt.subplot(224)
plt.gca().set_title('Winter', fontsize=20)
plt.show()
I would apppreciate any help rearranging the subplots.
The order of the subplots is given by grouped.groups.keys() in targets = zip(grouped.groups.keys(), axs.flatten()) but the problem is further upstream in grouped = df_new.groupby('Season') which is where grouped.groups.keys() comes from. df.groupby() automatically sorts alphabetically unless you do sort=False, so grouped = df_new.groupby('Season', sort=False) should follow the order you provided when you made df_new.
Here is what your code looks like on my end so you can have an exact copy.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as smf
import scipy.stats
main_dataframe = pd.read_csv('NOx_sznl.csv')
main_dataframe.rename(columns={'NOx_3168': 'Banning NOx',
'NOx_2199': 'Palm Springs NOx',
'NOx_2551': 'El Centro NOx',
'NOx_3135': 'Calexico NOx'},
inplace=True)
col = list(main_dataframe.columns)
col.remove('Year')
col.remove('Season')
for ind, station in enumerate(col):
df_new = main_dataframe[['Season', 'Year', col[ind]]]
###here I tried to reorder the seasons in the dataframe
df_new = df_new.set_index('Season')
df_new = df_new.loc[['Spring', 'Summer', 'Fall', 'Winter'], :]
df_new = df_new.reset_index()
###but it didn't change the outcome
df_new = df_new.set_index('Year')
# df_new['Betty Jo Mcneece Receiving Home'].astype('float')
df_new[col[ind]] = df_new[col[ind]]
grouped = df_new.groupby('Season', sort=False)
rowlength = grouped.ngroups/2 # fix up if odd number of groups
fig, axs = plt.subplots(figsize=(15,10),
nrows=2, ncols=int(rowlength), # fix as above
gridspec_kw=dict(hspace=0.4))#, sharex='col', sharey='row') # Much control of gridspec
targets = zip(grouped.groups.keys(), axs.flatten())
for i, (key, ax) in enumerate(targets):
ax.plot(grouped.get_group(key)[col[ind]], marker='o', color='orange')
ax.set_ylim(0,)
ax.set_yticks(ax.get_yticks(),size=12)
#ax.set_xlim(2009,2020)
ax.set_xticks(np.arange(2009,2020,1))
ax.set_xticklabels(ax.get_xticks(), rotation = 45, size=12)
ax.set_title(key)
fig.suptitle("%s"%col[ind], fontsize=30)
plt.show()

Creating box plots by looping multiple columns

I am trying to create multiple box plot charts for about 5 columns in my dataframe (df_summ):
columns = ['dimension_a','dimension_b']
for i in columns:
sns.set(style = "ticks", palette = "pastel")
box_plot = sns.boxplot(y="measure", x=i,
palette=["m","g"],
data=df_summ_1500_delta)
sns.despine(offset=10, trim=True)
medians = df_summ_1500_delta.groupby([i])['measure'].median()
vertical_offset=df_summ_1500_delta['measure'].median()*-0.5
for xtick in box_plot.get_xticks():
box_plot.text(xtick,medians[xtick] + vertical_offset,medians[xtick],
horizontalalignment='center',size='small',color='blue',weight='semibold')
My only issue is that they aren't be separated on different facets, but rather on top of each other.
Any help on how I can make both on their own separate chart with the x axis being 'dimension a' and the x axis of the second chart being 'dimension b'.
To draw two boxplots next to each other at each x-position, you can use a hue for dimension_a and dimension_b separately. These two columns need to be transformed (with pd.melt()) to "long form".
Here is a some example code starting from generated test data. Note that the order both for the x-values as for the hue-values needs to be enforced to be sure of their exact position. The individual box plots are distributed over a width of 0.8.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
df = pd.DataFrame({'dimension_a': np.random.choice(['hot', 'cold'], 100),
'dimension_b': np.random.choice(['hot', 'cold'], 100),
'measure': np.random.uniform(100, 500, 100)})
df.loc[df['dimension_a'] == 'hot', 'measure'] += 100
df.loc[df['dimension_a'] == 'cold', 'measure'] -= 100
x_order = ['hot', 'cold']
columns = ['dimension_a', 'dimension_b']
df1 = df.melt(value_vars=columns, var_name='dimension', value_name='value', id_vars='measure')
sns.set(style="ticks", palette="pastel")
ax = sns.boxplot(data=df1, x='value', order=x_order, y='measure',
hue='dimension', hue_order=columns, palette=["m", "g"], dodge=True)
ax.set_xlabel('')
sns.despine(offset=10, trim=True)
for col, dodge_dist in zip(columns, np.linspace(-0.4, 0.4, 2 * len(x_order) + 1)[1::2]):
medians = df.groupby([col])['measure'].median()
vertical_offset = df['measure'].median() * -0.5
for x_ind, xtick in enumerate(x_order):
ax.text(x_ind + dodge_dist, medians[xtick] + vertical_offset, f'{medians[xtick]:.2f}',
horizontalalignment='center', size='small', color='blue', weight='semibold')
plt.show()

Legends disappear when {"hist":False} in seaborn distplot

I have the following function:
Say hue="animals have three categories dog,bird,horse and we have two dataframes df_m and df_f consisting of data of male animals and women animals only, respectively.
The function plots three distplot of y (e.g y="weight") one for each hue={dog,bird,horse}. In each subplot we plot df_m[y] and df_f[y] such that I can compare the weight of male dogs/female dogs, male birds/female birds, male horses/female horses.
If I set distkwargs={"hist":False} when calling the function the legends ["F","M"] disappears, for some reason. Having distkwargs={"hist":True}` shows the legends
def plot_multi_kde_cat(self,dfs,y,hue,subkwargs={},distkwargs={},legends=[]):
"""
Create a subplot multi_kde with categories in the same plot
dfs: List
- DataFrames for each category e.g one for male and one for females
hue: string
- column for which each category is plotted (in each subplot)
"""
hues = dfs[0][hue].cat.categories
if len(hues)==2: #Only two categories
fig,axes = plt.subplots(1,2,**subkwargs) #Get axes and flatten them
axes=axes.flatten()
for ax,hu in zip(axes,hues):
for df in dfs:
sns.distplot(df.loc[df[hue]==hu,y],ax=ax,**distkwargs)
ax.set_title(f"Segment: {hu}")
ax.legend(legends)
else: #More than two categories: create a square grid and remove unsused axes
n_rows = int(np.ceil(np.sqrt(len(hues)))) #number of rows
fig,axes = plt.subplots(n_rows,n_rows,**subkwargs)
axes = axes.flatten()
for ax,hu in zip(axes,hues):
for df in dfs:
sns.distplot(df.loc[df[hue]==hu,y],ax=ax,**distkwargs)
ax.set_title(f"Segment: {hu}")
ax.legend(legends)
n_remove = len(axes)-len(hues) #number of axes to remove
if n_remove>0:
for ax in axes[-n_remove:]:
ax.set_visible(False)
fig.tight_layout()
return fig,axes
You can work around the problem by explicitly providing the label to the distplot. This forces a legend entry for each distplot. ax.legend() then already gets the correct labels.
Here is some minimal sample code to illustrate how everything works together:
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
def plot_multi_kde_cat(dfs, y, hue, subkwargs={}, distkwargs={}, legends=[]):
hues = np.unique(dfs[0][hue])
fig, axes = plt.subplots(1, len(hues), **subkwargs)
axes = axes.flatten()
for ax, hu in zip(axes, hues):
for df, legend_label in zip(dfs, legends):
sns.distplot(df.loc[df[hue] == hu, y], ax=ax, label=legend_label, **distkwargs)
ax.set_title(f"Segment: {hu}")
ax.legend()
N = 20
df_m = pd.DataFrame({'animal': np.random.choice(['tiger', 'horse'], N), 'weight': np.random.uniform(100, 200, N)})
df_f = pd.DataFrame({'animal': np.random.choice(['tiger', 'horse'], N), 'weight': np.random.uniform(80, 160, N)})
plot_multi_kde_cat([df_m, df_f], 'weight', 'animal',
subkwargs={}, distkwargs={'hist': False}, legends=['male', 'female'])
plt.show()

Display label stacked barh with values from dataframe

How can I display values for my stacked barh chart that come from a dataframe? How can I place the labels above their respective sections on each bar and modify the font so that it shows up well as a gray scale graphic?
It is related to this question but it has a list of values rather than two lists pulled from a pandas dataframe. If it were a singe list, I think I could pull values from a single record in the dataframe but with two lists, I'm not sure how to apply that to each bar in the bar graph.
My dataframe:
Delin. Group1 Group2 Group3 Group4 Group5
Census 0.2829 0.3387 0.2636 0.0795 0.0353
USPS 0.2538 0.3143 0.2901 0.1052 0.0366
My code:
import os
import pandas as pd
import time
#
start_time = time.time()
#
output_dir = r"C:\Some\Directory\For\Ouputs"
#
output_fig = "race_barh2.png"
#
fig_path = os.path.join(output_dir, output_fig)
#
os.chdir(output_dir)
#
input_csv = r"C:\Some\Directory\To\My.csv"
#
df = pd.read_csv(input_csv, delimiter = ",")
#
ax = df.plot.barh( stacked = True, color = ("#252525", "#636363", "#969696", "#cccccc", "#f7f7f7"), edgecolor = "black", linewidth = 1)
#
ax.set_xlabel("Percentage of Total", fontsize = 18)
#
ax.set_ylabel("Boundary Delineation", fontsize = 18)
#
ax.set_yticklabels(["Census", "USPS"])
#
ax.set_xticklabels(["0%", "20%", "40%", "60%", "80%", "100%"])
#
horiz_offset = 1.03
#
vert_offset = 1
#
ax.legend(bbox_to_anchor=(horiz_offset, vert_offset))
#
fig = ax.get_figure()
#
fig.savefig(fig_path, bbox_inches = "tight", dpi = 600)
#
#
#
end_time = round( time.time() - start_time, 5 )
#
print "Seconds elapsed: {0}".format(end_time)
You can do this similarly as in the referenced question, by annotating the bars. For a stacked bar chart you'll have to tweak the position of the labels a little to get them where you want. You can play around with the horizontalalignment, verticalalignment and adding a bit of a margin as I did (+.5).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from cycler import cycler
#used gray colormap, you can use your own colors by replacing colormap='gray' with color=colors
colors = ["#252525", "#636363", "#969696", "#cccccc", "#f7f7f7"]
plt.rcParams['axes.prop_cycle'] = cycler(color=colors)
#dummy data
df = pd.DataFrame(np.random.randint(5, 8, (10, 3)), columns=['Group1', 'Group2', 'Group3'])
for col in df.columns.tolist():
df[col] = df[col].apply(lambda x:x*100 / df[col].sum())
ax = df.T.plot.barh(stacked=True, colormap='gray', edgecolor='black', linewidth=1)
for lbl in ax.patches:
ax.annotate("{:.0f}%".format(int(lbl.get_width())), (lbl.get_x(), lbl.get_y()+.5), verticalalignment='bottom', horizontalalignment='top', fontsize=8, color='black')
ax.legend(loc='center left', bbox_to_anchor=(1.0, .5))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
plt.show()

Pandas plot: Assign Colors

I have many data frames that I am plotting for a presentation. These all have different columns, but all contain the same additional column foobar. At the moment, I am plotting these different data frames using
df.plot(secondary_y='foobar')
Unfortunately, since these data frames all have different additional columns with different ordering, the color of foobar is always different. This makes the presentation slides unnecessary complicated. I would like, throughout the different plots, assign that foobar is plotted bold and black.
Looking at the docs, the only thing coming close appears to be the parameter colormap - I would need to ensure that the xth color in the color map is always black, where x is the order of foobar in the data frame. Seems to be more complicated than it should be, also this wouldn't make it bold.
Is there a (better) approach?
I would suggest using matplotlib directly rather than the dataframe plotting methods. If df.plot returned the artists it added instead of an Axes object it wouldn't be too bad to change the color of the line after it was plotted.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
def pandas_plot(ax, df, callout_key):
"""
Parameters
----------
ax : mpl.Axes
The axes to draw to
df : DataFrame
Data to plot
callout_key : str
key to highlight
"""
artists = {}
x = df.index.values
for k, v in df.iteritems():
style_kwargs = {}
if k == callout_key:
style_kwargs['c'] = 'k'
style_kwargs['lw'] = 2
ln, = ax.plot(x, v.values, **style_kwargs)
artists[k] = ln
ax.legend()
ax.set_xlim(np.min(x), np.max(x))
return artists
Usage:
fig, ax = plt.subplots()
ax2 = ax.twinx()
th = np.linspace(0, 2*np.pi, 1024)
df = pd.DataFrame({'cos': np.cos(th), 'sin': np.sin(th),
'foo': np.sin(th + 1), 'bar': np.cos(th +1)}, index=th)
df2 = pd.DataFrame({'cos': -np.cos(th), 'sin': -np.sin(th)}, index=th)
pandas_plot(ax, df, 'sin')
pandas_plot(ax2, df2, 'sin')
Perhaps you could define a function which handles the special column in a separate plot call:
def emphasize_plot(ax, df, col, **emphargs):
columns = [c for c in df.columns if c != col]
df[columns].plot(ax=ax)
df[col].plot(ax=ax, **emphargs)
Using code from tcaswell's example,
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def emphasize_plot(ax, df, col, **emphargs):
columns = [c for c in df.columns if c != col]
df[columns].plot(ax=ax)
df[col].plot(ax=ax, **emphargs)
fig, ax = plt.subplots()
th = np.linspace(0, 2*np.pi, 1024)
df = pd.DataFrame({'cos': np.cos(th), 'foobar': np.sin(th),
'foo': np.sin(th + 1), 'bar': np.cos(th +1)}, index=th)
df2 = pd.DataFrame({'cos': -np.cos(th), 'foobar': -np.sin(th)}, index=th)
emphasize_plot(ax, df, 'foobar', lw=2, c='k')
emphasize_plot(ax, df2, 'foobar', lw=2, c='k')
plt.show()
yields
I used #unutbut's answer and extended it to allow for a secondary y axis and correct legends:
def emphasize_plot(ax, df, col, **emphargs):
columns = [c for c in df.columns if c != col]
ax2 = ax.twinx()
df[columns].plot(ax=ax)
df[col].plot(ax=ax2, **emphargs)
lines, labels = ax.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc=0)

Categories

Resources