I would like to plot two columns of a pandas DataFrame as side-by-side box plots, grouped by category. This is not the same as the question "Grouped boxplot with seaborn", where the two columns contain lists; the solution given there did not work for me.
MWE
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
# Wide-format sample data: two numeric columns plus one category label per row.
df = pd.DataFrame(
[
[2, 4, "A"],
[4, 5, "C"],
[5, 4, "B"],
[10, 4.2, "A"],
[9, 3, "B"],
[3, 3, "C"]
], columns=['data1', 'data2', 'Categories'])
# Plot a single column ('data1') per category on one Axes with seaborn.
fig, axs = plt.subplots(1, 1)
sns.boxplot(data=df,x="Categories",y='data1',ax=axs)
fig.show()
# Keep the window open until a key or mouse button is pressed, then close it.
plt.waitforbuttonpress()
plt.close(fig)
The code above generates:
Replacing "data1" with "data2" in the boxplot line would give:
What I want is something like this:
You need to melt (convert to long format) the DataFrame first:
# Reshape to long format: 'Categories' is kept as the identifier, the former
# column names go into 'dataset' and their values into 'values'.
data = df.melt(id_vars=['Categories'], var_name='dataset', value_name='values')
print(data)
Prints:
Categories dataset values
0 A data1 2.0
1 A data2 4.0
2 C data1 4.0
3 C data2 5.0
4 B data1 5.0
5 B data2 4.0
6 A data1 10.0
7 A data2 4.2
8 B data1 9.0
9 B data2 3.0
10 C data1 3.0
11 C data2 3.0
Now you just have to use dataset as the hue. Since the plot is quite busy I moved the legend outside it.
# One box per (category, dataset) pair; hue splits each category by dataset.
sns.boxplot(data=data, x='Categories', y='values', hue='dataset')
# Anchor the legend's upper-left corner at the axes' upper-right corner,
# i.e. just outside the plotting area.
plt.legend(title='dataset', loc='upper left', bbox_to_anchor=(1, 1))
Edit by OP:
I implemented this in a function such that it makes the plot with as many columns as desired in an ax and returns it.
def box_plot_columns(df, categories_column, list_of_columns, legend_title, y_axis_title, **boxplotkwargs):
    """Draw grouped box plots of several columns, split by a category column.

    The selected columns are melted into long format so that seaborn can draw
    one box per (category, column) pair, using the column name as the hue.

    Args:
        df: Wide-format DataFrame holding the category and value columns.
        categories_column: Name of the column used for the x axis groups.
        list_of_columns: Value column name(s) to plot. Any iterable of names
            is accepted (list, tuple, Index); a single string is treated as
            one column name.
        legend_title: Name given to the melted "which column" variable; it is
            used as the hue and therefore as the legend title.
        y_axis_title: Name given to the melted value variable; it becomes the
            y axis label.
        **boxplotkwargs: Forwarded unchanged to ``sns.boxplot`` (e.g. ``ax``).

    Returns:
        Whatever ``sns.boxplot`` returns (the Axes the boxes were drawn on).
    """
    # A bare string is iterable character by character, so wrap it explicitly.
    if isinstance(list_of_columns, str):
        value_columns = [list_of_columns]
    else:
        value_columns = list(list_of_columns)

    # Selecting with a list already yields a new frame and melt() does not
    # mutate its input, so no defensive copy is needed.
    data = df[[categories_column, *value_columns]].melt(
        id_vars=[categories_column],
        var_name=legend_title,
        value_name=y_axis_title,
    )
    return sns.boxplot(
        data=data,
        x=categories_column,
        y=y_axis_title,
        hue=legend_title,
        **boxplotkwargs,
    )
Usage Example:
fig, ax = plt.subplots(1, 1)
# Pass the wide-format frame (`df` in the MWE); the original example used an
# undefined name `Data` here.
ax = box_plot_columns(df, "Categories", ["data1", "data2"], "dataset", "values", ax=ax)
ax.set_title("My Plot")
plt.show()
Try This :
df = pd.DataFrame(
    [
        [2, 4, "A"],
        [4, 5, "C"],
        [5, 4, "B"],
        [10, 4.2, "A"],
        [9, 3, "B"],
        [3, 3, "C"],
    ],
    columns=['data1', 'data2', 'Categories'],
)
# Long format: the former column names go into 'dataset' and their values into
# 'values'. Distinct names avoid confusion with the original data1/data2 columns.
df_c = pd.melt(df, id_vars="Categories", var_name="dataset", value_name="values")
# Plotting by seaborn. `factorplot` was renamed to `catplot` and is no longer
# available in current seaborn; variables are passed by keyword because
# positional x/y arguments are not accepted by recent releases.
sns.catplot(data=df_c, x="Categories", y="values", hue="dataset", kind="box")
Related
I am using following code to make 5 bars on 3 different data sets a, b and c. How can I show all colors in each bar. I don't want their value to add up. For example, in first bar if the value of Green is 1, Yellow is 3 and Red is 6 I don't want the final value to be 10 rather it should be 6 but all colors should appear till their final value. I don't want to use transparent colors or only bar outlines.
import matplotlib.pyplot as plt
import numpy as np
# Three data series, one value per bar position.
a = [1, 2, 3, 4, 5]
b = [3, 4, 1, 10, 9]
c = [6, 7, 2, 4, 6]
ind = np.arange(len(a))
fig = plt.figure()
ax = fig.add_subplot(111)
# All three series are drawn at the same x positions with the same width and
# no `bottom`, so each call paints over the bars drawn before it.
ax.bar(x=ind, height=a, width=0.35, align='center', label='Green',
facecolor='g')
ax.bar(x=ind, height=b, width=0.35, align='center', label='Yellow',
facecolor='y')
ax.bar(x=ind, height=c, width=0.35, align='center', label='Red', facecolor='r')
# Label the ticks with the values of `a` rather than the positions 0..4.
plt.xticks(ind, a)
plt.xlabel('Coordination Number')
plt.ylabel('Frequency')
plt.legend()
plt.show()
The reference value for the 'a' column is 6, but it was unclear if it is the maximum value. I understood it to be the maximum value and calculated the composition ratio.
I created a stacked graph based on the results.
import numpy as np
import pandas as pd
# Raw heights of the three series, one value per bar position.
a = [1, 2, 3, 4, 5]
b = [3, 4, 1, 10, 9]
c = [6, 7, 2, 4, 6]
ind = np.arange(len(a))

series_names = ['a', 'b', 'c']
df = pd.DataFrame(dict(zip(series_names, (a, b, c))), index=ind)

# Row-wise sum and maximum of the three raw series.
df['total'] = df[series_names].sum(axis=1)
df['max'] = df[series_names].max(axis=1)

# Scale each series' share of the row total to the row maximum, so the three
# scaled segments of a row add up to that row's largest raw value.
for src, dst in (('a', 'aa'), ('b', 'bb'), ('c', 'cc')):
    df[dst] = df['max'] * (df[src] / df['total'])
df
a b c total max aa bb cc
0 1 3 6 10 6 0.600000 1.800000 3.600000
1 2 4 7 13 7 1.076923 2.153846 3.769231
2 3 1 2 6 3 1.500000 0.500000 1.000000
3 4 10 4 18 10 2.222222 5.555556 2.222222
4 5 9 6 20 9 2.250000 4.050000 2.700000
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
# Stack the scaled segments: each series starts (bottom=) where the previous
# ones end, so all three colours stay visible in every bar.
ax.bar(x=ind, height=df.loc[:,'aa'], bottom=0, width=0.35, align='center', label='Green',
facecolor='g')
ax.bar(x=ind, height=df.loc[:,'bb'], bottom=df.loc[:,'aa'], width=0.35, align='center', label='Yellow',
facecolor='y')
ax.bar(x=ind, height=df.loc[:,'cc'], bottom=df.loc[:,'aa']+df.loc[:,'bb'], width=0.35, align='center', label='Red', facecolor='r')
# Label the ticks with the values of `a` rather than the positions 0..4.
plt.xticks(ind, a)
plt.xlabel('Coordination Number')
plt.ylabel('Frequency')
plt.legend()
plt.show()
If I understand your question correctly, you want to show all colour bars starting from the same zero baseline and grouped together under their corresponding Number?
I'll use bokeh for plotting, since it provides an easy way to "offset" each bar in the group. To vary the amount of visual offset for each bar, change the second parameter of the dodge function. For this combination of widths, 0.05 seemed like a nice value.
from bokeh.io import output_notebook, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.transform import dodge
output_notebook() # or output_file("chart.html") if not using Jupyter
# Categorical x axis: the coordination numbers 1..5 as strings.
x_axis_values = [str(x) for x in range(1, 6)]
data = {
"Coordination Number" : x_axis_values,
"Green" : [1, 2, 3, 4, 5],
"Yellow" : [3, 4, 1, 10, 9],
"Red" : [6, 7, 2, 4, 6]
}
src = ColumnDataSource(data=data)
# NOTE(review): recent Bokeh releases appear to have renamed `plot_height` to
# `height` -- confirm against the installed version.
p = figure(
x_range=x_axis_values, y_range=(0, 10), plot_height=275,
title="Offset Group Bar Chart", toolbar_location=None, tools="")
# Each series is shifted by a different dodge offset (-0.05, 0.0, 0.05) around
# the same category; with width=0.2 the bars overlap but all start at zero.
p.vbar(
x=dodge('Coordination Number', -0.05, range=p.x_range),
top='Green', width=0.2, source=src, color="#8DD3C7", legend_label="Green")
p.vbar(
x=dodge('Coordination Number', 0.0, range=p.x_range),
top='Yellow', width=0.2, source=src, color="#FFD92F", legend_label="Yellow")
p.vbar(
x=dodge('Coordination Number', 0.05, range=p.x_range),
top='Red', width=0.2, source=src, color="#E15759", legend_label="Red")
# Cosmetic settings: padding, no vertical grid lines, legend placement, labels.
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"
p.xaxis.axis_label = "Coordination Number"
p.yaxis.axis_label = "Frequency"
show(p)
The following code produces 2 side-by-side plots. However, I would like to push the right plot to the right so that its label shows detached from the left plot. How can I do it? I could not find any option in subplots, nor in countplot
here is the code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Purchase counts per fruit; NaN marks a missing observation.
data = {
    'apples': [3, 2, 0, np.nan, 2],
    'oranges': [0, 7, 7, 2, 7],
    'figs': [1, np.nan, 10, np.nan, 10],
}
purchases = pd.DataFrame(data)
fig, ax = plt.subplots(1, 2)
# Pass the column as `x=`: recent seaborn releases do not accept the plotted
# variable as a positional argument.
sns.countplot(x=purchases['apples'], ax=ax[0])
sns.countplot(x=purchases['oranges'], ax=ax[1])
# A bare `show()` is undefined here; the pyplot function is meant.
plt.show()
An option is tight_layout:
fig, ax = plt.subplots(1, 2)
# Pass the column as `x=`: recent seaborn releases do not accept the plotted
# variable as a positional argument.
sns.countplot(x=purchases['apples'], ax=ax[0])
sns.countplot(x=purchases['oranges'], ax=ax[1])
# Re-compute subplot spacing so the right plot's y label no longer overlaps
# the left plot.
plt.tight_layout()
output:
In order to make your data play nicely with seaborn, consider changing your dataframe to the "long" format and plotting all categories and their corresponding count with sns.catplot:
# stack() moves the column labels into an inner index level; droplevel(0)
# discards the original row index, and reset_index() turns the remaining
# fruit labels into a regular column.
data = purchases.stack().droplevel(0).reset_index()
data.columns = ['fruit', 'number']
print(data.head(5))
# output:
# fruit number
# 0 apples 3.0
# 1 oranges 0.0
# 2 figs 1.0
# 3 apples 2.0
# 4 oranges 7.0
# One count-plot facet (column) per fruit.
sns.catplot(data=data, x='number', kind='count', col='fruit')
plt.show()
output:
I am trying to display in a subplot all the boxplots corresponding to each columns in my dataframe df.
I have read this question:
Subplot for seaborn boxplot
and tried to implement the given solution:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Question code, kept as posted: it reproduces the AttributeError shown below.
d = {'col1': [1, 2, 5.5, 100], 'col2': [3, 4, 0.2, 3], 'col3': [1, 4, 6, 30], 'col4': [2, 24, 0.2, 13], 'col5': [9, 84, 0.9, 3]}
df = pd.DataFrame(data=d)
names = list(df.columns)
# With 5 columns this requests a 2 x 3 grid, so `axes` is a 2-D array.
f, axes = plt.subplots(round(len(names)/3), 3)
y = 0;
for name in names:
    # axes[y] selects a whole row of the grid rather than a single Axes; this
    # is the line the traceback points at.
    sns.boxplot(x= df[name], ax=axes[y])
    y = y + 1
Unfortunately I get an error
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-111-489a538377fc> in <module>
3 y = 0;
4 for name in names:
----> 5 sns.boxplot(x= df[name], ax=axes[y])
6 y = y + 1
AttributeError: 'numpy.ndarray' object has no attribute 'boxplot'
I understand there is a problem with df[name] but I can't see how to fix it.
Would someone be able to point me in the right direction?
Thank you very much.
The problem comes from passing ax=axes[y] to boxplot. axes is a 2-d numpy array with shape (2, 3) that contains the grid of Matplotlib axes that you requested. So axes[y] is a 1-d numpy array that contains three Matplotlib AxesSubplot objects. I suspect boxplot is attempting to dispatch to this argument, and it expects it to be an object with a boxplot method. You can fix this by indexing axes with the appropriate row and column that you want to use.
Here's your script, with a small change to do that:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
d = {'col1': [1, 2, 5.5, 100], 'col2': [3, 4, 0.2, 3], 'col3': [1, 4, 6, 30], 'col4': [2, 24, 0.2, 13], 'col5': [9, 84, 0.9, 3]}
df = pd.DataFrame(data=d)
names = list(df.columns)

n_cols = 3
# Ceiling division: round() under-allocates rows (e.g. 4 columns -> 1 row of
# 3 axes), which leaves some columns without an Axes. Always keep at least
# one row so that subplots() gets a valid grid.
n_rows = max(1, -(-len(names) // n_cols))
# squeeze=False keeps `axes` two-dimensional even when there is a single row,
# so the grid can be walked uniformly.
f, axes = plt.subplots(n_rows, n_cols, squeeze=False)

# axes.flat walks the grid row by row, pairing each column with one Axes.
for ax, name in zip(axes.flat, names):
    sns.boxplot(x=df[name], ax=ax)

# Hide the grid cells that did not receive a box plot.
for ax in axes.flat[len(names):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()
The plot:
How do I add conditional coloring to this table?
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({'A':[16, 15, 14, 16],
'B': [3, -2, 5, 0],
'C': [200000, 3, 6, 800000],
'D': [51, -6, 3, 2]})
fig, ax = plt.subplots(figsize=(10,5))
# Hide the axes frame and ticks so that only the table is visible.
ax.axis('tight')
ax.axis('off')
# Render the frame's values as a table, with the column names as the header.
the_table = ax.table(cellText = df.values, colLabels = df.columns, loc='center')
plt.show()
How do I add conditional coloring to the table where column A and column D values are greater than or equal to 15, the cells are red; else they're green. If column B and column C values are greater than or equal to 5, the cells are red; else they're green. This is what it should look like:
Generate a list of lists and feed it to cellColours. Make sure that the list of lists contains as many lists as you have rows in the data frame and each of the lists within the list of lists contains as many strings as you have columns in the data frame.
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({'A': [16, 15, 14, 16],
                   'B': [3, -2, 5, 0],
                   'C': [200000, 3, 6, 800000],
                   'D': [51, -6, 3, 2]})

# A cell is red when its value is greater than or equal to its column's
# threshold, otherwise green. Column C uses ">= 5" like column B, as the
# question requires (the earlier version tested "> 5").
RED_THRESHOLDS = {"A": 15, "B": 5, "C": 5, "D": 15}


def cell_colors(frame, thresholds=None):
    """Return the per-cell colour matrix for ``ax.table(cellColours=...)``.

    Args:
        frame: DataFrame whose cells are to be coloured.
        thresholds: Mapping of column name to the smallest value that is
            coloured red. Defaults to ``RED_THRESHOLDS``. Columns without an
            entry are always green.

    Returns:
        A list with one inner list per row of ``frame``; each inner list
        holds one colour string ("r" or "g") per column, in column order.
    """
    if thresholds is None:
        thresholds = RED_THRESHOLDS
    return [
        [
            "r" if col in thresholds and value >= thresholds[col] else "g"
            for col, value in zip(frame.columns, row)
        ]
        for row in frame.itertuples(index=False, name=None)
    ]


colors = cell_colors(df)
fig, ax = plt.subplots(figsize=(10,5))
# Hide the axes frame and ticks so that only the table is visible.
ax.axis('tight')
ax.axis('off')
# cellColours takes one colour per cell, laid out like cellText (rows x columns).
the_table = ax.table(cellText = df.values, colLabels = df.columns, loc='center', cellColours=colors)
plt.show()
I'm trying to annotate the values for a stacked horizontal bar graph created using pandas. Current code is below
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Question code, kept as posted.
d = {'group 1': [1, 2, 5, 7, 4, 5, 10],
'group 2': [5, 6, 1, 8, 2, 6, 2],
'group 3': [12, 2, 2, 4, 4, 8, 4]}
df = pd.DataFrame(d)
ax = df.plot.barh(stacked=True, figsize=(10,12))
for p in ax.patches:
    # p.get_x() is where the segment starts, not its length, so the text shows
    # the starting x position instead of the segment's value.
    ax.annotate(str(p.get_x()), xy=(p.get_x(), p.get_y()+0.2))
plt.legend(bbox_to_anchor=(0, -0.15), loc=3, prop={'size': 14}, frameon=False)
The problem is the annotation method I used gives the x starting points and not the values of each segment. I'd like to be able to annotate values of each segment in the center of each segment for each of the bars.
edit: for clarity, what I would like to achieve is something like this where the values are centered horizontally (and vertically) for each segment:
You can use the patches bbox to get the information you want.
ax = df.plot.barh(stacked=True, figsize=(10, 12))
for patch in ax.patches:
    # The patch's bounding box gives its origin and extent; for a horizontal
    # bar the width is the value of the segment.
    x0, y0, seg_width, seg_height = patch.get_bbox().bounds
    centre = (x0 + seg_width / 2, y0 + seg_height / 2)
    ax.annotate(str(seg_width), xy=centre, ha='center', va='center')
Another possible solution is to get your df.values to a flatten array via values = df.values.flatten("F")
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
d = {
    'group 1': [1, 2, 5, 7, 4, 5, 10],
    'group 2': [5, 6, 1, 8, 2, 6, 2],
    'group 3': [12, 2, 2, 4, 4, 8, 4],
}
df = pd.DataFrame(d)
ax = df.plot.barh(stacked=True, figsize=(10,12))
# Column-major flattening lists the values one column (group) after another;
# the values are paired with ax.patches position by position.
values = df.values.flatten("F")
for value, patch in zip(values, ax.patches):
    # Place the text at the horizontal middle of the segment.
    label_xy = (patch.get_x() + value / 2, patch.get_y() + 0.2)
    ax.annotate(str(value), xy=label_xy)
plt.legend(bbox_to_anchor=(0, -0.15), loc=3, prop={'size': 14}, frameon=False);
From matplotlib 3.4.0 use matplotlib.pyplot.bar_label
The labels parameter can be used to customize annotations, but it's not required.
See this answer for additional details and examples.
Each group of containers must be iterated through to add labels.
Tested in python 3.10, pandas 1.4.2, matplotlib 3.5.1
Horizontal Stacked
d = {
    'group 1': [1, 2, 5, 7, 4, 5, 10],
    'group 2': [5, 6, 1, 8, 2, 6, 2],
    'group 3': [12, 2, 2, 4, 4, 8, 4],
}
df = pd.DataFrame(d)

# Append the row totals as 'tot' and order the rows by them.
df = df.assign(tot=df.sum(axis=1)).sort_values('tot')

# Plot every column except the helper column 'tot'.
ax = df.drop(columns='tot').plot.barh(stacked=True, figsize=(10, 12))

# Each container holds the bars of one column; label them group by group.
for container in ax.containers:
    widths = [bar.get_width() for bar in container]
    # No decimals; segments of width zero (or less) get an empty label.
    labels = [f'{w:.0f}' if w > 0 else '' for w in widths]
    ax.bar_label(container, labels=labels, label_type='center')
Horizontal Grouped
Not stacked is a better presentation of the data, because it is easier to compare bar lengths visually.
# plot all columns except tot
# Plot every column except the last one (the helper column 'tot').
value_columns = df.iloc[:, :-1]
ax = value_columns.plot.barh(stacked=False, figsize=(8, 9))

# Each container holds the bars of one column; label them group by group.
for container in ax.containers:
    widths = [bar.get_width() for bar in container]
    # No decimals; bars of width zero (or less) get an empty label.
    labels = [f'{w:.0f}' if w > 0 else '' for w in widths]
    ax.bar_label(container, labels=labels, label_type='center')
df view
group 1 group 2 group 3 tot
2 5 1 2 8
1 2 6 2 10
4 4 2 4 10
6 10 2 4 16
0 1 5 12 18
3 7 8 4 19
5 5 6 8 19