Understanding why looping subplot does not work with loc() - python

Considering the following toy dataset:
df = pd.DataFrame(np.random.randint(0,100,size=(100, 1)), columns=["A"])`
my_ID = ["control", "sample1", "sample2", "sample3"]
new_ID = 25 * my_ID
df["new_ID"] = new_ID
I make a subplot for df["A"] based on the unique values of df["new_ID"] as follows:
unique_list = list(df["new_ID"].unique())
fig, axs = plt.subplots(int(len(unique_list) / 2) , 2)
for unique_sample, ax in zip(unique_list, axs.flat):
sns.histplot(data=df[df["new_ID"] == unique_sample], x= "A", ax=ax)
ax.set_title(unique_sample)
The code works well when we filter with df[df["new_ID"] == unique_sample].
However, the script will give us four identical subplots if we write it as:
d = df.loc[df["new_ID"] == unique_sample]
fig, axs = plt.subplots(int(len(unique_list) / 2) , 2)
for unique_sample, ax in zip(unique_list, axs.flat):
g1 = sns.histplot(data=d, x= "A", ax=ax)
ax.set_title(unique_sample)
I am wondering why loc() cannot be used in the iteration, although we can use it when making single plots.

Cannot reproduce unfortunately. Using your code - there are four different plots. seaborn 0.11.2, pandas 1.3.5.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.randint(0,100,size=(100, 1)), columns=["A"])
my_ID = ["control", "sample1", "sample2", "sample3"]
new_ID = 25 * my_ID
df["new_ID"] = new_ID
unique_list = df["new_ID"].unique()
print(unique_list)
#I make a subplot for df["A"] based on the unique values of df["new_ID"] as follows:
fig, axs = plt.subplots(int(len(unique_list) / 2) , 2)
for unique_sample, ax in zip(unique_list, axs.flat):
sns.histplot(data=df[df["new_ID"] == unique_sample], x= "A", ax=ax)
ax.set_title(unique_sample)
plt.tight_layout()
For what it's worth, consider using sns.displot(kind="hist", ...) for this job! It has built in support for facet grid and will do the "facetting" by new_ID easily for you, you don't have to manage the axes manually.
This is all it takes to get the equivalent display:
sns.displot(data=df, col="new_ID", col_wrap=2, x="A");

Related

How to set ordering of categories in Pandas stacked bar chart

I am trying to make a Pandas bar plot with custom-ordered categories (elements shown with different colors). Here's my code, where I expect the ordering of the categories from bottom to top to follow "catorder":
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df2 = pd.DataFrame({"series":["ser1","ser1","ser1", "ser2", "ser2","ser2"],
"cate":["aatu","boiler","heat pump","aatu","boiler","heat pump"],
"val": [6,15,24,7,15, 21] })
ac2 = pd.pivot_table(df2, values = "val", index = "series", columns = "cate")
catorder= ["heat pump","aatu","boiler"]
ac2.columns = pd.CategoricalIndex(ac2.columns.values,
ordered=True,
categories=catorder)
ac2.sort_index(axis = 1)
fig = plt.figure(figsize=(6,3.5))
ax1 = fig.add_subplot(111)
ac2.plot.bar(stacked=True, ax = ax1)
plt.show()
The problem is that it doesn't work. Categories are still in alphabetical order. Any ideas how to accomplish this common task?
You need to sort the data before plotting:
ac2.sort_index(axis=1).plot.bar(stacked=True, ax = ax1)
Output:

Divide axes.table multiindex into different columns

I am trying to create a crosstab with a multi-level index which I need to print to a PDF.
I am using matplotlib for printing data to the PDF and am not able to find any method which helps print a dataframe directly to PDF.
So I am using axes.table to convert the dataframe to a table to be printed to the PDF.
However, the 2 index levels in the dataframe are combined into 1 in the table.
See output below
Can these index levels ('ABC', 'D') be separated into 2 columns, like ABC | D?
If yes, how?
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf
import pandas as pd
pdf = matplotlib.backends.backend_pdf.PdfPages("test.pdf")
fig = plt.figure(figsize=(20, 20))
grid = plt.GridSpec(1, 2, wspace=0.2,width_ratios=[14, 6])
plt.autoscale()
ax0 = fig.add_subplot(grid[0 ,0])
ax1 = fig.add_subplot(grid[0, 1])
df = pd.DataFrame({'country': ['ABC','PQR','XYZ','ABC','PQR'], 'region': ['D','E','F','D','F'], 'month_day':[1,1,1,2,3],'sales' : [100,200,300,500,100]})
table=pd.pivot_table(df, values='sales', index=['country','region'], columns=['month_day'], aggfunc=sum, fill_value=0)
#for printing on pdf
the_table = ax0.table(cellText=table.values,colLabels=table.columns,rowLabels=table.index,loc='center')
pdf.savefig(fig, bbox_inches='tight')
pdf.close()
Found a solution after a few tries.
table.reset_index(inplace=True)
worked in this case.
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf
import pandas as pd
# Build the figure: a wide axes that holds the table and a narrow spare axes.
fig = plt.figure(figsize=(20, 20))
grid = plt.GridSpec(1, 2, wspace=0.2, width_ratios=[14, 6])
plt.autoscale()
ax0 = fig.add_subplot(grid[0, 0])
ax1 = fig.add_subplot(grid[0, 1])
df = pd.DataFrame({'country': ['ABC', 'PQR', 'XYZ', 'ABC', 'PQR'], 'region': ['D', 'E', 'F', 'D', 'F'], 'month_day': [1, 1, 1, 2, 3], 'sales': [100, 200, 300, 500, 100]})
# Pass the aggregation by name: handing the built-in ``sum`` to pandas is
# deprecated in recent pandas releases, while "sum" works in old and new ones.
table = pd.pivot_table(df, values='sales', index=['country', 'region'], columns=['month_day'], aggfunc="sum", fill_value=0)
# Turn the (country, region) index levels into ordinary columns so that each
# one is rendered as its own table column instead of a combined row label.
table.reset_index(inplace=True)
the_table = ax0.table(cellText=table.values, colLabels=table.columns, colWidths=[0.07, 0.06, 0.04, 0.04, 0.04], loc='center')
ax0.axis("off")
ax1.axis("off")
plt.axis("off")
# The context manager closes the PDF file even if savefig raises.
with matplotlib.backends.backend_pdf.PdfPages("test.pdf") as pdf:
    pdf.savefig(fig, bbox_inches='tight')

How to automatically create a set of subplots by looping through a list of dataframe columns

I want to create efficient code in which I can pass a set of dataframe columns to a for-loop or list comprehension and it will return a set of subplots of the same type (one for each variable) depending on the type of matplotlib or seaborn plot I want to use. I'm looking for an approach that is relatively agnostic to the type of graph.
I've only tried to create code using matplotlib. Below, I provide a simple dataframe and the latest code I tried.
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.DataFrame({"A": [1, 2,8,3,4,3], "B": [0, 2,4,8,3,2], "C": [0, 0,7,8,2,1]},
index =[1995,1996,1997,1998,1999,2000] )
df.index.name='Year'
fig, axs = plt.subplots(ncols=3,figsize=(8,4))
for yvar in df:
ts = pd.Series(yvar, index = df.index)
ts.plot(kind = 'line',ax=axs[i])
plt.show()
I expect to see a subplot for each variable that is passed to the loop.
Is this what you are looking for?
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({"A": [1, 2,8,3,4,3], "B": [0, 2,4,8,3,2], "C": [0, 0,7,8,2,1]},
index =[1995,1996,1997,1998,1999,2000] )
plt.figure(figsize=(10,10))
for i, col in enumerate(df.columns):
plt.subplot(1,3,i+1)
plt.plot(df.index, df[col], label=col)
plt.xticks(df.index)
plt.legend(loc='upper left')
plt.show()
Use plt.subplot(no_of_rows, no_of_cols, current_subplot_number) to make a subplot the current plotting target. Any plotting done afterwards will go to the subplot numbered current_subplot_number.
Loop over both the columns and the axes simultaneously. Show the plot outside the loop.
fig, axs = plt.subplots(ncols=len(df.columns), figsize=(8,4))
for ax, yvar in zip(axs.flat, df):
df[yvar].plot(ax=ax)
plt.show()
Alternatively, you can also directly plot the complete dataframe
fig, axs = plt.subplots(ncols=len(df.columns), figsize=(8,4))
df.plot(subplots=True, ax=axs)
plt.show()

Matplotlib: cbar.set_xticklabels has no effects

I've assigned the 365 days of a year to several clusters and I'm now trying to plot them on a heatmap.
My code works fine except that cbar.set_ticks(some_range) has no effect: the tick labels on my colorbar have the right text but the wrong position.
Here is a MCVE
from datetime import date
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import matplotlib
import seaborn as sns
#create some random data
n_cluster = 4
index = pd.date_range('01/01/2016', end='31/12/2016', freq='1D')
df = pd.DataFrame(np.random.randint(0, n_cluster, len(index)),
index=index, columns=['cluster'])
pivot = df.pivot_table('cluster',
columns=[lambda x: x.weekofyear],
index= [lambda x: x.dayofweek])
#yticklabels of the heatmap
days = [date(2018, 1, d).strftime('%a')[:3] for d in range(1, 8)]
#get a discrete cmap
cmap = plt.cm.get_cmap('RdBu', n_cluster)
fig = plt.figure(figsize=(10,3))
gs = matplotlib.gridspec.GridSpec(1, 2, width_ratios=[50,1])
ax = plt.subplot(gs[0])
cbar = plt.subplot(gs[1])
sns.heatmap(pivot, square=True, cmap=cmap,
yticklabels=days, ax=ax, cbar_ax=cbar)
#There is something wrong here
cbar.set_yticks([i + 1/(2.0*n_cluster) for i in np.arange(0, 1, 1.0/n_cluster)])
#This one is ok
cbar.set_yticklabels(range(0, n_cluster))
Thanks for your help
As a workaround, the following adds the correct labels in the correct place,
cbar.yaxis.set_ticks([0.125, 0.375, 0.625, 0.875])
which looks like,
EDIT:
Or the more general suggestion of mfitzp,
cbar.yaxis.set_ticks([i + 1/(2.0*n_cluster)
for i in np.arange(0, 1, 1.0/n_cluster)])

Pandas plot: Assign Colors

I have many data frames that I am plotting for a presentation. These all have different columns, but all contain the same additional column foobar. At the moment, I am plotting these different data frames using
df.plot(secondary_y='foobar')
Unfortunately, since these data frames all have different additional columns with different ordering, the color of foobar is always different. This makes the presentation slides unnecessarily complicated. I would like foobar to be plotted bold and black throughout the different plots.
Looking at the docs, the only thing coming close appears to be the parameter colormap - I would need to ensure that the xth color in the color map is always black, where x is the order of foobar in the data frame. Seems to be more complicated than it should be, also this wouldn't make it bold.
Is there a (better) approach?
I would suggest using matplotlib directly rather than the dataframe plotting methods. If df.plot returned the artists it added instead of an Axes object it wouldn't be too bad to change the color of the line after it was plotted.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
def pandas_plot(ax, df, callout_key):
    """
    Plot every column of ``df`` as a line on ``ax``, highlighting one column.

    Parameters
    ----------
    ax : mpl.Axes
        The axes to draw to
    df : DataFrame
        Data to plot; its index supplies the x values
    callout_key : str
        key to highlight (drawn in black with line width 2)

    Returns
    -------
    dict
        Maps each column label to the line artist created for it
    """
    artists = {}
    x = df.index.values
    # DataFrame.iteritems() was removed in pandas 2.0; items() is the
    # equivalent spelling that works in both old and new versions.
    for k, v in df.items():
        style_kwargs = {}
        if k == callout_key:
            style_kwargs['c'] = 'k'
            style_kwargs['lw'] = 2
        # Label each line so that ax.legend() below has entries to display.
        ln, = ax.plot(x, v.values, label=k, **style_kwargs)
        artists[k] = ln
    ax.legend()
    # np.min/np.max raise on an empty array, so only set limits when
    # there is at least one x value.
    if x.size:
        ax.set_xlim(np.min(x), np.max(x))
    return artists
Usage:
fig, ax = plt.subplots()
ax2 = ax.twinx()
th = np.linspace(0, 2*np.pi, 1024)
df = pd.DataFrame({'cos': np.cos(th), 'sin': np.sin(th),
'foo': np.sin(th + 1), 'bar': np.cos(th +1)}, index=th)
df2 = pd.DataFrame({'cos': -np.cos(th), 'sin': -np.sin(th)}, index=th)
pandas_plot(ax, df, 'sin')
pandas_plot(ax2, df2, 'sin')
Perhaps you could define a function which handles the special column in a separate plot call:
def emphasize_plot(ax, df, col, **emphargs):
    """Plot ``df`` on ``ax``, drawing column ``col`` in a separate call.

    All columns other than ``col`` are plotted first with default styling.
    ``col`` is then plotted on the same axes, with ``emphargs`` forwarded
    to that plot call (for example ``lw=2, c='k'``).
    """
    background = df[[name for name in df.columns if name != col]]
    background.plot(ax=ax)
    df[col].plot(ax=ax, **emphargs)
Using code from tcaswell's example,
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def emphasize_plot(ax, df, col, **emphargs):
    """Plot ``df`` on ``ax``, drawing column ``col`` in a separate call.

    All columns other than ``col`` are plotted first with default styling.
    ``col`` is then plotted on the same axes, with ``emphargs`` forwarded
    to that plot call (for example ``lw=2, c='k'``).
    """
    background = df[[name for name in df.columns if name != col]]
    background.plot(ax=ax)
    df[col].plot(ax=ax, **emphargs)
fig, ax = plt.subplots()
th = np.linspace(0, 2*np.pi, 1024)
df = pd.DataFrame({'cos': np.cos(th), 'foobar': np.sin(th),
'foo': np.sin(th + 1), 'bar': np.cos(th +1)}, index=th)
df2 = pd.DataFrame({'cos': -np.cos(th), 'foobar': -np.sin(th)}, index=th)
emphasize_plot(ax, df, 'foobar', lw=2, c='k')
emphasize_plot(ax, df2, 'foobar', lw=2, c='k')
plt.show()
yields
I used #unutbut's answer and extended it to allow for a secondary y axis and correct legends:
def emphasize_plot(ax, df, col, **emphargs):
    """Plot ``df`` with column ``col`` emphasized on a secondary y axis.

    All columns other than ``col`` are drawn on ``ax``. ``col`` is drawn on
    a twin axes sharing the x axis, with ``emphargs`` forwarded to its plot
    call (for example ``lw=2, c='k'``). A single legend that combines the
    entries of both axes is attached to the twin axes.
    """
    columns = [c for c in df.columns if c != col]
    ax2 = ax.twinx()
    # legend=False: DataFrame.plot draws its own legend on ax by default,
    # which would duplicate the combined legend built below.
    df[columns].plot(ax=ax, legend=False)
    df[col].plot(ax=ax2, **emphargs)
    # Collect the handles from both axes so one legend lists every line.
    lines, labels = ax.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc=0)

Categories

Resources