Divide axes.table multiindex into different columns - python

I am trying to create a crosstab with multiple index which I need to print on pdf.
I am using matplotlib for printing data on pdf and am not able to find any method which helps print dataframe directly to pdf.
So I am using axes.table to convert the dataframe into a table that can be printed on pdf.
However, the 2 indexes in dataframe are combined in 1 in table.
See output below
Can these indexes ('ABC', 'D') be separated into 2 columns, like ABC | D?
If Yes, how?
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf
import pandas as pd
pdf = matplotlib.backends.backend_pdf.PdfPages("test.pdf")
fig = plt.figure(figsize=(20, 20))
grid = plt.GridSpec(1, 2, wspace=0.2,width_ratios=[14, 6])
plt.autoscale()
ax0 = fig.add_subplot(grid[0 ,0])
ax1 = fig.add_subplot(grid[0, 1])
df = pd.DataFrame({'country': ['ABC','PQR','XYZ','ABC','PQR'], 'region': ['D','E','F','D','F'], 'month_day':[1,1,1,2,3],'sales' : [100,200,300,500,100]})
table=pd.pivot_table(df, values='sales', index=['country','region'], columns=['month_day'], aggfunc=sum, fill_value=0)
#for printing on pdf
the_table = ax0.table(cellText=table.values,colLabels=table.columns,rowLabels=table.index,loc='center')
pdf.savefig(fig, bbox_inches='tight')
pdf.close()

Found a solution after a few tries.
table.reset_index(inplace=True)
worked in this case.
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf
import pandas as pd

# Sample sales data: one row per (country, region, day of month) sale.
df = pd.DataFrame({
    'country': ['ABC', 'PQR', 'XYZ', 'ABC', 'PQR'],
    'region': ['D', 'E', 'F', 'D', 'F'],
    'month_day': [1, 1, 1, 2, 3],
    'sales': [100, 200, 300, 500, 100],
})

# Total sales per (country, region) row and month_day column.
# The aggregation is named as a string: passing the builtin ``sum`` is
# deprecated in recent pandas releases.
table = pd.pivot_table(df, values='sales', index=['country', 'region'],
                       columns=['month_day'], aggfunc='sum', fill_value=0)

# Turn the two index levels into ordinary columns so that axes.table
# renders 'country' and 'region' as two separate table columns.
table = table.reset_index()

fig = plt.figure(figsize=(20, 20))
grid = plt.GridSpec(1, 2, wspace=0.2, width_ratios=[14, 6])
# NOTE: no plt.autoscale() here. The figure has no axes yet at this point,
# so the call would implicitly create an extra, unwanted axes.
ax0 = fig.add_subplot(grid[0, 0])
ax1 = fig.add_subplot(grid[0, 1])

# colWidths needs one entry per column of `table` (2 index columns + 3 days).
the_table = ax0.table(cellText=table.values, colLabels=table.columns,
                      colWidths=[0.07, 0.06, 0.04, 0.04, 0.04], loc='center')

# Hide the axes frames and ticks so that only the table is visible.
ax0.axis("off")
ax1.axis("off")

# The context manager closes the PDF file even if savefig raises.
with matplotlib.backends.backend_pdf.PdfPages("test.pdf") as pdf:
    pdf.savefig(fig, bbox_inches='tight')

Related

Understanding why looping subplot does not work with loc()

Considering the following toy dataset:
# 100 random integers in [0, 100) stored in a single column "A".
df = pd.DataFrame(np.random.randint(0, 100, size=(100, 1)), columns=["A"])
# Repeat the four labels 25 times so that each of the 100 rows gets an ID.
my_ID = ["control", "sample1", "sample2", "sample3"]
new_ID = 25 * my_ID
df["new_ID"] = new_ID
I make a subplot for df["A"] based on the unique values of df["new_ID"] as follows:
unique_list = list(df["new_ID"].unique())
fig, axs = plt.subplots(int(len(unique_list) / 2) , 2)
for unique_sample, ax in zip(unique_list, axs.flat):
sns.histplot(data=df[df["new_ID"] == unique_sample], x= "A", ax=ax)
ax.set_title(unique_sample)
The code functions well as we use df[df["new_ID"]].
However, the script will give us four same subplots if we write it as:
d = df.loc[df["new_ID"] == unique_sample]
fig, axs = plt.subplots(int(len(unique_list) / 2) , 2)
for unique_sample, ax in zip(unique_list, axs.flat):
g1 = sns.histplot(data=d, x= "A", ax=ax)
ax.set_title(unique_sample)
I am wondering why loc() cannot be used in iteration, although, we can use it in making single plots.
Cannot reproduce unfortunately. Using your code - there are four different plots. seaborn 0.11.2, pandas 1.3.5.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.randint(0,100,size=(100, 1)), columns=["A"])
my_ID = ["control", "sample1", "sample2", "sample3"]
new_ID = 25 * my_ID
df["new_ID"] = new_ID
unique_list = df["new_ID"].unique()
print(unique_list)
#I make a subplot for df["A"] based on the unique values of df["new_ID"] as follows:
fig, axs = plt.subplots(int(len(unique_list) / 2) , 2)
for unique_sample, ax in zip(unique_list, axs.flat):
sns.histplot(data=df[df["new_ID"] == unique_sample], x= "A", ax=ax)
ax.set_title(unique_sample)
plt.tight_layout()
For what it's worth, consider using sns.displot(kind="hist", ...) for this job! It has built in support for facet grid and will do the "facetting" by new_ID easily for you, you don't have to manage the axes manually.
This is all it takes to get the equivalent display:
sns.displot(data=df, col="new_ID", col_wrap=2, x="A");

Matplotlib - plotting grouped values with a for loop

I'm trying to plot a graph grouped by column values using a for loop without knowing the number of unique values in that column.
You can see sample code below (without a for loop) and the desired output.
I would like that each plot will have different color and marker (as seen below).
This is the code:
import pandas as pd
from numpy import random
df = pd.DataFrame(data = random.randn(5,4), index = ['A','B','C','D','E'],
columns = ['W','X','Y','Z'])
df['W'] = ['10/01/2018 12:00:00','10/03/2018 13:00:00',
'10/03/2018 12:30:00','10/04/2018 12:05:00',
'10/08/2018 12:00:15']
df['W']=pd.to_datetime(df['W'])
df['Entity'] = ['C201','C201','C201','C202','C202']
print(df.head())
fig, ax = plt.subplots()
df[df['Entity']=="C201"].plot(x="W",y="Y",label='C201',ax=ax,marker='x')
df[df['Entity']=="C202"].plot(x="W",y="Y",label='C202',ax=ax, marker='o')
This is the output:
You can first find out the unique values of your df['Entity'] and then loop over them. To generate new markers automatically for each Entity, you can define an order of some markers (let's say 5 in the answer below) which will repeat via marker=next(marker).
Complete minimal answer
import itertools
import pandas as pd
from numpy import random
import matplotlib.pyplot as plt

# Marker styles handed out in turn; the cycle restarts after five entities.
marker = itertools.cycle(('+', 'o', '*', '^', 's'))

df = pd.DataFrame(data=random.randn(5, 4), index=['A', 'B', 'C', 'D', 'E'],
                  columns=['W', 'X', 'Y', 'Z'])
df['W'] = ['10/01/2018 12:00:00', '10/03/2018 13:00:00',
           '10/03/2018 12:30:00', '10/04/2018 12:05:00',
           '10/08/2018 12:00:15']
df['W'] = pd.to_datetime(df['W'])
df['Entity'] = ['C201', 'C201', 'C201', 'C202', 'C202']

fig, ax = plt.subplots()
# groupby yields every distinct Entity (in sorted order) together with its
# rows, so the number of entities does not have to be known in advance and
# no extra numpy import is needed to enumerate them.
for entity, group in df.groupby('Entity'):
    group.plot(x="W", y="Y", label=entity, ax=ax, marker=next(marker))
plt.legend()
plt.show()

Add text annotation to plot from a pandas dataframe

My Code:
import matplotlib.pyplot as plt
import pandas as pd
import os, glob
path = r'C:/Users/New folder'
all_files = glob.glob(os.path.join(path, "*.txt"))
df = pd.DataFrame()
for file_ in all_files:
file_df = pd.read_csv(file_,sep=',', parse_dates=[0], infer_datetime_format=True,header=None, usecols=[0,1,2,3,4,5,6], names=['Date','Time','open', 'high', 'low', 'close','volume','tradingsymbol'])
df = df[['Date','Time','close','volume','tradingsymbol']]
df["Time"] = pd.to_datetime(df['Time'])
df.set_index('Time', inplace=True)
print(df)
fig, axes = plt.subplots(nrows=2, ncols=1)
################### Volume ###########################
df.groupby('tradingsymbol')['volume'].plot(legend=True, rot=0, grid=True, ax=axes[0])
################### PRICE ###########################
df.groupby('tradingsymbol')['close'].plot(legend=True, rot=0, grid=True, ax=axes[1])
plt.show()
My Current Output is like:
I need add text annotation to matplotlib plot. My desired output similar to below image:
It's hard to answer this question without access to your dataset, or a simpler example. However, I'll try my best.
Let's begin by setting up a dataframe which may or may not resemble your data:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.randint(low=0, high=10, size=(5, 3)),
columns=['a', 'b', 'c'])
With the dataset we'll now proceed to plot it with
fig, ax = plt.subplots(1, 1)
df.plot(legend=True, ax=ax)
Finally, we'll loop over the columns and annotate each datapoint as
# Write each value next to its data point, column by column.
# The x coordinate is the value's 0-based position within the column
# (assumes the frame is plotted against a default 0..n-1 index).
for col in df.columns:
    # `pos` rather than `id`, so the builtin id() is not shadowed.
    for pos, val in enumerate(df[col]):
        ax.text(pos, val, str(val))
This gave me the following plot, which resembles your desired figure.

multiple boxplots by date in index

My dataframe
index Dates Hours_played
0 2014-11-06 11
1 2014-12-06 4
2 2015-09-06 5
3 2015-07-06 5
Then, I set Dates as index:
Hours_played
Dates
2014-11-06 11
2014-12-06 4
2015-09-06 5
2015-07-06 5
The Problem: When I tried to create one box plot for each year found in index, I got both plots on the same grid.
df.loc['2014']['Hours_played'].plot.box(ylim=(0,200))
df.loc['2015']['Hours_played'].plot.box(ylim=(0,200))
I tried the following but the plot comes up empty:
data_2015 = df.loc['2015']['Hours_played']
data_2016 = df.loc['2016']['Hours_played']
data_to_plot = [data_2015, data_2016]
mpl_fig = plt.figure()
ax = mpl_fig.add_subplot(111)
ax.boxplot(data_to_plot)
ax.set_ylim(0,300)
Is it possible to have them in the same grid, one by the other?
A simple solution will be grouping by year first and then making boxplot:
import io
import matplotlib.pyplot as plt
import pandas as pd
# Re-create your sample data
s = """Dates,Hours_played
2014-11-06,11
2014-12-06,4
2015-09-06,5
2015-07-06,5"""
df = pd.read_table(io.StringIO(s), sep=',', index_col=0, parse_dates=True)
# The following codes are the answer relevant to your question.
df.groupby(df.index.year).boxplot()
plt.show()
Your second method ends up with an empty plot because matplotlib fails to recognize pandas.DataFrame correctly. Try using a NumPy array representation instead:
import io
import matplotlib.pyplot as plt
import pandas as pd

# Re-create your sample data
s = """Dates,Hours_played
2014-11-06,11
2014-12-06,4
2015-09-06,5
2015-07-06,5"""
df = pd.read_table(io.StringIO(s), sep=',', index_col=0, parse_dates=True)

# The following codes are the answer relevant to your question.
# Select the hours for each year as a plain 1-D NumPy array.
# DataFrame.as_matrix() no longer exists in current pandas; to_numpy() is
# its replacement.
data_2014 = df.loc[df.index.year == 2014, 'Hours_played'].to_numpy()
data_2015 = df.loc[df.index.year == 2015, 'Hours_played'].to_numpy()
data_to_plot = [data_2014, data_2015]

# One box per array, drawn side by side on the same axes.
mpl_fig = plt.figure()
ax = mpl_fig.add_subplot(111)
ax.boxplot(data_to_plot)
plt.show()
To use subplots, you will need to plot them one by one:
import io
import matplotlib.pyplot as plt
import pandas as pd
# Re-create your sample data
s = """Dates,Hours_played
2014-11-06,11
2014-12-06,4
2015-09-06,5
2015-07-06,5"""
df = pd.read_table(io.StringIO(s), sep=',', parse_dates=[0])
df['Year'] = df.Dates.dt.year
df.set_index(['Year', 'Dates'], inplace=True)
# The following codes are the answer relevant to your question.
mpl_fig = plt.figure()
ax1 = mpl_fig.add_subplot(121)
ax1.boxplot(df.loc[2014]['Hours_played'], labels=[2014])
ax2 = mpl_fig.add_subplot(122)
ax2.boxplot(df.loc[2015]['Hours_played'], labels=[2015])
plt.show()
Let's reshape the data with Years in columns and boxplot:
df.set_index(['Dates',df.Dates.dt.year])['Hours_played'].unstack().boxplot()
If you want to put all the boxes in the same plot, you can do something like this:
import matplotlib.pyplot as plt
def setBoxColors(bp, num_plots):
    """Colour the first ``num_plots`` boxes of a boxplot result.

    Parameters
    ----------
    bp : dict
        The dictionary returned by ``plt.boxplot``; its 'boxes', 'caps',
        'whiskers', 'fliers' and 'medians' entries are recoloured in place.
    num_plots : int
        Number of boxes contained in ``bp`` that should be coloured.
    """
    color = ['red', 'blue', 'green']
    for idx in range(num_plots):
        # Wrap around so that more than len(color) boxes do not fail.
        c = color[idx % len(color)]
        plt.setp(bp['boxes'][idx], color=c)
        # Every box owns two caps and two whiskers (lower and upper).
        plt.setp(bp['caps'][2 * idx], color=c)
        plt.setp(bp['caps'][2 * idx + 1], color=c)
        plt.setp(bp['whiskers'][2 * idx], color=c)
        plt.setp(bp['whiskers'][2 * idx + 1], color=c)
        # Current matplotlib returns a single flier artist per box (and none
        # at all when fliers are disabled), so index by box and guard the
        # lookup instead of assuming two entries per box.
        if idx < len(bp['fliers']):
            plt.setp(bp['fliers'][idx], color=c, markeredgecolor=c)
        plt.setp(bp['medians'][idx], color=c)
# Some fake data to plot
A = [[1, 2, 5]]
B = [[3, 4, 5]]
C = [[1, 7, 10]]

fig = plt.figure()
ax = plt.axes()
# No plt.hold(True): the function was removed from matplotlib, and
# successive plotting calls already draw onto the same axes.

# Draw every data set as its own box at an explicit x position.
bp = plt.boxplot(A, positions=[2], widths=0.6, patch_artist=True)
setBoxColors(bp, 1)
bp = plt.boxplot(B, positions=[6], widths=0.6, patch_artist=True)
setBoxColors(bp, 1)
bp = plt.boxplot(C, positions=[10], widths=0.6, patch_artist=True)
setBoxColors(bp, 1)

# set axes limits and labels
plt.xlim(0, 12)
plt.ylim(0, 12)
# Fix the tick positions first, then label them, so that each label is
# attached to the tick of its own box.
ax.set_xticks([2, 6, 10])
ax.set_xticklabels(['A', 'B', 'C'])

# draw temporary legend
# A dummy red line supplies the legend handle and is hidden afterwards.
hB, = plt.plot([1, 1], 'r-')
plt.legend((hB,), ('Type1',))
hB.set_visible(False)
plt.show()
With the help of Scott Boston, Y. Luo, and yuhow5566, I was able to devise an interesting answer. From Scott, I learned that it's better not to index the Dates (keep them a regular column) for this type of boxplot; and from Y. Luo, I learned how to create a new column, while isolating the year from a datetime value.
# Derive the year from the frame's own 'Dates' column, kept as a regular
# column rather than the index. The .dt accessor requires a datetime dtype.
df['Year'] = df['Dates'].dt.year
# One box of Hours_played per distinct year.
df.boxplot(column='Hours_played', by='Year', figsize=(9,9))

Matplotlib: cbar.set_xticklabels has no effects

I've assigned the 365 days of a year to several clusters and I'm now trying to plot them on a heatmap.
My code works fine except that cbar.set_ticks(some_range) has no effect: the tick labels on my colorbar have the right text but the wrong position.
Here is a MCVE
from datetime import date
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import matplotlib
import seaborn as sns
#create some random data
n_cluster = 4
index = pd.date_range('01/01/2016', end='31/12/2016', freq='1D')
df = pd.DataFrame(np.random.randint(0, n_cluster, len(index)),
index=index, columns=['cluster'])
pivot = df.pivot_table('cluster',
columns=[lambda x: x.weekofyear],
index= [lambda x: x.dayofweek])
#yticklabels of the heatmap
days = [date(2018, 1, d).strftime('%a')[:3] for d in range(1, 8)]
#get a discrete cmap
cmap = plt.cm.get_cmap('RdBu', n_cluster)
fig = plt.figure(figsize=(10,3))
gs = matplotlib.gridspec.GridSpec(1, 2, width_ratios=[50,1])
ax = plt.subplot(gs[0])
cbar = plt.subplot(gs[1])
sns.heatmap(pivot, square=True, cmap=cmap,
yticklabels=days, ax=ax, cbar_ax=cbar)
#There is something wrong here
cbar.set_yticks([i + 1/(2.0*n_cluster) for i in np.arange(0, 1, 1.0/n_cluster)])
#This one is ok
cbar.set_yticklabels(range(0, n_cluster))
Thanks for your help
As a workaround, the following adds the correct labels in the correct place,
cbar.yaxis.set_ticks([0.125, 0.375, 0.625, 0.875])
which looks like,
EDIT:
Or the more general suggestion of mfitzp,
cbar.yaxis.set_ticks([i + 1/(2.0*n_cluster)
for i in np.arange(0, 1, 1.0/n_cluster)])

Categories

Resources