I have some data where I've manipulated the dataframe using the following code:
import pandas as pd
import numpy as np
data = pd.DataFrame([[0,0,0,3,6,5,6,1],[1,1,1,3,4,5,2,0],[2,1,0,3,6,5,6,1],[3,0,0,2,9,4,2,1],[4,0,1,3,4,8,1,1],[5,1,1,3,3,5,9,1],[6,1,0,3,3,5,6,1],[7,0,1,3,4,8,9,1]], columns=["id", "sex", "split", "group0Low", "group0High", "group1Low", "group1High", "trim"])
data
#remove all where trim == 0
trimmed = data[(data.trim == 1)]
trimmed
#create df with columns to be split
columns = ['group0Low', 'group0High', 'group1Low', 'group1High']
to_split = trimmed[columns]
to_split
level_group = np.where(to_split.columns.str.contains('0'), 0, 1)
# output: array([0, 0, 1, 1])
level_low_high = np.where(to_split.columns.str.contains('Low'), 'low', 'high')
# output: array(['low', 'high', 'low', 'high'], dtype='<U4')
multi_level_columns = pd.MultiIndex.from_arrays([level_group, level_low_high], names=['group', 'val'])
to_split.columns = multi_level_columns
to_split.stack(level='group')
sex = trimmed['sex']
split = trimmed['split']
horizontalStack = pd.concat([sex, split, to_split], axis=1)
horizontalStack
finalData = horizontalStack.groupby(['split', 'sex', 'group'])
finalData.mean()
My question is, how do I plot the mean data using ggplot or seaborn such that for each "split" level I get a graph that looks like this:
At the bottom of the code you can see I've tried to split up the group factor so I can separate the bars, but that resulted in an error (KeyError: 'group') and I think that is related to the way I used multi indexing
I would use a factor plot from seaborn.
Say you have data like this:
import numpy as np
import pandas
import seaborn
seaborn.set(style='ticks')
# Fixed seed so the random demo values are reproducible.
np.random.seed(0)

# Level values for the three categorical factors.
groups, sexes, means = (
    ('Group 1', 'Group 2'),
    ('Male', 'Female'),
    ('Low', 'High'),
)

# One index entry per (Group, Sex, Mean) combination.
index = pandas.MultiIndex.from_product(
    [groups, sexes, means], names=['Group', 'Sex', 'Mean'])

# One random integer in [20, 100) per combination.
values = np.random.randint(low=20, high=100, size=len(index))

# Move the index levels into ordinary columns so they can be addressed by name.
indexed = pandas.DataFrame(data={'val': values}, index=index)
data = indexed.reset_index()
print(data)
Group Sex Mean val
0 Group 1 Male Low 64
1 Group 1 Male High 67
2 Group 1 Female Low 84
3 Group 1 Female High 87
4 Group 2 Male Low 87
5 Group 2 Male High 29
6 Group 2 Female Low 41
7 Group 2 Female High 56
You can then create the factor plot with a single command, plus one extra line to remove the x-labels (which are redundant for your data):
# `factorplot` was renamed to `catplot` in seaborn 0.9 and removed in later
# releases; `catplot` accepts the same arguments used here.
fg = seaborn.catplot(x='Group', y='val', hue='Mean',
                     col='Sex', data=data, kind='bar')
# Drop the x-axis label, which is redundant for this data.
fg.set_xlabels('')
Which gives me:
In a related question I found an alternative solution by @Stein that renders the MultiIndex levels as separate rows of labels. Here is what it looks like for your example:
import pandas as pd
import matplotlib.pyplot as plt
from itertools import groupby
import numpy as np
%matplotlib inline
groups = ('Group 1', 'Group 2')
sexes = ('Male', 'Female')
means = ('Low', 'High')
index = pd.MultiIndex.from_product(
[groups, sexes, means],
names=['Group', 'Sex', 'Mean']
)
values = np.random.randint(low=20, high=100, size=len(index))
data = pd.DataFrame(data={'val': values}, index=index)
# unstack last level to plot two separate columns
data = data.unstack(level=-1)
def add_line(ax, xpos, ypos):
    """Draw a short gray vertical separator on *ax*.

    The line runs from ``ypos`` up to ``ypos + 0.1`` at horizontal position
    ``xpos``. Both coordinates are interpreted in the axes coordinate system
    (``ax.transAxes``), and clipping is switched off so the line is still
    drawn when it lies outside the axes rectangle.
    """
    line = plt.Line2D([xpos, xpos], [ypos + .1, ypos],
                      transform=ax.transAxes, color='gray')
    line.set_clip_on(False)
    ax.add_line(line)
def label_len(my_index, level):
    """Return ``(label, run_length)`` pairs for one level of *my_index*.

    Consecutive equal labels of the requested level are collapsed into a
    single pair whose second element is the number of repetitions. Labels
    that reappear later (non-adjacent) produce a separate pair, because
    ``itertools.groupby`` only merges adjacent items.

    *my_index* must provide ``get_level_values(level)``.
    """
    labels = my_index.get_level_values(level)
    return [(label, len(list(run))) for label, run in groupby(labels)]
def label_group_bar_table(ax, df):
    """Label the x-axis of a bar plot with one row of text per index level.

    Starting with the innermost index level of *df* and moving outwards,
    each run of equal labels (as reported by ``label_len``) gets a centred
    text label below the axes, with a separator line (``add_line``) at the
    start of every run and one at the end of the row. Each successive level
    is placed 0.1 (axes coordinates) lower than the previous one.
    """
    ypos = -.1
    # Width of a single bar slot as a fraction of the axes width.
    scale = 1. / df.index.size
    # Innermost level first, so it ends up closest to the bars.
    for level in reversed(range(df.index.nlevels)):
        pos = 0
        for label, rpos in label_len(df.index, level):
            # Centre the text over the run of `rpos` bars starting at `pos`.
            lxpos = (pos + .5 * rpos) * scale
            ax.text(lxpos, ypos, label, ha='center', transform=ax.transAxes)
            add_line(ax, pos * scale, ypos)
            pos += rpos
        # Closing separator at the right-hand edge of this label row.
        add_line(ax, pos * scale, ypos)
        ypos -= .1
ax = data['val'].plot(kind='bar')
#Below 2 lines remove default labels
ax.set_xticklabels('')
ax.set_xlabel('')
label_group_bar_table(ax, data)
This gives:
Related
Dataset:
I have the above dataset. I need to plot a graph where the starting point of y-axis needs to be 0, irrespective of the values in the dataset. The x-axis is the index(Time) and y-axis is Jolt 1, Jolt 2,...
I have a graph showing 2 different plots (the blue curve starts at y=0). I want the orange plot to start from 0 too to compare the trends visually.
Graph:
Here is the code
import pandas as pd
import matplotlib.pyplot as plt
df.set_index('Time').plot()
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
How do I modify the code to get the desired output?
You can set the 'Time' column as the index of your dataframe:
df = df.set_index('Time')
then you can loop over the remaining columns and plot them against the dataframe index. To make all lines start from 0, you have to subtract from each column its starting value (the value at index 0):
for column in df.columns:
    # Subtract the column's first sample so every line starts at y = 0.
    # Positional access (`iloc[0]`) works even when the index does not
    # contain the label 0.
    ax.plot(df.index, df[column] - df[column].iloc[0], label=column)
Complete Code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Synthetic example data: three random series indexed by time.
df = pd.DataFrame()
df['Time'] = np.arange(0, 0.6, 0.1)
df['Jolt 1'] = np.random.rand(len(df['Time']))
# A random integer offset gives these two series a different starting level.
df['Jolt 2'] = np.random.rand(len(df['Time'])) + np.random.randint(low=-5, high=5)
df['Jolt 3'] = np.random.rand(len(df['Time'])) + np.random.randint(low=-5, high=5)
df = df.set_index('Time')

fig, ax = plt.subplots()
for column in df.columns:
    # Subtract the column's first sample so every line starts at y = 0.
    # Positional access (`iloc[0]`) works even when the index does not
    # contain the label 0.
    ax.plot(df.index, df[column] - df[column].iloc[0], label=column)
ax.legend(frameon=True)
plt.show()
Plot
This is a question about how to properly organize subplots, not how to create stacked bars.
I have the following dataframe:
corpus group mono p non p plus p minus p
0 fairview all 49 51 49 0
1 i2b2 all 46 54 46 0
2 mipacq all 44 56 43 1
and want to arrange the output as given in the two attached figures so that I get ncolumns and 2-rows, instead of two separate subplots with 1 row each (so in this case, there would be 2-rows, 3-columns on a single subplot instead of 1-row, 3-columns on 2 subplots):
I am generating these two figures as separate subplots using the following code:
data = <above dataframe>
semgroups = ['all']
corpus = ['fairview', 'i2b2', 'mipacq']
for sg in semgroups:
i = semgroups.index(sg)
ix = i + 7
ncols = len(set(data.corpus.tolist()))
nrows = len(set(data.group.tolist()))
fig = plt.figure()
fig, axs = plt.subplots(1, ncols, sharey=True)
for ax,(idx,row) in zip(axs.flat, data.iterrows()):
# I WANT TO PLOT BOTH ROWS on same subplot
#row[['mono p', 'non p']].plot.bar(ax=ax, color=['C0','C1'])
row[['plus p', 'minus p']].plot.bar(ax=ax, color=['C0','C1'])
if row['corpus'] == 'fairview':
corpus = 'Fairview'
label = '(d) '
elif row['corpus'] == 'mipacq':
corpus = 'MiPACQ'
if ncols == 3:
label = '(f) '
else:
label = '(b) '
else:
corpus = 'i2b2'
label = '(e) '
ax.set_title(label + corpus)
ax.tick_params(axis='x', labelrotation = 45)
if sg == 'all':
sg = 'All groups'
# Defining custom 'xlim' and 'ylim' values.
custom_ylim = (0, 60)
# Setting the values for all axes.
plt.setp(axs, ylim=custom_ylim)
fig.suptitle('Figure ' + str(ix) + ' ' + sg)
In the code above, I iterate through my df grabbing the following rows to generate both separate subplots:
# BUT, I WANT TO PLOT BOTH ROWS ON SAME SUBPLOT
row[['mono p', 'non p']].plot.bar(ax=ax, color=['C0','C1'])
row[['plus p', 'minus p']].plot.bar(ax=ax, color=['C0','C1'])
No matter how I do this I cannot get the desired two rows in a single subplot(I always get an empty row of plots with no data on the second row).
See inline comments
Tested in python 3.8.12, pandas 1.3.3, matplotlib 3.4.3, seaborn 0.11.2
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # seaborn is a high-level api for matplotlib
# sample dataframe
data = {'corpus': ['fairview', 'i2b2', 'mipacq'], 'group': ['all', 'all', 'all'], 'mono p': [49, 46, 44], 'non p': [51, 54, 56], 'plus p': [49, 46, 43], 'minus p': [0, 0, 1]}
df = pd.DataFrame(data)
semgroups = df.group.unique() # unique groups
corpus = df.corpus.unique() # unique corpus
rows = [['mono p', 'non p'], ['plus p', 'minus p']] # columns for each row of plots
# One figure per semantic group; each figure is a grid with one row per entry
# of `rows` and one column per corpus.
# `semgroups` comes from `Series.unique()` and is a NumPy array, which has no
# `.index()` method, so the position is taken from `enumerate`.
for i, sg in enumerate(semgroups):
    ix = i + 7  # figure number shown in the title
    ncols = len(corpus)  # 3 columns for the example
    nrows = len(rows)  # 2 rows for the example

    # create a figure with 2 rows of 3 columns; squeeze=False keeps `axes`
    # two-dimensional even when there is a single row or column
    fig, axes = plt.subplots(nrows, ncols, sharey=True, figsize=(12, 10),
                             squeeze=False)

    # iterate through each plot row combined with a list from rows
    for axes_row, row in zip(axes, rows):
        # iterate through each plot column of the current row
        for corpus_name, ax in zip(corpus, axes_row):
            # select the data for each plot
            subset = df.loc[df.group.eq(sg) & df.corpus.eq(corpus_name), row]

            # plot the dataframe, but setting the bar color is more difficult
            # subset.T.plot(kind='bar', legend=False, ax=ax)

            # plot the data with seaborn, which is easier to color the bars
            sns.barplot(data=subset, ax=ax)

            if corpus_name == 'fairview':
                l2 = 'Fairview'
                l1 = '(d) '
            elif corpus_name == 'mipacq':
                l2 = 'MiPACQ'
                if ncols == 3:
                    l1 = '(f) '
                else:
                    l1 = '(b) '
            else:
                l2 = 'i2b2'
                l1 = '(e) '

            ax.set_title(l1 + l2)
            ax.tick_params(axis='x', labelrotation=45)

    # Use a separate name for the display title so the loop variable, which is
    # also the filter value above, is never overwritten.
    group_title = 'All groups' if sg == 'all' else sg

    # Defining custom 'ylim' values.
    custom_ylim = (0, 60)

    # Setting the values for all axes.
    plt.setp(axes, ylim=custom_ylim)

    fig.suptitle('Figure ' + str(ix) + ' ' + group_title)
    fig.tight_layout()

plt.show()
I am trying to create multiple box plot charts for about 5 columns in my dataframe (df_summ):
columns = ['dimension_a','dimension_b']
for i in columns:
sns.set(style = "ticks", palette = "pastel")
box_plot = sns.boxplot(y="measure", x=i,
palette=["m","g"],
data=df_summ_1500_delta)
sns.despine(offset=10, trim=True)
medians = df_summ_1500_delta.groupby([i])['measure'].median()
vertical_offset=df_summ_1500_delta['measure'].median()*-0.5
for xtick in box_plot.get_xticks():
box_plot.text(xtick,medians[xtick] + vertical_offset,medians[xtick],
horizontalalignment='center',size='small',color='blue',weight='semibold')
My only issue is that they aren't being separated into different facets, but are instead drawn on top of each other.
How can I put each one on its own separate chart, with the x-axis of the first chart being 'dimension a' and the x-axis of the second chart being 'dimension b'?
To draw two boxplots next to each other at each x-position, you can use a hue for dimension_a and dimension_b separately. These two columns need to be transformed (with pd.melt()) to "long form".
Here is some example code starting from generated test data. Note that the order of both the x-values and the hue-values needs to be enforced to be sure of their exact positions. The individual box plots are distributed over a width of 0.8.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
df = pd.DataFrame({'dimension_a': np.random.choice(['hot', 'cold'], 100),
'dimension_b': np.random.choice(['hot', 'cold'], 100),
'measure': np.random.uniform(100, 500, 100)})
df.loc[df['dimension_a'] == 'hot', 'measure'] += 100
df.loc[df['dimension_a'] == 'cold', 'measure'] -= 100
x_order = ['hot', 'cold']
columns = ['dimension_a', 'dimension_b']
df1 = df.melt(value_vars=columns, var_name='dimension', value_name='value', id_vars='measure')
sns.set(style="ticks", palette="pastel")
ax = sns.boxplot(data=df1, x='value', order=x_order, y='measure',
hue='dimension', hue_order=columns, palette=["m", "g"], dodge=True)
ax.set_xlabel('')
sns.despine(offset=10, trim=True)
# Vertical shift applied to every label; it does not depend on the loop
# variables, so compute it once.
vertical_offset = df['measure'].median() * -0.5

# One horizontal offset per hue level (i.e. per entry of `columns`): the boxes
# at each x position share a total width of 0.8, so their centres are the odd
# points of an even subdivision of [-0.4, 0.4].
dodge_offsets = np.linspace(-0.4, 0.4, 2 * len(columns) + 1)[1::2]

for col, dodge_dist in zip(columns, dodge_offsets):
    medians = df.groupby(col)['measure'].median()
    for x_ind, xtick in enumerate(x_order):
        # Write the median of this (dimension, category) box at its centre.
        ax.text(x_ind + dodge_dist, medians[xtick] + vertical_offset, f'{medians[xtick]:.2f}',
                horizontalalignment='center', size='small', color='blue', weight='semibold')
plt.show()
I am trying to plot a dynamic portfolio performance that changes as the weights of the portfolio change
Assume a portfolio has 2 components with a 50% weighting each. I want to show a chart of the portfolio with sliders representing the weights of the components. I then want to be able to slide the values of the weights around and have the portfolio chart dynamically update.
I have done this for a portfolio that consists of one weight, but can't figure out how to extend it to more than one weight - maybe I need a different approach.
Example below substitutes a random df with 1 column in place of my portfolio df - process should be the same.
In terms of this example if the df had 2 columns - how can I get it working with 2 sliders controlling each weight ?
from bqplot import DateScale, LinearScale, Axis, Figure, Lines
from ipywidgets import FloatSlider, VBox
import pandas as pd
import numpy as np
slider = FloatSlider(value=1, min = 0, max = 1, step = .01, description = 'Weight A')
df = pd.DataFrame(np.random.randint(0,100,size=(100, 1)), columns=list('A'))
x_sc = LinearScale()
y_sc = LinearScale()
ax_x = Axis(label='Date', scale=x_sc, grid_lines='solid')
ax_y = Axis(label='Price', scale=y_sc, orientation='vertical', grid_lines='solid')
line = Lines(y=df['A'],x=df.index , scales={'x': x_sc, 'y': y_sc}, colors = ['#FF0000'])
line2 = Lines(y=df['A'],x=df.index , scales={'x': x_sc, 'y': y_sc})
fig = Figure(axes=[ax_x, ax_y], marks=[line, line2], title='Price Chart')
def new_chart(value):
    """Slider callback: redraw ``line`` as column ``A`` scaled by the slider weight.

    ``value`` is whatever the observer mechanism passes to the callback; it is
    not used, because the current weight is read from ``slider.value``.
    """
    new_y = df[['A']]*slider.value
    line.y = new_y
slider.observe(new_chart,'value')
VBox([fig,slider])
Not sure if I have understood you. Do you mean this?
from bqplot import DateScale, LinearScale, Axis, Figure, Lines
from ipywidgets import FloatSlider, VBox
import pandas as pd
import numpy as np
slider = FloatSlider(value=1, min = 0, max = 1, step = .01, description = 'Weight A')
sliderB = FloatSlider(value=1, min = 0, max = 1, step = .01, description = 'Weight B')
df = pd.DataFrame(np.random.randint(0,100,size=(100, 1)), columns=list('A'))
df['B'] = np.random.randint(0,100,size=(100, 1))
x_sc = LinearScale()
y_sc = LinearScale()
ax_x = Axis(label='Date', scale=x_sc, grid_lines='solid')
ax_y = Axis(label='Price', scale=y_sc, orientation='vertical', grid_lines='solid')
line = Lines(y=df['A']+df['B'],x=df.index , scales={'x': x_sc, 'y': y_sc}, colors = ['#FF0000'])
line2 = Lines(y=df['A']+df['B'],x=df.index , scales={'x': x_sc, 'y': y_sc})
fig = Figure(axes=[ax_x, ax_y], marks=[line, line2, ], title='Price Chart')
def new_chart(change):
    """Slider callback: redraw ``line`` as the weighted sum of columns ``A`` and ``B``.

    ``change`` is whatever the observer mechanism passes to the callback; it is
    not used, because both weights are read from the sliders directly. This
    lets the same function serve as the observer for either slider.
    """
    line.y = df['A']*slider.value + df['B']*sliderB.value
slider.observe(new_chart,'value')
sliderB.observe(new_chart,'value')
VBox([fig,slider,sliderB])
I'm trying to add a bar-plot (stacked or otherwise) for each row in a seaborn clustermap.
Let's say that I have a dataframe like this:
import pandas as pd
import numpy as np
import random
df = pd.DataFrame(np.random.randint(0,100,size=(100, 8)), columns=["heatMap_1","heatMap_2","heatMap_3","heatMap_4","heatMap_5", "barPlot_1","barPlot_1","barPlot_1"])
df['index'] = [ random.randint(1,10000000) for k in df.index]
df.set_index('index', inplace=True)
df.head()
heatMap_1 heatMap_2 heatMap_3 heatMap_4 heatMap_5 barPlot_1 barPlot_1 barPlot_1
index
4552288 9 3 54 37 23 42 94 31
6915023 7 47 59 92 70 96 39 59
2988122 91 29 59 79 68 64 55 5
5060540 68 80 25 95 80 58 72 57
2901025 86 63 36 8 33 17 79 86
I can use the first 5 columns (in this example starting with prefix heatmap_) to create seaborn clustermap using this(or the seaborn equivalent):
sns.clustermap(df.iloc[:,0:5], )
and the stacked barplot for the last three columns (in this example starting with the prefix barPlot_) using this:
df.iloc[:,5:8].plot(kind='bar', stacked=True)
but I'm a bit confused about how to merge both plot types. I understand that clustermap creates its own figure, and I'm not sure if I can extract just the heatmap from clustermap and then use it with subfigures. (Discussed here: Adding seaborn clustermap to figure with other plots). This creates a weird output.
Edit:
Using this:
import pandas as pd
import numpy as np
import random
import seaborn as sns; sns.set(color_codes=True)
import matplotlib.pyplot as plt
import matplotlib.gridspec
df = pd.DataFrame(np.random.randint(0,100,size=(100, 8)), columns=["heatMap_1","heatMap_2","heatMap_3","heatMap_4","heatMap_5", "barPlot_1","barPlot_2","barPlot_3"])
df['index'] = [ random.randint(1,10000000) for k in df.index]
df.set_index('index', inplace=True)
g = sns.clustermap(df.iloc[:,0:5], )
g.gs.update(left=0.05, right=0.45)
gs2 = matplotlib.gridspec.GridSpec(1,1, left=0.6)
ax2 = g.fig.add_subplot(gs2[0])
df.iloc[:,5:8].plot(kind='barh', stacked=True, ax=ax2)
creates this:
which does not really match well (i.e. due to dendrograms there is a shift).
Another option is to manually perform the clustering, create a matplotlib heatmap, and then add associated subfigures like barplots (discussed here: How to get flat clustering corresponding to color clusters in the dendrogram created by scipy)
Is there a way I can use clustermap as a subplot along with other plots ?
This is the result I'm looking for[1]:
While not a proper answer, I decided to break it down and do everything manually.
Taking inspiration from answer here, I decided to cluster and reorder the heatmap separately:
def heatMapCluter(
    df,
    row_method="ward",
    column_method="ward",
    row_metric="euclidean",
    column_metric="euclidean",
):
    """Return *df* with its columns and rows reordered by hierarchical clustering.

    For each axis, pairwise distances are computed with ``dist.pdist``, a
    linkage is built with ``sch.linkage`` and the axis is reordered to follow
    the leaf order of the resulting dendrogram. Columns are reordered first,
    then rows.

    Parameters
    ----------
    df : DataFrame
        Numeric data to reorder.
    row_method, column_method : str or None
        Linkage method for each axis. A falsy value leaves that axis in its
        original order.
    row_metric, column_metric : str
        Distance metric handed to ``dist.pdist`` for each axis.

    Returns
    -------
    DataFrame
        The same values as *df*, with rows/columns permuted.

    Notes
    -----
    ``dist`` and ``sch`` are expected to be in scope at module level; the
    calls match ``scipy.spatial.distance`` and ``scipy.cluster.hierarchy``.
    An axis with fewer than two entries is left untouched because there is
    nothing to cluster.
    """
    if column_method and df.shape[1] > 1:
        # Pass the *condensed* distances to linkage: a square-form matrix
        # would be interpreted as raw observations, not as distances.
        col_dists = dist.pdist(df.to_numpy().T, metric=column_metric)
        col_linkage = sch.linkage(col_dists, method=column_method)
        col_order = sch.dendrogram(col_linkage, no_plot=True)["leaves"]
        df = df.iloc[:, col_order]

    if row_method and df.shape[0] > 1:
        row_dists = dist.pdist(df.to_numpy(), metric=row_metric)
        row_linkage = sch.linkage(row_dists, method=row_method)
        row_order = sch.dendrogram(
            row_linkage, orientation="right", no_plot=True
        )["leaves"]
        df = df.iloc[row_order, :]

    return df
Rearranged the original dataframe:
clusteredHeatmap = heatMapCluter(df.iloc[:, 0:5].copy())
# Extract the "barplot" rows and merge them
clusteredDataframe = df.reindex(list(clusteredHeatmap.index.values))
clusteredDataframe = clusteredDataframe.reindex(
list(clusteredHeatmap.columns.values)
+ list(df.iloc[:, 5:8].columns.values),
axis=1,
)
and then used the gridspec to plot both "subfigures" (clustermap and barplot):
# Now let's plot this - first the heatmap and then the barplot.
# Since it is a "two" part plot which shares the same axis, it is
# better to use gridspec
fig = plt.figure(figsize=(12, 12))
gs = GridSpec(3, 3)
gs.update(wspace=0.015, hspace=0.05)
ax_main = plt.subplot(gs[0:3, :2])
ax_yDist = plt.subplot(gs[0:3, 2], sharey=ax_main)
im = ax_main.imshow(
clusteredDataframe.iloc[:, 0:5],
cmap="Greens",
interpolation="nearest",
aspect="auto",
)
clusteredDataframe.iloc[:, 5:8].plot(
kind="barh", stacked=True, ax=ax_yDist, sharey=True
)
ax_yDist.spines["right"].set_color("none")
ax_yDist.spines["top"].set_color("none")
ax_yDist.spines["left"].set_visible(False)
ax_yDist.xaxis.set_ticks_position("bottom")
ax_yDist.set_xlim([0, 100])
ax_yDist.set_yticks([])
ax_yDist.xaxis.grid(False)
ax_yDist.yaxis.grid(False)
Jupyter notebook: https://gist.github.com/siddharthst/2a8b7028d18935860062ac7379b9279f
Image:
1 - http://code.activestate.com/recipes/578175-hierarchical-clustering-heatmap-python/