two DataFrame plots - python

I have a similar plot to the one answered in the link below:
two DataFrame plot in a single plot matplotlip
I made some modification to plots for df2 columns code block because i think that is where i have to modify but i could not yield the output.
a sample of the plot i want is this
this was how i modified it:
f, axes = plt.subplots(nrows=len(signals.columns)+1, sharex=True, )
i = 0
for col in df2.columns:
fig, axs = plt.subplots()
sns.regplot(x='', y='', data=df2, ax=axs[0])
df2[col].plot(ax=axes[i], color='grey')
axes[i].set_ylabel(col)
i+=1
I have seen that its wrong.
I tried this out, it seems like a head way :)
How do I make modification on this to get what i want:
f, axes = plt.subplots(nrows=len(signals.columns)+1, sharex=True, )
# plots for df2 columns
i = 0
for col in df2.columns:
lw=1
df2[col].plot(ax=axes[i], color='grey')
axes[i].set_ylim(0, 1)
axes[i].set_ylabel(col)
sns.rugplot(df2["P1"])

You have several options to make this graph. df1 and df2 are as defined in your previous question
The version with matplotlib.pyplot.scatter is faster to draw, but less faithful to the example. The version with seaborn.rugplot looks identical to the example, but takes longer to draw. I highlighted the important part of the code between comment lines ########
using matplotlib.pyplot.scatter
import seaborn as sns
import numpy as np
f, axes = plt.subplots(nrows=len(df2.columns)+1, sharex=True,
gridspec_kw={'height_ratios':np.append(np.repeat(1, len(df2.columns)), 3)})
####### variable part below #######
# plots for df2 columns
i = 0
for col in df2.columns:
axes[i].scatter(x=df2.index, y=np.repeat(0, len(df2)), c=df2[col], marker='|', cmap='Greys')
axes[i].set_ylim(-0.5, 0.5)
axes[i].set_yticks([0])
axes[i].set_yticklabels([col])
i+=1
###################################
## code to plot annotations
axes[-1].set_xlabel('Genomic position')
axes[-1].set_ylabel('annotations')
axes[-1].set_ylim(-0.5, 1.5)
axes[-1].set_yticks([0, 1])
axes[-1].set_yticklabels(['−', '+'])
for _, r in df1.iterrows():
marker = '|'
lw=1
if r['type'] == 'exon':
marker=None
lw=8
y = 1 if r['strand'] == '+' else 0
axes[-1].plot((r['start'], r['stop']), (y, y),
marker=marker, lw=lw,
solid_capstyle='butt',
color='#505050')
# remove space between plots
plt.subplots_adjust(hspace=0)
axes[-1].set_xlim(0, len(df2))
f.set_size_inches(6, 2)
using seaborn.rugplot
import seaborn as sns
import numpy as np
f, axes = plt.subplots(nrows=len(df2.columns)+1, sharex=True,
gridspec_kw={'height_ratios':np.append(np.repeat(1, len(df2.columns)), 3)})
####### variable part below #######
import matplotlib
import matplotlib.cm as cm
norm = matplotlib.colors.Normalize(vmin=0, vmax=1, clip=True)
mapper = cm.ScalarMappable(norm=norm, cmap=cm.Greys)
# plots for df2 columns
i = 0
for col in df2.columns:
sns.rugplot(x=df2.index, color=list(map(mapper.to_rgba, df2[col])), height=1, ax=axes[i])
axes[i].set_yticks([0])
axes[i].set_yticklabels([col])
i+=1
###################################
## code to plot annotations
axes[-1].set_xlabel('Genomic position')
axes[-1].set_ylabel('annotations')
axes[-1].set_ylim(-0.5, 1.5)
axes[-1].set_yticks([0, 1])
axes[-1].set_yticklabels(['−', '+'])
for _, r in df1.iterrows():
marker = '|'
lw=1
if r['type'] == 'exon':
marker=None
lw=8
y = 1 if r['strand'] == '+' else 0
axes[-1].plot((r['start'], r['stop']), (y, y),
marker=marker, lw=lw,
solid_capstyle='butt',
color='#505050')
# remove space between plots
plt.subplots_adjust(hspace=0)
axes[-1].set_xlim(0, len(df2))
f.set_size_inches(6, 2)

Related

How should a nested loop statement to create vertical lines in a for loop statement that creates histograms work?

I am trying to use a for loop to create histograms for each fields in a dataframe. The dataframe here is labeled as 'df4'.
There are 3 fields/columns.
Then I want to create vertical lines using quantiles for each of the columns as defined in the following series: p, exp, eng.
My code below only successfully creates the vertical lines on the last field/column or histogram.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df4 = pd.read_csv("xyz.csv", index_col = "abc_id" )
# dataframe
# x coordinates for the lines
p = df4['abc'].quantile([0.25,0.5,0.75,0.9,0.95])
exp = df4['efg'].quantile([0.25,0.5,0.75,0.9,0.95])
eng = df4['xyz'].quantile([0.25,0.5,0.75,0.9,0.95])
# colors for the lines
colors = ['r','k','b','g','y']
bins = [0,100,200,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000]
fig, axs = plt.subplots(len(df4.columns), figsize=(10, 25))
for n, col in enumerate(df4.columns):
if (n==0):
for xc,c in zip(exp,colors):
plt.axvline(x=xc, label='line at x = {}'.format(xc), c=c)
if (n==1):
for xc,c in zip(eng,colors):
plt.axvline(x=xc, label='line at x = {}'.format(xc), c=c)
if (n==2):
for xc,c in zip(p,colors):
plt.axvline(x=xc, label='line at x = {}'.format(xc), c=c)
df[col].hist(ax=axs[n],bins=50)
plt.legend()
plt.show()

How to automatize imshow plots in python

I want to automatize an imshow degrading figure with python3. I would like to give a data frame and this to be plot no matter how many columns are given.
I tried this:
vmin = 3.5
vmax = 6
fig, axes = plt.subplots(len(list(df.columns)),1)
for i,j in zip(list(df.columns),range(1,len(list(df.columns))+1)):
df = df.sort_values([i], ascending = False)
y = df[i].tolist()
gradient = [y,y]
plt.imshow(gradient, aspect='auto', cmap=plt.get_cmap('hot_r'), vmin=vmin, vmax=vmax)
axes = plt.subplot(len(list(df.columns)),1,j)
sm = plt.cm.ScalarMappable(cmap=plt.get_cmap('hot_r'),norm=plt.Normalize(vmin,vmax))
sm._A = []
plt.colorbar(sm,ax=axes)
plt.show()
My problem is that the first set of data (first column of the df) is never showed. Also the map is not where I want it to be. This is exactly what I get:
But this is what I want:
You shouldn't use plt.subplot if you already have created your subplots via plt.subplots.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
f = lambda x, s: x*np.exp(-x**2/s)/2
df = pd.DataFrame({"A" : f(np.linspace(0,50,600),70)+3.5,
"B" : f(np.linspace(0,50,600),110)+3.5,
"C" : f(np.linspace(0,50,600),150)+3.5,})
vmin = 3.5
vmax = 6
fig, axes = plt.subplots(len(list(df.columns)),1)
for col, ax in zip(df.columns,axes.flat):
df = df.sort_values([col], ascending = False)
y = df[col].values
gradient = [y,y]
im = ax.imshow(gradient, aspect='auto',
cmap=plt.get_cmap('hot_r'), vmin=vmin, vmax=vmax)
# Since all images have the same vmin/vmax, we can take any of them for the colorbar
fig.colorbar(im, ax=axes)
plt.show()

How to add cross (X) on a heatmap cells like with R language?

I would like to add cross (X) on heatmap cells (depending on significance level, but the question is on adding the X).
Like in R-language (sig.level = XXX).
See the Python and R code used and the corresponding output images.
Thank you for your help.
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, center=0, vmin=-1, vmax=1, square=True, linewidths=0.5, fmt=".2f",
cbar_kws={"shrink": .65, "orientation": "horizontal", "ticks":np.arange(-1, 1+1, 0.2)},
annot = True, annot_kws={"weight": 'bold', "size":15})
corrplot(cor(subset (wqw, select =
c(fixed.acidity:quality,ratio.sulfur.dioxide))),
# compute the p matrix
p.mat = cor.mtest(subset
(wqw, select = c(fixed.acidity:quality,ratio.sulfur.dioxide))),
# significance level 0.01
sig.level = 0.01,
# Method to display : color (could be corcle, ...)
method = "color",
# color palette
col = colorRampPalette(c("#BB4444", "#EE9988",
"#FFFFFF", "#77AADD", "#4477AA"))(200),
)
```
The easy solution is to add a scatter plot with an X-shaped marker to cross out the unwanted cells.
import numpy as np; np.random.seed(42)
import matplotlib.pyplot as plt
data = np.random.rand(10,10)
mask = np.zeros_like(data)
mask[np.triu_indices_from(mask)] = True
data_masked = np.ma.array(data, mask=mask)
fig, ax = plt.subplots()
im = ax.imshow(data_masked, cmap="YlGnBu", origin="upper")
fig.colorbar(im)
ax.scatter(*np.argwhere(data_masked.T < 0.4).T, marker="x", color="black", s=100)
plt.show()
The drawback of this is that the markersize (s) is independent of the number of cells and needs to be adjusted for different figure sizes.
An alternative is hence to draw some lines (an X are two crossed lines) at the respective positions. Here we create a function crossout(points, ax=None, scale=1, **kwargs), where scale is the percentage the lines shall take from each cell.
import numpy as np; np.random.seed(42)
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
def crossout(points, ax=None, scale=1, **kwargs):
ax = ax or plt.gca()
l = np.array([[[1,1],[-1,-1]]])*scale/2.
r = np.array([[[-1,1],[1,-1]]])*scale/2.
p = np.atleast_3d(points).transpose(0,2,1)
c = LineCollection(np.concatenate((l+p,r+p), axis=0), **kwargs)
ax.add_collection(c)
return c
data = np.random.rand(10,10)
mask = np.zeros_like(data)
mask[np.triu_indices_from(mask)] = True
data_masked = np.ma.array(data, mask=mask)
fig, ax = plt.subplots()
im = ax.imshow(data_masked, cmap="YlGnBu", origin="upper")
fig.colorbar(im)
crossout(np.argwhere(data_masked.T < 0.4), ax=ax, scale=0.8, color="black")
plt.show()
For scale=0.8 this looks like
Note that for a pcolormesh plot or a seaborn heatmap (which uses pcolormesh internally), one would need to add 0.5 to the data, i.e.
np.argwhere(data_masked.T < 0.4)+0.5

Pandas combine multiple subplots with same x axis into 1 bar chart

I am looping through a list containing 6 col_names. I loop by taking 3 cols at a time so i can print 3 subplots per iteration later.
I have 2 dataframes with same column names so they look identical except for the histograms of each column name.
I want to plot similar column names of both dataframes on the same subplot. Right now, im plotting their histograms on 2 separate subplots.
currently, for col 'A','B','C' in df_plot:
and for col 'A','B','C' in df_plot2:
I only want 3 charts where i can combine similar column names into same chart so there is blue and yellow bars in the same chart.
Adding df_plot2 below doesnt work. i think im not defining my second axs properly but im not sure how to do that.
col_name_list = ['A','B','C','D','E','F']
chunk_list = [col_name_list[i:i + 3] for i in xrange(0, len(col_name_list), 3)]
for k,g in enumerate(chunk_list):
df_plot = df[g]
df_plot2 = df[g][df[g] != 0]
fig, axs = plt.subplots(1,len(g),figsize = (50,20))
axs = axs.ravel()
for j,x in enumerate(g):
df_plot[x].value_counts(normalize=True).head().plot(kind='bar',ax=axs[j], position=0, title = x, fontsize = 30)
# adding this doesnt work.
df_plot2[x].value_counts(normalize=True).head().plot(kind='bar',ax=axs[j], position=1, fontsize = 30)
axs[j].title.set_size(40)
fig.tight_layout()
the solution is to plot on the same ax:
change axs[j] to axs
for k,g in enumerate(chunk_list):
df_plot = df[g]
df_plot2 = df[g][df[g] != 0]
fig, axs = plt.subplots(1,len(g),figsize = (50,20))
axs = axs.ravel()
for j,x in enumerate(g):
df_plot[x].value_counts(normalize=True).head().plot(kind='bar',ax=axs, position=0, title = x, fontsize = 30)
# adding this doesnt work.
df_plot2[x].value_counts(normalize=True).head().plot(kind='bar',ax=axs, position=1, fontsize = 30)
axs[j].title.set_size(40)
fig.tight_layout()
then just call plt.plot()
Example this will plot x and y on the same subplot:
import matplotlib.pyplot as plt
x = np.arange(0, 10, 1)
y = np.arange(0, 20, 2)
ax = plt.subplot(1,1)
fig = plt.figure()
ax = fig.gca()
ax.plot(x)
ax.plot(y)
plt.show()
EDIT:
There is now a squeeze keyword argument. This makes sure the result is always a 2D numpy array.
fig, ax2d = subplots(2, 2, squeeze=False)
if needed Turning that into a 1D array is easy:
axli = ax1d.flatten()

Group labels in matplotlib barchart using Pandas MultiIndex

I have a pandas DataFrame with a MultiIndex:
group subgroup obs_1 obs_2
GroupA Elem1 4 0
Elem2 34 2
Elem3 0 10
GroupB Elem4 5 21
and so on. As noted in this SO question this is actually doable in matplotlib, but I'd rather (if possible) use the fact that I already know the hierarchy (thanks to the MultiIndex). Currently what's happening is that the index is shown as a tuple.
Is such a thing possible?
If you have just two levels in the MultiIndex, I believe the following will be easier:
plt.figure()
ax = plt.gca()
DF.plot(kind='bar', ax=ax)
plt.grid(True, 'both')
minor_XT = ax.get_xaxis().get_majorticklocs()
DF['XT_V'] = minor_XT
major_XT = DF.groupby(by=DF.index.get_level_values(0)).first()['XT_V'].tolist()
DF.__delitem__('XT_V')
ax.set_xticks(minor_XT, minor=True)
ax.set_xticklabels(DF.index.get_level_values(1), minor=True)
ax.tick_params(which='major', pad=15)
_ = plt.xticks(major_XT, (DF.index.get_level_values(0)).unique(), rotation=0)
And a bit of involving, but more general solution (doesn't matter how many levels you have):
def cvt_MIdx_tcklab(df):
Midx_ar = np.array(df.index.tolist())
Blank_ar = Midx_ar.copy()
col_idx = np.arange(Midx_ar.shape[0])
for i in range(Midx_ar.shape[1]):
val,idx = np.unique(Midx_ar[:, i], return_index=True)
Blank_ar[idx, i] = val
idx=~np.in1d(col_idx, idx)
Blank_ar[idx, i]=''
return map('\n'.join, np.fliplr(Blank_ar))
plt.figure()
ax = plt.gca()
DF.plot(kind='bar', ax=ax)
ax.set_xticklabels(cvt_MIdx_tcklab(DF), rotation=0)
I think that there isn't a nice and standard way of plotting multiindex dataframes. I found the following solution by #Stein to be aesthetically pleasant. I've adapted his example to your data:
import pandas as pd
import matplotlib.pyplot as plt
from itertools import groupby
import numpy as np
%matplotlib inline
group = ('Group_A', 'Group_B')
subgroup = ('elem1', 'elem2', 'elem3', 'elem4')
obs = ('obs_1', 'obs_2')
index = pd.MultiIndex.from_tuples([('Group_A','elem1'),('Group_A','elem2'),('Group_A','elem3'),('Group_B','elem4')],
names=['group', 'subgroup'])
values = np.array([[4,0],[43,2],[0,10],[5,21]])
df = pd.DataFrame(index=index)
df['obs_1'] = values[:,0]
df['obs_2'] = values[:,1]
def add_line(ax, xpos, ypos):
line = plt.Line2D([xpos, xpos], [ypos + .1, ypos],
transform=ax.transAxes, color='gray')
line.set_clip_on(False)
ax.add_line(line)
def label_len(my_index,level):
labels = my_index.get_level_values(level)
return [(k, sum(1 for i in g)) for k,g in groupby(labels)]
def label_group_bar_table(ax, df):
ypos = -.1
scale = 1./df.index.size
for level in range(df.index.nlevels)[::-1]:
pos = 0
for label, rpos in label_len(df.index,level):
lxpos = (pos + .5 * rpos)*scale
ax.text(lxpos, ypos, label, ha='center', transform=ax.transAxes)
add_line(ax, pos*scale, ypos)
pos += rpos
add_line(ax, pos*scale , ypos)
ypos -= .1
ax = df.plot(kind='bar',stacked=False)
#Below 2 lines remove default labels
ax.set_xticklabels('')
ax.set_xlabel('')
label_group_bar_table(ax, df)
Which produces:
How to create a grouped bar chart of a hierarchical dataset with 2 levels
You can create a subplot for each group and stick them together with wspace=0. The width of each subplot must be corrected according to the number of subgroups by using the width_ratios argument in the gridspec_kw dictionary so that all the columns have the same width.
Then there are limitless formatting choices to make. In the following example, I choose to draw horizontal grid lines in the background and a separation line between the groups by using the minor tick marks.
import numpy as np # v 1.19.2
import pandas as pd # v 1.1.3
import matplotlib.pyplot as plt # v 3.3.2
# Create sample DataFrame with MultiIndex
df = pd.DataFrame(dict(group = ['GroupA', 'GroupA', 'GroupA', 'GroupB'],
subgroup = ['Elem1', 'Elem2', 'Elem3', 'Elem4'],
obs_1 = [4, 34, 0, 5],
obs_2 = [0, 2, 10, 21]))
df.set_index(['group', 'subgroup'], inplace=True)
# Create figure with a subplot for each group with a relative width that
# is proportional to the number of subgroups
groups = df.index.levels[0]
nplots = groups.size
plots_width_ratios = [df.xs(group).index.size for group in groups]
fig, axes = plt.subplots(nrows=1, ncols=nplots, sharey=True, figsize=(6, 4),
gridspec_kw = dict(width_ratios=plots_width_ratios, wspace=0))
# Loop through array of axes to create grouped bar chart for each group
alpha = 0.3 # used for grid lines, bottom spine and separation lines between groups
for group, ax in zip(groups, axes):
# Create bar chart with horizontal grid lines and no spines except bottom one
df.xs(group).plot.bar(ax=ax, legend=None, zorder=2)
ax.grid(axis='y', zorder=1, color='black', alpha=alpha)
for spine in ['top', 'left', 'right']:
ax.spines[spine].set_visible(False)
ax.spines['bottom'].set_alpha(alpha)
# Set and place x labels for groups
ax.set_xlabel(group)
ax.xaxis.set_label_coords(x=0.5, y=-0.15)
# Format major tick labels for subgroups
ax.set_xticklabels(ax.get_xticklabels(), rotation=0, ha='center')
ax.tick_params(axis='both', which='major', length=0, pad=10)
# Set and format minor tick marks for separation lines between groups: note
# that except for the first subplot, only the right tick mark is drawn to avoid
# duplicate overlapping lines so that when an alpha different from 1 is chosen
# (like in this example) all the lines look the same
if ax.is_first_col():
ax.set_xticks([*ax.get_xlim()], minor=True)
else:
ax.set_xticks([ax.get_xlim()[1]], minor=True)
ax.tick_params(which='minor', length=45, width=0.8, color=[0, 0, 0, alpha])
# Add legend using the labels and handles from the last subplot
fig.legend(*ax.get_legend_handles_labels(), frameon=False,
bbox_to_anchor=(0.92, 0.5), loc="center left")
title = 'Grouped bar chart of a hierarchical dataset with 2 levels'
fig.suptitle(title, y=1.01, size=14);
Reference: this answer by gyx-hh

Categories

Resources