Pandas: multiple bar plot from aggregated columns - python

In Python pandas I have created a DataFrame with one value for each year and two subclasses - i.e., one metric for each (year, Tag_1, Tag_2) parameter triplet
import pandas, requests, numpy
import matplotlib.pyplot as plt
df
Metric Tag_1 Tag_2 year
0 5770832 FOOBAR1 name1 2008
1 7526436 FOOBAR1 xyz 2008
2 33972652 FOOBAR1 name1 2009
3 17491416 FOOBAR1 xyz 2009
...
16 6602920 baznar2 name1 2008
17 6608 baznar2 xyz 2008
...
30 142102944 baznar2 name1 2015
31 0 baznar2 xyz 2015
I would like to produce a bar plot with metrics as y-values over x=(year,Tag_1,Tag_2) and sorting primarily for years and secondly for tag_1 and color the bars depending on tag_1. Something like
(2008,FOOBAR,name1) --> 5770832 *RED*
(2008,baznar2,name1) --> 6602920 *BLUE*
(2008,FOOBAR,xyz) --> 7526436 *RED*
(2008,baznar2,xyz) --> ... *BLUE*
(2008,FOOBAR,name1) --> ... *RED*
I tried starting with a grouping of columns like
df.plot.bar(x=['year','tag_1','tag_2']
but have not found a way to separate selections into two bar sets next to each other.

This should get you on your way:
import pandas as pd

df = pd.read_csv('path_to_file.csv')
# Group by the desired columns; the result is indexed by (year, Tag_1, Tag_2)
new_df = df.groupby(['year', 'Tag_1', 'Tag_2']).sum()
# Sort ascending by Metric (sort_values defaults to ascending order).
# DataFrame.sort() is gone from current pandas; sort_values() replaces it.
new_df.sort_values('Metric', inplace=True)
# Helper function that generates an alternating sequence of 'r' and 'b' colors
def get_color(i):
    """Return 'r' for an even position ``i`` and 'b' for an odd one."""
    return 'b' if i % 2 else 'r'
import numpy as np

# One colour per bar, alternating red/blue by row position.
# NOTE(review): this alternates by position in the sorted frame, not by the
# row's Tag_1 value - confirm that is the colouring you want.
colors = [get_color(j) for j in range(new_df.shape[0])]
# Make the plot
fig, ax = plt.subplots()
ind = np.arange(new_df.shape[0])  # one y position per row
width = 0.65  # bar thickness along the y axis
a = ax.barh(ind, new_df.Metric, width, color=colors)  # plot a vals
# barh centres each bar on its y position by default, so the ticks belong
# at `ind` itself; `ind + width` would push every label off its bar.
ax.set_yticks(ind)  # position axis ticks
ax.set_yticklabels(new_df.index.values)  # set them to the names
fig.tight_layout()
plt.show()

you can also do it this way:
# Aggregate first, then let pandas draw the horizontal bars on our own axes
totals = df.groupby(['year', 'Tag_1', 'Tag_2']).sum()
fig, ax = plt.subplots()
totals.plot.barh(color=['r', 'b'], ax=ax)
fig.tight_layout()
plt.show()
PS: if you don't like scientific notation, you can get rid of it:
# Turn off scientific (power-of-ten) notation for the x-axis tick labels.
# NOTE(review): assumes the major formatter is a ScalarFormatter, which is
# the formatter that provides set_scientific - confirm for non-linear axes.
ax.get_xaxis().get_major_formatter().set_scientific(False)

Related

How do I plot stacked barplots side by side in python? (preferentially seaborn)

I'm looking for a way to plot side-by-side stacked bar plots to compare the host composition of positive (Condition==True) and total cases in each country from my DataFrame.
Here is a sample of the DataFrame.
id Location Host genus_name #ofGenes Condition
1 Netherlands Homo sapiens Escherichia 4.0 True
2 Missing Missing Klebsiella 3.0 True
3 Missing Missing Aeromonas 2.0 True
4 Missing Missing Glaciecola 2.0 True
5 Antarctica Missing Alteromonas 2.0 True
6 Indian Ocean Missing Epibacterium 2.0 True
7 Missing Missing Klebsiella 2.0 True
8 China Homo sapiens Escherichia 0 False
9 Missing Missing Escherichia 2.0 True
10 China Plantae kingdom Pantoea 0 False
11 China Missing Escherichia 2.0 True
12 Pacific Ocean Missing Halomonas 0 False
I need something similar to the image below, but I want to plot percentages.
Can anyone help me?
I guess what you want is a stacked categorical bar plot, which cannot be directly plotted using seaborn. But you can achieve it by customizing one.
Import some necessary packages.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
Read the dataset. Since your sample data is too small, I randomly generate some more rows to make the plot look good.
def gen_fake_data(data, size=400):
    """Pad ``data`` with ``size`` randomly generated rows.

    Every synthetic row draws each column independently (with replacement)
    from the distinct values that column takes in ``data``. The synthetic
    rows are appended after the original ones and the ``id`` column is
    renumbered so that it runs 1..len(result) without repeats.

    Parameters
    ----------
    data : pandas.DataFrame
        Seed frame whose column values are resampled.
    size : int, default 400
        Number of synthetic rows to append.

    Returns
    -------
    pandas.DataFrame
        ``data`` followed by the synthetic rows, with a fresh ``id`` column.
    """
    fake_rows = pd.DataFrame({
        col: np.random.choice(data[col].unique(), size=size)
        for col in data.columns
    })
    # ignore_index=True gives the combined frame a fresh 0..n-1 index.
    # Without it both halves keep their own 0-based index, and the ids
    # derived from the index below would contain duplicates.
    combined = pd.concat([data, fake_rows], ignore_index=True)
    combined['id'] = combined.index + 1
    return combined
# Load the original sample, then pad it with randomly generated rows
data = pd.read_csv('data.csv')
new_data = gen_fake_data(data)
Define the stacked categorical bar plot
def stack_catplot(x, y, cat, stack, data, palette=sns.color_palette('Reds')):
    """Draw grouped, stacked bars on the current matplotlib axes.

    For every value of ``x`` one bar per ``cat`` value is drawn side by
    side; each bar is split into one segment per ``stack`` value, whose
    height is the sum of ``y`` for that (cat, x, stack) combination.

    Parameters
    ----------
    x : str
        Column whose values become the x-axis positions.
    y : str
        Column that is summed to give the segment heights.
    cat : str
        Column whose values produce the side-by-side bars.
    stack : str
        Column whose values produce the stacked segments.
    data : pandas.DataFrame
        Long-format input data.
    palette : sequence of colors
        Indexed by integer position; it is cycled if there are more
        cat/stack combinations than colors.

    Returns
    -------
    None
        The function adds a legend and a grid and calls ``plt.show()``.
    """
    ax = plt.gca()
    # pivot the data based on categories and stacks
    df = data.pivot_table(values=y, index=[cat, x], columns=stack,
                          dropna=False, aggfunc='sum').fillna(0)
    # Use one consistent, NaN-free list per dimension for bars, tick labels
    # and legend, so the counts and the iteration order cannot disagree.
    cats = data[cat].dropna().unique()
    stacks = data[stack].dropna().unique()
    x_values = data[x].dropna().unique()
    ncat = len(cats)
    nstack = len(stacks)
    ncolors = len(palette)
    range_x = np.arange(len(x_values))
    width = 0.8 / ncat  # width of each bar
    for i, c in enumerate(cats):
        # iterate over categories, i.e., Conditions
        # calculate the location of each bar
        loc_x = (0.5 + i - ncat / 2) * width + range_x
        bottom = np.zeros(len(x_values))
        for j, s in enumerate(stacks):
            # iterate over stacks, i.e., Hosts
            # The pivot's index comes back sorted, while the tick labels
            # follow order of appearance, so align the heights on x_values
            # explicitly instead of trusting positional order.
            height = df.loc[c][s].reindex(x_values, fill_value=0).values
            # plot the bar, you can customize the color yourself;
            # the modulo keeps short palettes from raising IndexError
            ax.bar(x=loc_x, height=height, bottom=bottom, width=width,
                   color=palette[(j + i * nstack) % ncolors], zorder=10)
            # raise the baseline so the next segment stacks on top
            bottom = bottom + height
    # make xlabel
    ax.set_xticks(range_x)
    ax.set_xticklabels(x_values, rotation=45)
    ax.set_ylabel(y)
    # make legend
    handles = [Patch(facecolor=palette[k % ncolors])
               for k in range(ncat * nstack)]
    labels = [f"{c}: {s}" for c in cats for s in stacks]
    plt.legend(handles, labels,
               bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    plt.grid()
    plt.show()
Let's plot!
# Open a small, high-resolution figure; stack_catplot draws on the current axes
plt.figure(figsize=(6, 3), dpi=300)
stack_catplot(x='Location', y='#ofGenes', cat='Condition', stack='Host', data=new_data)
If you want to plot percentages instead, calculate them in the raw dataset first.
# Total gene count for every (Location, Condition) pair
total_genes = (
    new_data
    .groupby(['Location', 'Condition'], as_index=False)['#ofGenes']
    .sum()
    .rename(columns={'#ofGenes': 'TotalGenes'})
)
# Attach each row's group total, then express the row as a share of it
new_data = new_data.merge(total_genes, how='left')
new_data['%ofGenes'] = new_data['#ofGenes'].div(new_data['TotalGenes']).mul(100)
plt.figure(figsize=(6, 3), dpi=300)
stack_catplot(x='Location', y='%ofGenes', cat='Condition', stack='Host', data=new_data)
You didn't specify how you would like to stack the bars, but you should be able to do something like this...
# Load the raw table
df = pd.read_csv('data.csv')
# Count the Condition entries for every (Location, Host) pair:
# Locations become rows, Hosts become columns
agg_df = df.pivot_table(index='Location', columns='Host', values='Condition', aggfunc='count')
# One bar per Location, with the Host columns stacked on top of each other
agg_df.plot(kind='bar', stacked=True)

Seaborn: Hue dependent on two values

I have following two dataframes that I would like to plot together. The first one (data) contains the complete data of different groups for several repeated experiments (=replicates) with the values for the individual cells within that experiment. The second one (avgs) summarizes the mean of each replicate experiment for all groups. I basically want to plot my data in the way suggested here.
data.head()
cell replicate value group
0 1 1 0.029723 GROUP_A
1 1 2 0.019136 GROUP_A
2 2 2 0.020216 GROUP_A
3 3 1 0.032020 GROUP_B
4 3 2 0.044815 GROUP_B
avgs.head()
replicate value group
0 1 0.019709 GROUP_A
1 2 0.018937 GROUP_A
2 1 0.358437 GROUP_B
3 2 0.269602 GROUP_B
4 3 0.303252 GROUP_B
My aim is to achieve either the plots shown in B or C, where the hue depends on both the group and replicate.
import matplotlib.pyplot as plt
import seaborn as sns
sns.swarmplot(x="group", y="value", data=data, hue="replicate")
sns.swarmplot(x="group", y="value", data=avgs,size=8,hue="replicate", edgecolor="k", linewidth=2)
will give me basically the plot shown in A, with the hue corresponding to the replicate.
Is there a way to do this either with a different color palette for each group, so that each group has a different color and each replicate a different shade of that color (example B, made in Affinity Designer)?
An alternative that would work for me is to plot the single cell values of data with a grey palette. However how can I achieve that when I add the replicate mean data of avgs, each group has a different color and each replicate mean has the corresponding shading in that color (example C)?
Is there the possibility to pass a palette dictionary to seaborn/matplotlib e.g. something like:
gray = sns.dark_palette("gray", n_colors=5)
red = sns.dark_palette("red", n_colors=5)
blue = sns.dark_palette("blue", n_colors=5)
my_palette={"GROUP_A": gray, "GROUP_B": red, "GROUP_C": blue}
Thanks!
The groups can be plotted separately, each with its own palette. To make sure the x-positions are respected, the order= keyword needs to be set with all the desired x-labels.
Seaborn automatically adds legend entries for each call, so the legend can get very large. You can either suppress the legend, or limit it to the first few entries.
from matplotlib import pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
N = 500
# Synthetic data: a replicate id (1-3), a value and a group label per row
data = pd.DataFrame({
    'replicate': np.random.choice(range(1, 4), N),
    'value': 2 + np.random.uniform(-0.5, 0.5, (N, 5)).sum(axis=1),
    'group': np.random.choice([f'GROUP_{g}' for g in 'ABCD'], N),
})
groups = np.unique(data['group'])
# Shift every group by its own random offset so the groups differ visibly
for g in groups:
    data.loc[data['group'] == g, 'value'] += np.random.uniform(0, 3)
# Mean per (replicate, group), flattened back into ordinary columns
avgs = data.groupby(['replicate', 'group']).mean().reset_index()
my_palette = {"GROUP_A": 'Greys', "GROUP_B": 'Reds', "GROUP_C": 'Blues', "GROUP_D": 'Greens'}
for g in groups:
    # Arguments shared by both layers; order=groups keeps every group at
    # its own x position even though only one group is drawn per call.
    shared = dict(x="group", y="value", order=groups,
                  palette=my_palette[g], hue="replicate")
    # individual observations
    sns.swarmplot(data=data[data['group'] == g], **shared)
    # replicate means: larger markers with a black outline
    sns.swarmplot(data=avgs[avgs['group'] == g], size=8,
                  edgecolor="k", linewidth=2, **shared)
# plt.gca().legend_.remove() # optionally suppress the legend
current_ax = plt.gca()
handles, labels = current_ax.get_legend_handles_labels()
# Keep only the first three entries (one per replicate)
plt.legend(handles=handles[:3], title='replicate')
plt.tight_layout()
plt.show()

Labeling a Bar Graph Created from a Grouped Pandas DataFrame where there's a NaN Category

I create a nice and tidy grouped data frame and then I use that data in a simple seaborn barplot. However, when I try to add labels to the bars, I get the following error:
ValueError: cannot convert float NaN to integer
I know this is because there is only one value (instead of two) for one of the grouped categories. How do I get it to label it "0"?
I've gone down the rabbit hole on this for a full day and haven't found anything. Here are the things that I've tried (in many different ways):
Inserting a row into the grouped dataframe.
Using pd.fillna().
Creating a function to apply within the labeling clause.
I work with a lot of data that frequently encounters this sort of problem, so I would really appreciate some help in solving this. It seems so simple. What am I missing? Thanks!
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# my initial data set
d = {'year' : [2014,2014,2014,2015,2015,],
'status' : ["n","y","n","n","n"],
'num' : [1,1,1,1,1]}
df = pd.DataFrame(d)
# groupby to create another dataframe
df2 = (df["status"]
.groupby(df["year"])
.value_counts(normalize=True)
.rename("Percent")
.apply(lambda x: x*100)
.reset_index())
# create my bar plot
f = plt.figure(figsize = (11,8.5))
ax1 = plt.subplot(2,2,1)
sns.barplot(x="year",
y="Percent",
hue="status",
hue_order = ["n","y"],
data=df2,
ci = None)
# label the bars
for p in ax1.patches:
ax1.text(p.get_x() + p.get_width()/2., p.get_height(), '%d%%' % round(p.get_height()),
fontsize=10, color='red', ha='center', va='bottom')
plt.show()
You could handle the empty-bar case by setting the height to zero if p.get_height() returns NaN:
for bar in ax1.patches:
    raw_height = bar.get_height()
    # A bar whose height is NaN is labelled and positioned as if it were 0
    height = 0 if np.isnan(raw_height) else raw_height
    x_center = bar.get_x() + bar.get_width() / 2.
    ax1.text(x_center, height, '%d%%' % round(height),
             fontsize=10, color='red', ha='center', va='bottom')
gives me
Alternatively, you could expand your frame to ensure there's a zero there:
# Every column except the measured value acts as a key column
non_data_cols = df2.columns.drop("Percent")
# Cartesian product of the observed key values: one entry per possible combination
full_index = pd.MultiIndex.from_product([df[col].unique() for col in non_data_cols], names=non_data_cols)
# Re-align df2 onto the full grid; combinations it lacked come back as NaN
# and are filled with 0 before the keys are turned back into columns
df2 = df2.set_index(non_data_cols.tolist()).reindex(full_index).fillna(0).reset_index()
which expands to give me
In [74]: df2
Out[74]:
year status Percent
0 2014 n 66.666667
1 2014 y 33.333333
2 2015 n 100.000000
3 2015 y 0.000000
When dealing with data where you have missing categories, a common trick that can be employed is stacking and unstacking the data. The general idea can be viewed in this answer. Once the data is formatted, you are able to fillna with your fill value (in this case 0), and leave your code as is.
All you have to do is replace your current creation of df2 with the below code.
# Share of each status within a year, scaled to percent. unstack() spreads
# status across columns, so a year/status pair that never occurred shows up
# as NaN; stack(dropna=False) folds the columns back into rows while keeping
# those NaN entries, and fillna(0) turns them into explicit zeros.
# NOTE(review): recent pandas releases appear to deprecate the `dropna`
# argument of stack() - verify against the pandas version in use.
df2 = (df.groupby('year').status.value_counts(normalize=True).mul(100)
.unstack().stack(dropna=False).fillna(0)
.rename('Percent').reset_index())
Which gives us:
year status Percent
0 2014 n 66.666667
1 2014 y 33.333333
2 2015 n 100.000000
3 2015 y 0.000000
Now, with no changes to your plotting code, I get this output:

Scatter plot in a loop only plots the first iteration

I am trying to plot several different things in scatter plots by having several subplots and iterating over the remaining categories, but the plots only display the first iteration without throwing any error. To clarify, here is an example of what the data actually look like:
a kind state property T
0 0.905618 I dry prop1 10
1 0.050311 I wet prop1 20
2 0.933696 II dry prop1 30
3 0.114824 III wet prop1 40
4 0.942719 IV dry prop1 50
5 0.276627 II wet prop2 10
6 0.612303 III dry prop2 20
7 0.803451 IV wet prop2 30
8 0.257816 II dry prop2 40
9 0.122468 IV wet prop2 50
And this is how I generated the example:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
kinds = ['I','II','III','IV']
states = ['dry','wet']
props = ['prop1','prop2']
T = [10,20,30,40,50]
a = np.random.rand(10)
k = ['I','I','II','III','IV','II','III','IV','II','IV']
s = ['dry','wet','dry','wet','dry','wet','dry','wet','dry','wet']
p = ['prop1','prop1','prop1','prop1','prop1','prop2','prop2','prop2','prop2','prop2']
t = [10,20,30,40,50,10,20,30,40,50]
df = pd.DataFrame(index=range(10),columns=['a','kind','state','property','T'])
df['a']=a
df['kind']=k
df['state']=s
df['property']=p
df['T']=t
print df
Next, I am going to generate 2 rows and 2 columns of subplots, to display variabilities in property1 and property2 in wet and dry states. So I basically slice my dataframe into several smaller ones like this:
first = df[(df['state']=='dry')&(df['property']=='prop1')]
second = df[(df['state']=='wet')&(df['property']=='prop1')]
third = df[(df['state']=='dry')&(df['property']=='prop2')]
fourth = df[(df['state']=='wet')&(df['property']=='prop2')]
dfs = [first,second,third,fourth]
in each of these subplots, which specify certain laboratory conditions, I want to plot the values of a versus T for different kinds of samples. To distinguish between the kinds of samples, I assign different colours and markers to them. So here is my plotting script:
fig = plt.figure(figsize=(8,8.5))
gs = gridspec.GridSpec(2,2, hspace=0.4, wspace=0.3)
colours = ['r','b','g','gold']
symbols = ['v','v','^','^']
titles=['dry 1','wet 1','dry 2','wet 2']
for no, df in enumerate(dfs):
ax = fig.add_subplot(gs[no])
for i, r in enumerate(kinds):
#print i, r
df = df[df['kind']==r]
c = colours[i]
m = symbols[i]
plt.scatter(df['T'],df['a'],c=c,s=50.0, marker=m, edgecolor='k')
ax = plt.xlabel('T')
ax = plt.xticks(T)
ax = plt.ylabel('A')
ax = plt.title(titles[no],fontsize=12,alpha=0.75)
plt.show()
But the result only plots the first iteration, in this case kind I in red triangles. If I remove this first item from the iterating lists, it only plots the first variable (kind II in blue triangles).
What am I doing wrong?
The figure looks like this, but I would like to have each subplot accordingly populated with red and blue and green and gold markers.
(Please note this happens with my real data as well, so the problem should not be in the way I generate the example.)
Your problem lies within the inner for loop. By writing df = df[df['kind']==r], you replace the original df with the version filtered for I. Then, in the next iteration of the loop, where you would filter for II, no further data is found. Therefore you also get no error message, as the code is otherwise 'correct'. If you rewrite the relevant piece of code like this:
for no, panel_df in enumerate(dfs):
    # One subplot per (state, property) slice. The slice gets its own name so
    # the full `df` is not overwritten by the loop variable.
    ax = fig.add_subplot(gs[no])
    for i, r in enumerate(kinds):
        # Filter into a fresh variable: reassigning the slice itself would
        # leave nothing to select from on the next kind.
        kind_df = panel_df[panel_df['kind'] == r]
        ax.scatter(kind_df['T'], kind_df['a'], c=colours[i], s=50.0,
                   marker=symbols[i], edgecolor='k')
    # Decorate through the Axes object instead of rebinding `ax` to the
    # return values of the pyplot helpers.
    ax.set_xlabel('T')
    ax.set_xticks(T)
    ax.set_ylabel('A')
    ax.set_title(titles[no], fontsize=12, alpha=0.75)
It should work just fine. Tested on Python 3.5.

How to make multiline graph with matplotlib subplots and pandas?

I'm fairly new at coding (completely self taught), and have started using it at my job as a research assistant in a cancer lab. I need some help setting up a few line graphs in matplotlib.
I have a dataset that includes nextgen sequencing data for about 80 patients. on each patient, we have different timepoints of analysis, different genes detected (out of 40), and the associated %mutation for the gene.
My goal is to write two scripts, one that will generate a "by patient" plot, that will be a linegraph with y-%mutation, x-time of measurement, and will have a different color line for all lines made by each of the patient's associated genes. The second plot will be a "by gene", where I will have one plot contain different color lines that represent each of the different patient's x/y values for that specific gene.
Here is an example dataframe for 1 genenumber for the above script:
gene yaxis xaxis pt# gene#
ASXL1-3 34 1 3 1
ASXL1-3 0 98 3 1
IDH1-3 24 1 3 11
IDH1-3 0 98 3 11
RUNX1-3 38 1 3 21
RUNX1-3 0 98 3 21
U2AF1-3 33 1 3 26
U2AF1-3 0 98 3 26
I have setup a groupby script that when I iterate over it, gives me a dataframe for every gene-timepoint for each patient.
grouped = df.groupby('pt #')
for groupObject in grouped:
group = groupObject[1]
For patient 1, this gives the following output:
y x gene patientnumber patientgene genenumber dxtotransplant \
0 40.0 1712 ASXL1 1 ASXL1-1 1 1857
1 26.0 1835 ASXL1 1 ASXL1-1 1 1857
302 7.0 1835 RUNX1 1 RUNX1-1 21 1857
I need help writing a script that will create either of the plots described above. using the bypatient example, my general idea is that I need to create a different subplot for every gene a patient has, where each subplot is the line graph represented by that one gene.
Using matplotlib this is about as far as I have gotten:
plt.figure()
grouped = df.groupby('patient number')
for groupObject in grouped:
group = groupObject[1]
df = group #may need to remove this
for element in range(len(group)):
xs = np.array(df[df.columns[1]]) #"x" column
ys= np.array(df[df.columns[0]]) #"y" column
gene = np.array(df[df.columns[2]])[element] #"gene" column
plt.subplot(1,1,1)
plt.scatter(xs,ys, label=gene)
plt.plot(xs,ys, label=gene)
plt.legend()
plt.show()
This produces the following output:
In this output, the circled line is not supposed to be connected to the other 2 points. In this case, this is patient 1, who has the following datapoint:
x y gene
1712 40 ASXL1
1835 26 ASXL1
1835 7 RUNX1
Using seaborn I have gotten close to my desired graph using this code:
grouped = df.groupby(['patientnumber'])
for groupObject in grouped:
group = groupObject[1]
g = sns.FacetGrid(group, col="patientgene", col_wrap=4, size=4, ylim=(0,100))
g = g.map(plt.scatter, "x", "y", alpha=0.5)
g = g.map(plt.plot, "x", "y", alpha=0.5)
plt.title= "gene:%s"%element
Using this code I get the following:
If I adjust the line:
g = sns.FacetGrid(group, col="patientnumber", col_wrap=4, size=4, ylim=(0,100))
I get the following result:
As you can see in the 2nd example, the plot is treating every point on my plot as if they are from the same line (but they are actually 4 separate lines).
How I can tweak my iterations so that each patient-gene is treated as a separate line on the same graph?
I wrote a subplot function that may give you a hand. I modified the data a tad to help illustrate the plotting functionality.
gene,yaxis,xaxis,pt #,gene #
ASXL1-3,34,1,3,1
ASXL1-3,3,98,3,1
IDH1-3,24,1,3,11
IDH1-3,7,98,3,11
RUNX1-3,38,1,3,21
RUNX1-3,2,98,3,21
U2AF1-3,33,1,3,26
U2AF1-3,0,98,3,26
ASXL1-3,39,1,4,1
ASXL1-3,8,62,4,1
ASXL1-3,0,119,4,1
IDH1-3,27,1,4,11
IDH1-3,12,62,4,11
IDH1-3,1,119,4,11
RUNX1-3,42,1,4,21
RUNX1-3,3,62,4,21
RUNX1-3,1,119,4,21
U2AF1-3,16,1,4,26
U2AF1-3,1,62,4,26
U2AF1-3,0,119,4,26
This is the subplotting function...with some extra bells and whistles :)
def plotByGroup(df, group, xCol, yCol, title = "", xLabel = "", yLabel = "", lineColors = ["red", "orange", "yellow", "green", "blue", "purple"], lineWidth = 2, lineOpacity = 0.7, plotStyle = 'ggplot', showLegend = False):
    """
    Plot one line per group of a Pandas Data Frame on a single new figure,
    using DataFrame.groupby() and MatPlotLib PyPlot.

    #params
    df - Required - Data Frame - Pandas Data Frame
    group - Required - String - Column name to group on
    xCol - Required - String - Column name for X axis data
    yCol - Required - String - Column name for y axis data
    title - Optional - String - Plot Title
    xLabel - Optional - String - X axis label
    yLabel - Optional - String - Y axis label
    lineColors - Optional - List - Colors to plot multiple lines (cycled when
        there are more groups than colors; the list is only read, never modified)
    lineWidth - Optional - Integer - Width of lines to plot
    lineOpacity - Optional - Float - Alpha of lines to plot
    plotStyle - Optional - String - MatPlotLib plot style
    showLegend - Optional - Boolean - Show legend

    #return
    The matplotlib.pyplot module, with the new figure current, so callers can
    chain pyplot calls such as axvline(), savefig() or show().
    """
    # Import MatPlotLib Plotting Function & Set Style
    from matplotlib import pyplot as plt
    # Only pyplot is imported here, so reach the style API through it;
    # the bare name `matplotlib` is not bound in this function.
    plt.style.use(plotStyle)

    figure = plt.figure()  # Initialize Figure
    # Create the axes once up front and draw on it directly, so the lines,
    # legend, title and labels are guaranteed to share one Axes.
    axis = figure.add_subplot(1, 1, 1)

    for i, (idx, grp) in enumerate(df.groupby(group)):
        lineColor = lineColors[i % len(lineColors)]  # cycle through the colors
        lineLabel = grp[group].values[0]  # Get a group label from first position
        axis.plot(grp[xCol], grp[yCol], label = lineLabel, color = lineColor, lw = lineWidth, alpha = lineOpacity)

    # Plot legend once, after every line has been added
    if showLegend and axis.lines:
        axis.legend()

    # Set title & Labels
    axis.set_title(title)
    axis.set_xlabel(xLabel)
    axis.set_ylabel(yLabel)

    # Return plot for saving, showing, etc.
    return plt
And to use it...
import pandas
# Load the Data into Pandas
df = pandas.read_csv('data.csv')
#
# Plotting - by Patient
#
# Create Patient Grouping
patientGroup = df.groupby('pt #')
# Iterate Over Groups (the loop variable is spelled exactly as the body uses it)
for idx, patientDf in patientGroup:
    # Let's give them specific titles
    plotTitle = "Gene Frequency over Time by Gene (Patient %s)" % str(patientDf['pt #'].values[0])
    # Call the subplot function
    plot = plotByGroup(patientDf, 'gene', 'xaxis', 'yaxis', title = plotTitle, xLabel = "Days", yLabel = "Gene Frequency")
    # Add Vertical Lines at Assay Timepoints
    timepoints = set(patientDf.xaxis.values)
    for timepoint in timepoints:
        plot.axvline(x = timepoint, linewidth = 1, linestyle = "dashed", color='gray', alpha = 0.4)
    # Let's see it
    plot.show()
And of course, we can do the same by gene.
#
# Plotting - by Gene
#
# Create Gene Grouping
geneGroup = df.groupby('gene')
# Generate Plots for Groups (the loop variable is spelled exactly as the body uses it)
for idx, geneDf in geneGroup:
    plotTitle = "%s Gene Frequency over Time by Patient" % str(geneDf['gene'].values[0])
    # plotByGroup's keyword names are xLabel / yLabel
    plot = plotByGroup(geneDf, 'pt #', 'xaxis', 'yaxis', title = plotTitle, xLabel = "Days", yLabel = "Frequency")
    plot.show()
If this isn't what you're looking for, provide a clarification and I'll take another crack at it.

Categories

Resources