Seaborn swarmplot: Get point coordinates [duplicate] - python

I have the following data:
import pandas as pd
import numpy as np
# Generate dummy data.
a = np.random.random(75)
b = np.random.random(75) - 0.6
c = np.random.random(75) + 0.75
# Collate into a DataFrame
df = pd.DataFrame({'a': a, 'b': b, 'c': c})
df.columns = [list(['WT', 'MUT', 'WTxMUT']), list(['Parent', 'Parent', 'Offspring'])]
df.columns.names = ['Genotype', 'Status']
df_melt = pd.melt(df)
and I plot it in seaborn using this code:
import seaborn as sb
sb.swarmplot(data = df_melt, x = "Status", y = "value", hue = "Genotype")
How do I get the x-span of each group? What is the range of the horizontal span of the swarmplot for the Parent group, for instance?

You can get the information from the collections which are created by swarmplot.
swarmplot actually returns the matplotlib Axes instance, and from there we can find the PathCollections that it creates. To get the positions, we can use .get_offsets().
Here is your example, modified to find and print the swarm limits, and then use them to plot a box around the swarms.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from matplotlib.patches import Rectangle
# Generate dummy data.
a = np.random.random(75)
b = np.random.random(75) - 0.6
c = np.random.random(75) + 0.75
# Collate into a DataFrame
df = pd.DataFrame({'a': a, 'b': b, 'c': c})
df.columns = [list(['WT', 'MUT', 'WTxMUT']), list(['Parent', 'Parent', 'Offspring'])]
df.columns.names = ['Genotype', 'Status']
df_melt = pd.melt(df)
ax = sb.swarmplot(data = df_melt, x = "Status", y = "value", hue = "Genotype")
def getdatalim(coll):
x,y = np.array(coll.get_offsets()).T
try:
print 'xmin={}, xmax={}, ymin={}, ymax={}'.format(
x.min(), x.max(), y.min(), y.max())
rect = Rectangle((x.min(),y.min()),x.ptp(),y.ptp(),edgecolor='k',facecolor='None',lw=3)
ax.add_patch(rect)
except ValueError:
pass
getdatalim(ax.collections[0]) # "Parent"
getdatalim(ax.collections[1]) # "Offspring"
plt.show()
which prints:
xmin=-0.107313729132, xmax=0.10661092707, ymin=-0.598534246847, ymax=0.980441247759
xmin=0.942829146473, xmax=1.06105941656, ymin=0.761277608688, ymax=1.74729717464
And here's the figure:

Related

Showing values on barplot

The following code generates a pdf file that is fed by a looping barplot over a data frame. My goal is to annotate values over the bars. I have already tried various times to monitor the values but failed. May I get any help on this? Thanks
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
d = {'index': ['index1', 'index1', 'index2', 'index2'], 'group': ['gr1', 'gr1','gr2','gr2'],
targetscore':[15,15,10,10], 'exam':['old','new','old','new'], 'score':[5,6,7,8]}
df = pd.DataFrame(data = d)
pp = PdfPages('mypath/extraction.pdf')
for i in range(len(df['group'])):
subdf = df[df['group'] == df.iloc[i,1]]
sns.catplot(y = 'score', x = 'group', data = subdf, hue = 'exam', kind = 'bar',
row = 'index', col = 'exam', col_order = ['old', 'new'], height = 3, aspect = 2)
plt.show
pp.savefig(plt.gcf())
pp.close()

Bar plots of data frame columns split by criterion of another column

I would like to create bar plots of specified data frame columns split by a criterion of another column (here < 5). It works like below but certainly there is a more dataframe-like way? Something like df.makeCoolBarPlots()?
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
df = pd.DataFrame({'score':[1,6,2,3,1,9,5], 'age':[10,16,33,12,8,24,18], 'IQ':[89,120,88,94,103,110,102]})
df_pass = df[df['score'] >= 5]
df_fail = df[df['score'] < 5]
fieldsOfInterest = ['age', 'IQ']
ind = np.arange(2)
for fieldOfInterest in fieldsOfInterest:
plt.figure()
plt.bar(ind, [df_pass[fieldOfInterest].mean(), df_fail[fieldOfInterest].mean()], yerr=[df_pass[fieldOfInterest].std(), df_fail[fieldOfInterest].std()])
stat, p = stats.ttest_ind(df_pass[fieldOfInterest], df_fail[fieldOfInterest])
plt.title("p={:0.3f}".format(p))
plt.xticks(ind, ('pass', 'fail'))
plt.ylabel(fieldOfInterest)
plt.show()
You can use pandas builtin plot function alongside groupby:
# First make your conditions using np.select
df["group"] = np.select([df["score"].ge(5), df["score"].lt(5)], ["pass", "fail"])
# Create a groupby
gb = df.groupby('group')
for col in ["age", "IQ"]:
# Get p value, mean, and std for each column
_, p = stats.ttest_ind(*[g[col] for n, g in gb])
means = gb[col].mean()
errors = gb[col].std()
# Plot using pandas.plot
fig, ax = plt.subplots()
means.plot.bar(yerr=errors, ax=ax)
ax.set(ylabel=col, title="p={:.3f}".format(p))
Results in:

Replace x-ticks with evenly spaced timestamps

I am trying to insert timestamps on the x-axis for a scatter plot instead of total seconds. Below is what I have tried thus far but I'm getting an error with this line;
loc, labels = ax.set_xticks(x)
AttributeError: 'NoneType' object has no attribute 'update'
Example:
import pandas as pd
import matplotlib.pyplot as plt
d = ({
'A' : ['08:00:00','08:10:00','08:12:00','08:26:00','08:29:00','08:31:00','10:10:00','10:25:00','10:29:00','10:31:00'],
'B' : ['1','1','1','2','2','2','7','7','7','7'],
'C' : ['X','Y','Z','X','Y','Z','A','X','Y','Z'],
})
df = pd.DataFrame(data=d)
fig,ax = plt.subplots()
x = df['A']
y = df['B']
x_numbers = (pd.to_timedelta(df['A']).dt.total_seconds())
ax.scatter(x_numbers, y)
loc, labels = ax.set_xticks(x)
newlabels = [str(pd.Timedelta(str(i)+ ' seconds')).split()[2] for i in loc]
ax.set_xticks(loc, newlabels)
Note
I need to use ax instead of plt as this plot is called as a subplot. If I use plot, the axis will be assigned to the last subplot instead of the designated one.
I would suggest to use datetimes directly without messing with the ticklabels. Using a matplotlib.dates.MinuteLocator in addition can give you nice positions of the ticks.
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
d = ({
'A' : ['08:00:00','08:10:00','08:12:00','08:26:00','08:29:00','08:31:00',
'10:10:00','10:25:00','10:29:00','10:31:00'],
'B' : ['1','1','1','2','2','2','7','7','7','7'],
'C' : ['X','Y','Z','X','Y','Z','A','X','Y','Z'],
})
df = pd.DataFrame(data=d)
df['A'] = pd.to_datetime(df['A'])
fig,ax = plt.subplots()
ax.scatter(df["A"].values, df["B"].values)
ax.set_xlim(df["A"].min(), df["A"].max())
ax.xaxis.set_major_locator(mdates.MinuteLocator((0,30)))
ax.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M"))
plt.show()
I am taking a guess, but if you want to replace the x-axis labels give this a try.
import pandas as pd
import matplotlib.pyplot as plt
d = ({
'A' : ['08:00:00','08:10:00','08:12:00','08:26:00','08:29:00','08:31:00','10:10:00','10:25:00','10:29:00','10:31:00'],
'B' : ['1','1','1','2','2','2','7','7','7','7'],
'C' : ['X','Y','Z','X','Y','Z','A','X','Y','Z'],
})
df = pd.DataFrame(data=d)
x = df['A']
y = df['B']
x_numbers = (pd.to_timedelta(df['A']).dt.total_seconds())
fig,ax = plt.subplots(figsize=(10,7))
ax.scatter(x_numbers, y)
xLabel = [str(int(num)) + ' seconds' for num in x_numbers]
ax.set_xticklabels(xLabel)
plt.tight_layout()
plt.show()
Something like this will work:
Edit: Changes made to make sure axis and subplot is used
import pandas as pd
import matplotlib.pyplot as plt
d = ({
'A' : ['08:00:00','08:10:00','08:12:00','08:26:00','08:29:00','08:31:00','10:10:00','10:25:00','10:29:00','10:31:00'],
'B' : ['1','1','1','2','2','2','7','7','7','7'],
'C' : ['X','Y','Z','X','Y','Z','A','X','Y','Z'],
})
df = pd.DataFrame(data=d)
fig,ax = plt.subplots()
x = df['A']
y = df['B']
x_numbers = (pd.to_timedelta(df['A']).dt.total_seconds())
ax.scatter(x_numbers, y)
plt.sca(ax) # gets handle on the current axis
loc, labels = plt.xticks()
plt.xticks(loc, [str(a) for a in x])
plt.show()

Obtaining span of plotted points from seaborn swarmplot

I have the following data:
import pandas as pd
import numpy as np
# Generate dummy data.
a = np.random.random(75)
b = np.random.random(75) - 0.6
c = np.random.random(75) + 0.75
# Collate into a DataFrame
df = pd.DataFrame({'a': a, 'b': b, 'c': c})
df.columns = [list(['WT', 'MUT', 'WTxMUT']), list(['Parent', 'Parent', 'Offspring'])]
df.columns.names = ['Genotype', 'Status']
df_melt = pd.melt(df)
and I plot it in seaborn using this code:
import seaborn as sb
sb.swarmplot(data = df_melt, x = "Status", y = "value", hue = "Genotype")
How do I get the x-span of each group? What is the range of the horizontal span of the swarmplot for the Parent group, for instance?
You can get the information from the collections which are created by swarmplot.
swarmplot actually returns the matplotlib Axes instance, and from there we can find the PathCollections that it creates. To get the positions, we can use .get_offsets().
Here is your example, modified to find and print the swarm limits, and then use them to plot a box around the swarms.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from matplotlib.patches import Rectangle
# Generate dummy data.
a = np.random.random(75)
b = np.random.random(75) - 0.6
c = np.random.random(75) + 0.75
# Collate into a DataFrame
df = pd.DataFrame({'a': a, 'b': b, 'c': c})
df.columns = [list(['WT', 'MUT', 'WTxMUT']), list(['Parent', 'Parent', 'Offspring'])]
df.columns.names = ['Genotype', 'Status']
df_melt = pd.melt(df)
ax = sb.swarmplot(data = df_melt, x = "Status", y = "value", hue = "Genotype")
def getdatalim(coll):
x,y = np.array(coll.get_offsets()).T
try:
print 'xmin={}, xmax={}, ymin={}, ymax={}'.format(
x.min(), x.max(), y.min(), y.max())
rect = Rectangle((x.min(),y.min()),x.ptp(),y.ptp(),edgecolor='k',facecolor='None',lw=3)
ax.add_patch(rect)
except ValueError:
pass
getdatalim(ax.collections[0]) # "Parent"
getdatalim(ax.collections[1]) # "Offspring"
plt.show()
which prints:
xmin=-0.107313729132, xmax=0.10661092707, ymin=-0.598534246847, ymax=0.980441247759
xmin=0.942829146473, xmax=1.06105941656, ymin=0.761277608688, ymax=1.74729717464
And here's the figure:

Pandas plot: Assign Colors

I have many data frames that I am plotting for a presentation. These all have different columns, but all contain the same additional column foobar. At the moment, I am plotting these different data frames using
df.plot(secondary_y='foobar')
Unfortunately, since these data frames all have different additional columns with different ordering, the color of foobar is always different. This makes the presentation slides unnecessary complicated. I would like, throughout the different plots, assign that foobar is plotted bold and black.
Looking at the docs, the only thing coming close appears to be the parameter colormap - I would need to ensure that the xth color in the color map is always black, where x is the order of foobar in the data frame. Seems to be more complicated than it should be, also this wouldn't make it bold.
Is there a (better) approach?
I would suggest using matplotlib directly rather than the dataframe plotting methods. If df.plot returned the artists it added instead of an Axes object it wouldn't be too bad to change the color of the line after it was plotted.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
def pandas_plot(ax, df, callout_key):
"""
Parameters
----------
ax : mpl.Axes
The axes to draw to
df : DataFrame
Data to plot
callout_key : str
key to highlight
"""
artists = {}
x = df.index.values
for k, v in df.iteritems():
style_kwargs = {}
if k == callout_key:
style_kwargs['c'] = 'k'
style_kwargs['lw'] = 2
ln, = ax.plot(x, v.values, **style_kwargs)
artists[k] = ln
ax.legend()
ax.set_xlim(np.min(x), np.max(x))
return artists
Usage:
fig, ax = plt.subplots()
ax2 = ax.twinx()
th = np.linspace(0, 2*np.pi, 1024)
df = pd.DataFrame({'cos': np.cos(th), 'sin': np.sin(th),
'foo': np.sin(th + 1), 'bar': np.cos(th +1)}, index=th)
df2 = pd.DataFrame({'cos': -np.cos(th), 'sin': -np.sin(th)}, index=th)
pandas_plot(ax, df, 'sin')
pandas_plot(ax2, df2, 'sin')
Perhaps you could define a function which handles the special column in a separate plot call:
def emphasize_plot(ax, df, col, **emphargs):
columns = [c for c in df.columns if c != col]
df[columns].plot(ax=ax)
df[col].plot(ax=ax, **emphargs)
Using code from tcaswell's example,
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def emphasize_plot(ax, df, col, **emphargs):
columns = [c for c in df.columns if c != col]
df[columns].plot(ax=ax)
df[col].plot(ax=ax, **emphargs)
fig, ax = plt.subplots()
th = np.linspace(0, 2*np.pi, 1024)
df = pd.DataFrame({'cos': np.cos(th), 'foobar': np.sin(th),
'foo': np.sin(th + 1), 'bar': np.cos(th +1)}, index=th)
df2 = pd.DataFrame({'cos': -np.cos(th), 'foobar': -np.sin(th)}, index=th)
emphasize_plot(ax, df, 'foobar', lw=2, c='k')
emphasize_plot(ax, df2, 'foobar', lw=2, c='k')
plt.show()
yields
I used #unutbut's answer and extended it to allow for a secondary y axis and correct legends:
def emphasize_plot(ax, df, col, **emphargs):
columns = [c for c in df.columns if c != col]
ax2 = ax.twinx()
df[columns].plot(ax=ax)
df[col].plot(ax=ax2, **emphargs)
lines, labels = ax.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc=0)

Categories

Resources