I am trying to make a profile plot for two columns of a pandas.DataFrame. I would not expect this to be in pandas directly but it seems there is nothing in matplotlib either. I have searched around and cannot find it in any package other than rootpy. Before I take the time to write this myself I thought I would ask if there was a small package that contained profile histograms, perhaps where they are known by a different name.
If you don't know what I mean by "profile histogram" have a look at the ROOT implementation. http://root.cern.ch/root/html/TProfile.html
You can easily do it using scipy.stats.binned_statistic.
import scipy.stats
import numpy
import matplotlib.pyplot as plt
# Toy data: y follows x with Gaussian scatter (sigma = 0.2).
n_samples = 10000
x = numpy.random.rand(n_samples)
y = x + scipy.stats.norm(0, 0.2).rvs(n_samples)

# A single call yields the per-bin means of both y and y**2.
binned = scipy.stats.binned_statistic(
    x, [y, y**2], statistic='mean', bins=50, range=(0, 1)
)
mean_y, mean_y_squared = binned.statistic

# Per-bin spread from Var[y] = E[y**2] - E[y]**2.
spread = numpy.sqrt(mean_y_squared - mean_y**2)

edges = binned.bin_edges
centers = (edges[:-1] + edges[1:]) / 2.

# Markers only (no connecting line), with the spread as the y error bar.
plt.errorbar(x=centers, y=mean_y, yerr=spread, linestyle='none', marker='.')
Use seaborn. The data are the same as in @MaxNoe's answer.
import numpy as np
import seaborn as sns
# Synthetic sample: y is drawn from a normal distribution centred on x**2
# whose width grows with |x|.
x = np.random.uniform(low=-2, high=2, size=10000)
y = np.random.normal(loc=x**2, scale=np.abs(x) + 1)

# Bin x into 10 bins and draw the binned estimate of y; the falsy
# `fit_reg` suppresses the regression line.
sns.regplot(x=x, y=y, x_bins=10, fit_reg=None)
You can do much more (error bands are from bootstrap, you can change the estimator on the y-axis, add regression, ...)
While @Keith's answer seems to fit what you mean, it is quite a lot of code. I think this can be done much more simply, so that one gets the key concepts and can adjust and build on top of it.
Let me stress one thing: what ROOT is calling a ProfileHistogram is not a special kind of plot. It is an errorbar plot. Which can simply be done in matplotlib.
It is a special kind of computation, and that's not the task of a plotting library. This lies in the pandas realm, and pandas is great at stuff like this. It's symptomatic of ROOT, as the giant monolithic pile it is, to have an extra class for this.
So what you want to do is: discretize in some variable x and for each bin, calculate something in another variable y.
This can easily be done using np.digitize together with the pandas groupby and aggregate methods.
Putting it all together:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Just some random numbers to get started.
x = np.random.uniform(-2, 2, 10000)
y = np.random.normal(x**2, np.abs(x) + 1)
df = pd.DataFrame({'x': x, 'y': y})

# Work out which bin each row belongs to, based on `x`.
# np.digitize needs the bin edges: 101 edges give 100 equally sized bins,
# numbered 1..100 (0 and 101 mark values outside the edges).
bins = np.linspace(-2, 2, 101)
df['bin'] = np.digitize(x, bins=bins)
bin_centers = 0.5 * (bins[:-1] + bins[1:])
bin_width = bins[1] - bins[0]

# Group by bin so that per-bin statistics can be calculated.
binned = df.groupby('bin')

# Mean and standard error of the mean of y in each bin.
# Reindexing to the full 1..100 range keeps one row per bin even when a bin
# is empty (it becomes NaN) and drops the out-of-range bins 0 and 101, so
# the result always lines up with `bin_centers`.
result = binned['y'].agg(['mean', 'sem']).reindex(range(1, len(bins)))
result['x'] = bin_centers
result['xerr'] = bin_width / 2

# Plot it: an error-bar plot without connecting lines.
result.plot(
    x='x',
    y='mean',
    xerr='xerr',
    yerr='sem',
    linestyle='none',
    capsize=0,
    color='black',
)
plt.savefig('result.png', dpi=300)
Just like ROOT ;)
I made a module myself for this functionality.
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
def Profile(x, y, nbins, xmin, xmax, ax):
    """Draw a profile histogram of ``y`` versus ``x`` on ``ax``.

    ``x`` is split into ``nbins`` equal-width bins spanning ``[xmin, xmax)``.
    For every bin the mean of ``y`` is drawn at the bin centre, with the
    standard error of the mean (sample standard deviation divided by the
    square root of the number of non-NaN ``y`` values) as the vertical error
    bar and half the bin width as the horizontal error bar. Points outside
    ``[xmin, xmax)`` are ignored; empty bins produce NaN and are not drawn.

    Parameters
    ----------
    x, y : array-like or pandas.Series
        Paired samples. Series are aligned on their index by the DataFrame
        constructor.
    nbins : int
        Number of bins.
    xmin, xmax : float
        Lower and upper edge of the binned range.
    ax : object
        Anything exposing a matplotlib-compatible ``errorbar`` method.

    Returns
    -------
    ax
        The same object that was passed in.
    """
    # 2.0 forces true division even when xmin/xmax/nbins are all integers.
    half_width = (xmax - xmin) / (2.0 * nbins)
    binedges = xmin + 2 * half_width * np.arange(nbins + 1)
    bincenters = binedges[:-1] + half_width

    df = DataFrame({'x': x, 'y': y})
    # np.digitize labels in-range points 1..nbins; 0 and nbins + 1 are the
    # underflow and overflow bins.
    df['bin'] = np.digitize(df['x'], binedges)

    # One vectorised pass instead of a Python loop over bins. Reindexing
    # keeps exactly one row per in-range bin, in bin order.
    stats = (
        df.groupby('bin')['y']
        .agg(['count', 'mean', 'std'])
        .reindex(range(1, nbins + 1))
    )
    mean_error = stats['std'] / np.sqrt(stats['count'])

    # fmt='none' draws the error bars only, without markers or a line.
    ax.errorbar(
        bincenters,
        stats['mean'].values,
        yerr=mean_error.values,
        xerr=half_width,
        fmt='none',
    )
    return ax
def Profile_Matrix(frame):
    """Draw an n-by-n grid of profile histograms for the numeric columns.

    The layout follows pandas' scatter matrix: the cell in row ``i`` and
    column ``j`` profiles the row variable (y) against the column variable
    (x), so the x label on the bottom row and the y label on the left column
    describe what is actually plotted in every cell of that column / row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; only its numeric columns are used and missing values are
        excluded pairwise.

    Returns
    -------
    numpy.ndarray
        The 2-D array of axes created by ``plt.subplots``.
    """
    from matplotlib.artist import setp

    range_padding = 0.05
    nbins = 100

    df = frame._get_numeric_data()
    n = df.columns.size
    fig, axes = plt.subplots(nrows=n, ncols=n, squeeze=False)

    # No gaps between subplots.
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = df.notnull()

    # Padded (min, max) range of every column, used as the x range of the
    # profiles in that column of the grid.
    boundaries_list = []
    for a in df.columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in enumerate(df.columns):
        for j, b in enumerate(df.columns):
            # Rows where both variables are present.
            common = (mask[a] & mask[b]).values
            xmin, xmax = boundaries_list[j]
            ax = axes[i, j]

            # x is the column variable b, y is the row variable a, matching
            # the labels set below.
            Profile(df[b][common], df[a][common], nbins, xmin, xmax, ax)

            ax.set_xlabel(b)
            ax.xaxis.set_ticks_position('bottom')
            ax.xaxis.set_label_position('bottom')
            setp(ax.get_xticklabels(), rotation=90)

            ax.set_ylabel(a)
            ax.yaxis.set_ticks_position('left')
            ax.yaxis.set_label_position('left')

            # Only the left column and the bottom row show their axes.
            ax.yaxis.set_visible(j == 0)
            ax.xaxis.set_visible(i == n - 1)

    for ax in axes.flat:
        setp(ax.get_xticklabels(), fontsize=8)
        setp(ax.get_yticklabels(), fontsize=8)

    return axes
To my knowledge, matplotlib still doesn't allow you to produce profile histograms directly.
Instead, you can take a look at Hippodraw, a package developed at SLAC that can be used as a Python extension module.
Here there is a Profile histogram example:
http://www.slac.stanford.edu/grp/ek/hippodraw/datareps_root.html#datareps_profilehist
Related
This code:
N = 1000

# Two standard-normal samples; the second is then rescaled to half the spread.
df = pd.DataFrame({
    'distribution_1': np.random.randn(N),
    'distribution_2': np.random.randn(N),
})
df['distribution_2'] = df['distribution_2'] / 2

# Long format: one row per sample, labelled by the distribution it came from.
df = df.melt(
    value_vars=['distribution_1', 'distribution_2'],
    var_name='distribution',
    value_name='value',
)

g = sns.ecdfplot(data=df, x='value', hue='distribution')
g.set_ylim(-.1, 1.1)
yields a figure like this:
The CDFs of both distributions do not extend to the limits of the x-axis. I would like to know how to do this. In ggplot2 there is a boolean flag called pad that does this (see e.g. REF).
Is this also possible in seaborn? (I was unable to find it ...)
sns.ecdfplot() doesn't seem to support such an option. As a workaround, you could loop through the generated lines, and move the endpoints. (Also note that sns.ecdfplot doesn't return a FacetGrid, but a subplot. Naming the return value g might be a bit confusing when comparing to code in example and tutorials.)
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
N = 1000
df = pd.DataFrame({'distribution_1': np.random.randn(N),
                   'distribution_2': np.random.randn(N) / 2})
df = df.melt(value_vars=['distribution_1', 'distribution_2'], value_name='value', var_name='distribution')
ax = sns.ecdfplot(data=df, x='value', hue='distribution', palette='winter')
ax.set_ylim(-.1, 1.1)

# Remember the automatic limits before touching the line data.
xmin, xmax = ax.get_xlim()
for line in ax.lines:
    # Work on copies so the arrays held by the artist are not mutated
    # behind its back.
    x = np.array(line.get_xdata(), dtype=float)
    y = np.array(line.get_ydata(), dtype=float)
    # Left pad: move the first vertex to the left axis limit.
    # NOTE(review): assumes the first vertex is seaborn's padding point
    # rather than a data point - confirm for the seaborn version in use.
    x[0] = xmin
    # Right pad: append a vertex at the right limit instead of overwriting
    # the last one, so the final step stays at the largest sample rather
    # than being shifted to the axis edge.
    line.set_data(np.append(x, xmax), np.append(y, y[-1]))

# Keep the limits where they were so the padded lines end at the edges.
ax.set_xlim(xmin, xmax)
plt.show()
In the question "Have gradient colours in sns.pairplot for one column of dataframe so that I can see which datapoints are connected to each other",
very good answers were given on how to recognize which data points correspond to the same data points in the other subplots.
To have a self containing question, I state here my requirement (which is somehow an extension of the linked question):
I would like to see the interdependence of my data.
For that I want to have a gradual color gradient for one column of my DataFrame (so
that low numerical values of that column are e.g. yellow and high values are blue).
For a second column of my data, I would like to have increasing marker sizes with
increasing values of this column.
These colors and marker sizes should be visible for all non diagonal subplots of my
plot, based on the data points of a and b.
The solution to the gradient color is given in the linked question. I put here both solutions that presently exist:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
f, axes = plt.subplots(1, 1)
np.random.seed(1)

a = np.arange(0, 10, 0.1)


def myFunc(x):
    """Return 10 + 10*x - x**2 plus one uniform random number per element."""
    noise = np.random.random(x.shape[0])
    return +10 + 10*x - x**2 + 1*noise


b = myFunc(a)
c = a * np.sin(a)
df = pd.DataFrame({'a': a, 'b': b, 'c': c})

# Toggle between the two existing solutions; the colormap variant is active.
use_hue_palette = False

if not use_hue_palette:
    # Variant 2: draw a plain pairplot, then recolour every off-diagonal
    # scatter collection by the values of column 'a'.
    from matplotlib.colors import LinearSegmentedColormap
    cmap = LinearSegmentedColormap.from_list('blue-yellow', ['gold', 'lightblue', 'darkblue'])  # plt.get_cmap('viridis_r')
    g = sns.pairplot(df, corner=True)
    for ax in g.axes.flat:
        # corner=True leaves the upper triangle as None; diagonal axes
        # are left untouched.
        if ax is None or ax in g.diag_axes:
            continue
        for collection in ax.collections:
            collection.set_cmap(cmap)
            collection.set_array(df['a'])
else:
    # Variant 1: let seaborn colour the points by row index via `hue`.
    sns.pairplot(
        df,
        corner=True,
        diag_kws={'color': ".6"},
        plot_kws={
            'hue': df.index,
            'palette': "blend:gold,dodgerblue",
        },
    )

plt.show()
A (basic) solution for the increasing marker sizes would be (using simply matplotlib):
import numpy as np
import matplotlib.pyplot as plt
# Seed the generator so the figure is reproducible.
np.random.seed(19680801)

N = 50
# Drawn in this order: x positions, y positions, colour values.
x, y, colors = (np.random.rand(N) for _ in range(3))

# Marker areas (points squared): 0 to 15 point radii.
scaled = 30 * np.random.rand(N)
area = scaled**2

plt.scatter(x, y, s=area, c=colors, alpha=0.5)
plt.show()
My question is:
I could work on a manual solution to iterate over all columns of my DataFrame and build the sub plots by myself. Is there any more convenient (and probably more robust) way to do this?
You can modify the sizes and hue for the off-diagonal data easily by adding the parameters you'd use in Matplotlib to the plot_kws dictionary:
# Keywords forwarded to the off-diagonal plots: colour follows column 'a',
# marker size follows column 'b'.
off_diagonal_kws = {
    'hue': df['a'],
    'palette': "blend:gold,dodgerblue",
    'size': df['b'],
}
# The diagonal plots are drawn in a neutral grey.
diagonal_kws = {'color': ".6"}

sns.pairplot(
    df,
    corner=True,
    diag_kws=diagonal_kws,
    plot_kws=off_diagonal_kws,
)
While trying to loop over equations on matplotlib, I only get the last text from plt.text(). How can I iterate over matplotlib figures and annotate equation for each plot? Also the plt.savefig() function does not save the figures.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import pearsonr
# Tab-separated table whose first row holds the column names.
df = pd.read_csv(
    r'C:\GISN21\Ex_04\data5\data3_4.txt',
    sep="\t",
    header=0,
)
# Every column except 'Station'.
not_station = df.columns != 'Station'
df2 = df.loc[:, not_station]
def calculate_pvalues(df):
    """Return the matrix of Pearson p-values and plot the significant pairs.

    Rows containing missing values are dropped and only numeric columns are
    used. For every ordered pair of distinct columns whose p-value is below
    0.05, a regression plot annotated with the best-fit line equation is
    drawn on a new figure and saved as ``correlation0.png``,
    ``correlation1.png``, ... in the current working directory. The figures
    are left open so a later ``plt.show()`` still displays them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed by column name on both axes; each entry is the
        p-value of the corresponding pair, rounded to 8 decimals.
    """
    df = df.dropna()._get_numeric_data()
    pvalues = pd.DataFrame(index=df.columns, columns=df.columns, dtype=float)

    # Running counter so every figure gets its own file name.
    figure_index = 0
    for r in df.columns:
        for c in df.columns:
            pvalue = round(pearsonr(df[r], df[c])[1], 8)
            # A single .loc assignment instead of chained indexing, which may
            # write to a temporary copy.
            pvalues.loc[c, r] = pvalue

            # Skip the diagonal and anything not significant (this includes
            # NaN p-values, for which the comparison is False).
            if r == c or not (pvalue < 0.05):
                continue

            # Best-fit line.
            m, b = np.polyfit(df[r], df[c], 1)
            equation = 'y = {m}x + {b}'.format(m=round(m, 4), b=round(b, 4))

            f = plt.figure()
            ax = f.add_subplot(1, 1, 1)
            sns.regplot(x=df[r], y=df[c], ax=ax)
            # Axes coordinates keep the text inside the plot whatever the
            # data limits are.
            ax.text(0.5, 0.5, equation, transform=ax.transAxes)
            # Save only after everything has been drawn on the figure.
            f.savefig("correlation{i}.png".format(i=figure_index))
            figure_index += 1
    return pvalues
# Scatter-plot matrix of all columns; the current figure is written to disk
# before it is shown.
sns.pairplot(data=df, kind='scatter')
plt.savefig('correlation.png')
plt.show()
By default, plt.text uses data coordinates, and matplotlib doesn't automatically adjust the limits to include text that is not within the data limits. So unless (0.5, 0.5) is within the limits when plt.text is called it will not be visible. If this is the case, you can resolve the issue by using axis coordinates. This can be done by supplying the axis transformation to the transform keyword, i.e.
# (0.5, 0.5) is interpreted in axes coordinates, i.e. the centre of the
# current axes, independent of the data limits.
current_axes = plt.gca()
plt.text(0.5, 0.5, equation, transform=current_axes.transAxes)
Will place the Text instance in the center of the current axes.
This example is specifically relating to plotting data as a function of log(redshift+1) and having a reference redshift axis but can be easily generalised to any functional modification.
I've written a neat little function (with the help of some question/answers on here) that allows me to easily add a redshift axis to the top of a log(1+redshift) plot. I am really struggling to get meaningful minor ticks (and would rather not share my dismal efforts!).
Here is the code, including example plot:
In this case, I would like redshifts at every 0.1 increment not occupied by a major tick, with the flexibility of changing that 0.1 in the function call.
import matplotlib.pyplot as plt
import numpy as np
def add_zaxis(axis, denomination):
    """Add a top redshift axis to a plot whose x axis is log10(1 + z).

    Major ticks are placed at multiples of ``denomination`` in z, from the
    redshift corresponding to x = 0 up to the largest multiple that fits
    below the upper x limit. The x limits of ``axis`` are restored before
    returning.

    Parameters
    ----------
    axis : matplotlib axes
        Axes whose x coordinate is log10(1 + z).
    denomination : float
        Spacing of the redshift ticks.

    Returns
    -------
    The twin axes carrying the redshift ticks.
    """
    original_limits = axis.get_xlim()

    # Temporarily pin the lower limit to 0 so the tick range starts at the
    # redshift corresponding to x = 0.
    axis.set_xlim(0., None)
    z_low, z_high = ((10**edge) - 1 for edge in axis.get_xlim())

    # Round the upper end down to a whole multiple of the spacing.
    z_high = int(np.floor(z_high / denomination)) * denomination

    # The +1 includes the upper end point, which arange leaves out.
    n_ticks = len(np.arange(z_low, z_high, denomination)) + 1
    redshifts = np.linspace(z_low, z_high, n_ticks)
    tick_positions = [np.log10(1 + z) for z in redshifts]

    axz = axis.twiny()
    axz.set_xticks(tick_positions)
    axz.set_xticklabels(['{:g}'.format(z) for z in redshifts])

    # Both axes share the original x range again.
    axz.set_xlim(original_limits)
    axis.set_xlim(original_limits)
    return axz
# Positive half of a standard-normal sample stands in for redshifts.
data = np.random.randn(500)
data = data[data > 0.]

fig, ax = plt.subplots(1)
# `ax` is the current axes, so the histogram is drawn on it.
plt.hist(np.log10(data + 1), bins=22)
ax.set_xlabel('log(z+1)')
ax.minorticks_on()

# Top axis labelled in redshift, with a major tick every 0.3.
axz = add_zaxis(ax, .3)
axz.set_xlabel('z')
axz.minorticks_on()
The idea would be to use a FixedLocator to position the ticks on the axis. You may then have one FixedLocator for the major ticks and one for the minor ticks.
import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np
def add_zaxis(ax, d=0.3, dminor=0.1):
    """Add a top redshift axis to a plot whose x axis is log10(1 + z).

    Parameters
    ----------
    ax : matplotlib axes
        Axes whose x coordinate is log10(1 + z).
    d : float, optional
        Spacing of the major redshift ticks.
    dminor : float, optional
        Spacing of the minor redshift ticks.

    Returns
    -------
    The twin axes carrying the redshift ticks, labelled 'z'.
    """
    def to_position(z):
        # Redshift -> x position on the log axis.
        return np.log10(z + 1)

    def to_redshift(position):
        # x position on the log axis -> redshift.
        return 10.0**position - 1.

    xlim = ax.get_xlim()
    zmax = to_redshift(xlim[1])

    axz = ax.twiny()
    axz.set_xlim(xlim)

    # Major ticks: fixed positions with matching fixed labels.
    zs = np.arange(0, zmax, d)
    axz.xaxis.set_major_locator(matplotlib.ticker.FixedLocator(to_position(zs)))
    # '{:g}' avoids float artefacts such as 0.6000000000000001 in the labels.
    axz.xaxis.set_major_formatter(
        matplotlib.ticker.FixedFormatter(['{:g}'.format(z) for z in zs])
    )

    # Minor ticks: positions only.
    zsminor = np.arange(0, zmax, dminor)
    axz.xaxis.set_minor_locator(
        matplotlib.ticker.FixedLocator(to_position(zsminor))
    )

    # Booleans, not 'on'/'off' strings: a non-empty string is truthy, so
    # 'off' would switch the ticks on.
    axz.tick_params(axis='x', which='minor', bottom=False, top=True)
    axz.set_xlabel('z')
    return axz
# Positive half of a standard-normal sample stands in for redshifts.
data = np.random.randn(400)
data = data[data > 0.]

fig, ax = plt.subplots(1)
plt.hist(np.log10(data + 1), bins=22)
ax.set_xlabel('log(z+1)')

add_zaxis(ax)

ax.minorticks_on()
# Minor ticks of the log axis at the bottom only; the top belongs to the
# redshift axis. Booleans are required here: the string "off" is truthy and
# would leave the top ticks switched on.
ax.tick_params(axis='x', which='minor', bottom=True, top=False)
plt.show()
I'm a beginner on this forum and would appreciate some help. I have a data set of x, y coordinates, where each (x, y) point has a value. I want to plot a 2D histogram displaying the sum of the values in each bin with a color scale. With matplotlib's hexbin this is straightforward, and I can do it, e.g.:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LogNorm
# Example data. np.random.rand(0, 10) would create an empty array of shape
# (0, 10); the arguments suggest coordinates between 0 and 10 were intended.
# NOTE(review): sample size and the `mass` values are placeholders - replace
# with the real data set.
n_points = 1000
xpos = np.random.uniform(0, 10, n_points)
ypos = np.random.uniform(0, 10, n_points)
mass = np.random.rand(n_points)

# Each hexagon shows the summed `mass` of the points falling into it, on a
# logarithmic colour scale.
plt.hexbin(x=xpos, y=ypos, C=mass, cmap=plt.cm.jet, gridsize=100,
           reduce_C_function=np.sum, bins="log")
cb = plt.colorbar()
cb.ax.set_ylabel('log (sum value in each bin)')
plt.xlabel('Xpos')
plt.ylabel('Ypos')
plt.show()
However, I'm struggling to make a similar plot with histogram2d or matplotlib's hist2d. I think I have to combine binned_statistic_2d and histogram2d somehow. There is no problem if I replace the plt.hexbin line above with this:
# Plain 2-D histogram on a 50 x 50 grid with a logarithmic colour scale.
plt.hist2d(
    x=xpos,
    y=ypos,
    bins=50,
    norm=LogNorm(),
)
Any clue? I have looked on the forum but can't seem to find working code.
You could calculate the values to show in the binned 2D plot prior to plotting and then show as an imshow plot.
If you're happy to use pandas, one option would be to group the mass data according to the binned (pandas.cut) x and y data, then apply the sum (.sum()) and unstack to obtain a pivot table.
# Interval label of every row along x and along y.
x_intervals = pd.cut(df.x, bins=xbins, include_lowest=True)
y_intervals = pd.cut(df.y, bins=ybins, include_lowest=True)
# Sum the mass per (x interval, y interval) pair, then pivot the y
# intervals into columns, filling absent combinations with 0.
df.mass.groupby([x_intervals, y_intervals]).sum().unstack(fill_value=0)
Here is a complete example:
import numpy as np; np.random.seed(1)
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors
xpos = np.random.randint(0, 10, size=50)
ypos = np.random.randint(0, 10, size=50)
mass = np.random.randint(0, 75, size=50)
df = pd.DataFrame({"x": xpos, "y": ypos, "mass": mass})

xbins = range(10)
ybins = range(10)

# Sum of mass per (x bin, y bin): rows are x bins, columns are y bins.
# observed=False keeps every bin combination, including empty ones, so the
# table is always the full grid.
su = df.mass.groupby([pd.cut(df.x, bins=xbins, include_lowest=True),
                      pd.cut(df.y, bins=ybins, include_lowest=True)],
                     observed=False) \
            .sum().unstack(fill_value=0)
# Function-call form works on both Python 2 and Python 3.
print(su)

# imshow puts array rows on the vertical axis, so the table is transposed to
# get x bins along the horizontal axis, matching the tick labels below;
# origin='lower' makes y increase upwards as in hexbin / hist2d.
im = plt.imshow(su.values.T, origin='lower',
                norm=matplotlib.colors.LogNorm(1, 300))
plt.xticks(range(len(su.index)), su.index, rotation=90)
plt.yticks(range(len(su.columns)), su.columns)
plt.colorbar(im)
plt.show()