How to plot a 3D density map in Python with matplotlib

I have a large dataset of (x, y, z) protein positions and would like to plot areas of high occupancy as a heatmap. Ideally the output should look similar to the volumetric visualisation below, but I'm not sure how to achieve this with matplotlib.
My initial idea was to display my positions as a 3D scatter plot and color their density via a KDE. I coded this up as follows with test data:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
mu, sigma = 0, 0.1
x = np.random.normal(mu, sigma, 1000)
y = np.random.normal(mu, sigma, 1000)
z = np.random.normal(mu, sigma, 1000)
xyz = np.vstack([x,y,z])
density = stats.gaussian_kde(xyz)(xyz)
idx = density.argsort()
x, y, z, density = x[idx], y[idx], z[idx], density[idx]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c=density)
plt.show()
This works well! However, my real data contains many thousands of data points, and calculating the KDE and drawing the scatter plot become extremely slow.
A small sample of my real data:
My research would suggest that a better option is to evaluate the Gaussian KDE on a grid. I'm just not sure how to do this in 3D:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
mu, sigma = 0, 0.1
x = np.random.normal(mu, sigma, 1000)
y = np.random.normal(mu, sigma, 1000)
nbins = 50
xy = np.vstack([x,y])
density = stats.gaussian_kde(xy)
xi, yi = np.mgrid[x.min():x.max():nbins*1j, y.min():y.max():nbins*1j]
di = density(np.vstack([xi.flatten(), yi.flatten()]))
fig = plt.figure()
ax = fig.add_subplot(111)
ax.pcolormesh(xi, yi, di.reshape(xi.shape))
plt.show()
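A minimal sketch of the same grid evaluation extended to 3D, assuming the synthetic test data above: the KDE is evaluated on a three-axis np.mgrid and the grid points are drawn as a scatter coloured by density. This stays within matplotlib, but it is much coarser than the volume rendering shown further down:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
mu, sigma = 0, 0.1
x = np.random.normal(mu, sigma, 1000)
y = np.random.normal(mu, sigma, 1000)
z = np.random.normal(mu, sigma, 1000)
nbins = 20
kde = stats.gaussian_kde(np.vstack([x, y, z]))
# Evaluate the KDE on a regular 3D grid instead of at every data point
xi, yi, zi = np.mgrid[x.min():x.max():nbins*1j,
                      y.min():y.max():nbins*1j,
                      z.min():z.max():nbins*1j]
di = kde(np.vstack([xi.ravel(), yi.ravel(), zi.ravel()]))
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Colour the grid points by density; a low alpha keeps the cloud readable
ax.scatter(xi.ravel(), yi.ravel(), zi.ravel(), c=di, alpha=0.1)
plt.show()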

Thanks to mwaskon for suggesting the mayavi library.
I recreated the density scatter plot in mayavi as follows:
import numpy as np
from scipy import stats
from mayavi import mlab
mu, sigma = 0, 0.1
x = 10*np.random.normal(mu, sigma, 5000)
y = 10*np.random.normal(mu, sigma, 5000)
z = 10*np.random.normal(mu, sigma, 5000)
xyz = np.vstack([x,y,z])
kde = stats.gaussian_kde(xyz)
density = kde(xyz)
# Plot scatter with mayavi
figure = mlab.figure('DensityPlot')
pts = mlab.points3d(x, y, z, density, scale_mode='none', scale_factor=0.07)
mlab.axes()
mlab.show()
Setting scale_mode to 'none' prevents the glyphs from being scaled in proportion to the density vector. In addition, for large datasets, I disabled scene rendering and used a mask to reduce the number of points.
# Plot scatter with mayavi
figure = mlab.figure('DensityPlot')
figure.scene.disable_render = True
pts = mlab.points3d(x, y, z, density, scale_mode='none', scale_factor=0.07)
mask = pts.glyph.mask_points
mask.maximum_number_of_points = x.size
mask.on_ratio = 1
pts.glyph.mask_input_points = True
figure.scene.disable_render = False
mlab.axes()
mlab.show()
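Note that with maximum_number_of_points set to x.size and on_ratio set to 1, the mask still lets every point through; to actually thin a very large cloud you can raise on_ratio so that only every n-th point is drawn. A hedged variation on the snippet above:
# Variation: draw roughly one point in ten instead of all of them
mask.maximum_number_of_points = x.size
mask.on_ratio = 10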
Next, to evaluate the Gaussian KDE on a grid:
import numpy as np
from scipy import stats
from mayavi import mlab
mu, sigma = 0, 0.1
x = 10*np.random.normal(mu, sigma, 5000)
y = 10*np.random.normal(mu, sigma, 5000)
z = 10*np.random.normal(mu, sigma, 5000)
xyz = np.vstack([x,y,z])
kde = stats.gaussian_kde(xyz)
# Evaluate kde on a grid
xmin, ymin, zmin = x.min(), y.min(), z.min()
xmax, ymax, zmax = x.max(), y.max(), z.max()
xi, yi, zi = np.mgrid[xmin:xmax:30j, ymin:ymax:30j, zmin:zmax:30j]
coords = np.vstack([item.ravel() for item in [xi, yi, zi]])
density = kde(coords).reshape(xi.shape)
# Plot the density as a volume with mayavi
figure = mlab.figure('DensityPlot')
grid = mlab.pipeline.scalar_field(xi, yi, zi, density)
dmin = density.min()
dmax = density.max()
mlab.pipeline.volume(grid, vmin=dmin, vmax=dmin + .5*(dmax-dmin))
mlab.axes()
mlab.show()
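If the semi-transparent volume is hard to read, an isosurface view of the same grid is another option. A short sketch, assuming the xi, yi, zi and density arrays from the block above:
# Sketch: isosurfaces of the same density grid
figure = mlab.figure('DensityContours')
mlab.contour3d(xi, yi, zi, density, contours=8, opacity=0.3)
mlab.axes()
mlab.show()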
As a final improvement, I sped up the evaluation of the kernel density estimate by calling the kde function in parallel.
import numpy as np
from scipy import stats
from mayavi import mlab
import multiprocessing
def calc_kde(data):
    return kde(data.T)
mu, sigma = 0, 0.1
x = 10*np.random.normal(mu, sigma, 5000)
y = 10*np.random.normal(mu, sigma, 5000)
z = 10*np.random.normal(mu, sigma, 5000)
xyz = np.vstack([x,y,z])
kde = stats.gaussian_kde(xyz)
# Evaluate kde on a grid
xmin, ymin, zmin = x.min(), y.min(), z.min()
xmax, ymax, zmax = x.max(), y.max(), z.max()
xi, yi, zi = np.mgrid[xmin:xmax:30j, ymin:ymax:30j, zmin:zmax:30j]
coords = np.vstack([item.ravel() for item in [xi, yi, zi]])
# Multiprocessing
cores = multiprocessing.cpu_count()
pool = multiprocessing.Pool(processes=cores)
results = pool.map(calc_kde, np.array_split(coords.T, 2))
density = np.concatenate(results).reshape(xi.shape)
# Plot the density as a volume with mayavi
figure = mlab.figure('DensityPlot')
grid = mlab.pipeline.scalar_field(xi, yi, zi, density)
dmin = density.min()
dmax = density.max()
mlab.pipeline.volume(grid, vmin=dmin, vmax=dmin + .5*(dmax-dmin))
mlab.axes()
mlab.show()
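One small tweak worth noting: the call above always splits the grid into two chunks, regardless of how many workers the pool has. Splitting into one chunk per core keeps every process busy (a minor variation, not in the original):
# Variation: one chunk per CPU core instead of a fixed split of 2
results = pool.map(calc_kde, np.array_split(coords.T, cores))
density = np.concatenate(results).reshape(xi.shape)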

Related

Multivariate KDE Scipy Stats - what if it's not Gaussian?

I have some 2D data that I am smoothing using:
from scipy.stats import gaussian_kde
kde = gaussian_kde(data)
but what if my data isn't Gaussian/tophat/one of the other options? Mine looks more elliptical before smoothing, so shouldn't I really have a different bandwidth in x than in y? The variance in one direction is a lot higher, and the values on the x axis are also much larger, so it feels like a single simple Gaussian might miss something?
This is what I get with your defined X and Y. Seems good. Were you expecting something different?
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
def generate(n):
    # generate data
    np.random.seed(42)
    x = np.random.normal(size=n, loc=1, scale=0.01)
    np.random.seed(1)
    y = np.random.normal(size=n, loc=200, scale=100)
    return x, y
x, y = generate(100)
xmin = x.min()
xmax = x.max()
ymin = y.min()
ymax = y.max()
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([x, y])
kernel = stats.gaussian_kde(values)
Z = np.reshape(kernel(positions).T, X.shape)
fig, ax = plt.subplots(figsize=(7, 7))
ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
          extent=[xmin, xmax, ymin, ymax],
          aspect='auto', alpha=.75)
ax.plot(x, y, 'ko', ms=5)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
plt.show()
The distributions of x and y are Gaussian.
You can verify this with seaborn too:
import pandas as pd
import seaborn as sns
# I pass a DataFrame because passing (x, y) alone will soon be deprecated
g = sns.jointplot(data=pd.DataFrame({'x':x, 'y':y}), x='x', y='y')
g.plot_joint(sns.kdeplot, color="r", zorder=0, levels=6)
Update:
A kernel density estimate of 2-dimensional data can be built separately along each axis and then joined together.
Let's make an example with the dataset we already used.
As we can see in the seaborn jointplot, you have not only the estimated 2d-kde but also marginal distributions of x and y (the histograms).
So, step by step, let's estimate the density of x and of y, and then evaluate each density over a linear space (np.linspace):
kde_x = stats.gaussian_kde(x)
kde_x_space = np.linspace(x.min(), x.max(), 100)
kde_x_eval = kde_x.evaluate(kde_x_space)
kde_x_eval /= kde_x_eval.sum()
kde_y = stats.gaussian_kde(y)
kde_y_space = np.linspace(y.min(), y.max(), 100)
kde_y_eval = kde_y.evaluate(kde_y_space)
kde_y_eval /= kde_y_eval.sum()
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax[0].plot(kde_x_space, kde_x_eval, 'k.')
ax[0].set(title='KDE of x')
ax[1].plot(kde_y_space, kde_y_eval, 'k.')
ax[1].set(title='KDE of y')
plt.show()
So we now have the marginal distributions of x and y. These are probability density functions, so the joint probability of x and y can be seen as the intersection of independent events: we can multiply the estimated densities of x and y into a 2D matrix and plot it on a 3D projection.
# Grid of x and y
X, Y = np.meshgrid(kde_x_space, kde_y_space)
# Grid of probability density
kX, kY = np.meshgrid(kde_x_eval, kde_y_eval)
# Intersection
Z = kX * kY
fig, ax = plt.subplots(
    2, 2,
    subplot_kw={"projection": "3d"},
    figsize=(10, 10))
for i, (elev, anim, title) in enumerate(zip([10, 10, 25, 25],
                                            [0, -90, 25, -25],
                                            ['y axis', 'x axis', 'view 1', 'view 2']
                                            )):
    # Plot the surface.
    surf = ax.flat[i].plot_surface(X, Y, Z, cmap=plt.cm.gist_earth_r,
                                   linewidth=0, antialiased=False, alpha=.75)
    ax.flat[i].scatter(x, y, zs=0, zdir='z', c='k')
    ax.flat[i].set(
        xlabel='x', ylabel='y',
        title=title
    )
    ax.flat[i].view_init(elev=elev, azim=anim)
plt.show()
This is a very simple and naive method, but it gives an idea of how it works and of why the scales of x and y don't matter for a 2D KDE.
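One point worth adding on the original bandwidth question: scipy's gaussian_kde scales its kernel by the sample covariance of the data, so a larger spread in y than in x already translates into a larger effective bandwidth along y; the overall amount of smoothing can still be tuned through bw_method. A short check, assuming the x and y generated above:
from scipy import stats
import numpy as np
kernel = stats.gaussian_kde(np.vstack([x, y]))   # bandwidth from Scott's rule by default
print(kernel.factor)        # scalar bandwidth factor
print(kernel.covariance)    # kernel covariance = factor**2 * data covariance
smoother = stats.gaussian_kde(np.vstack([x, y]), bw_method=kernel.factor * 2)  # broader kernel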

Plot surface with binary colormap

I would like to make a 3d plot of a surface parametrised by a function, and I would like the surface to be of one color (say white) where it is above some value a, and of another color (say black) where it is below a.
Here is the code to generate and plot the surface (the way the surface is generated is not important, it could be a much simpler function):
from __future__ import division
import numpy as np
import time,random
random.seed(-2)
def build_spden(N, M, alpha):
    # computes the spectral density in momentum space
    sp_den = np.zeros((N, M))
    # plain range here; the original snippet used prange, which is not defined in this script
    for k1 in range(-N//2, N//2):
        for k2 in range(-M//2, M//2):
            sp_den[k1, k2] = np.abs(2*(np.cos(2*np.pi*k1/N)+np.cos(2*np.pi*k2/M)-2))
    sp_den[0, 0] = 1
    return 1/sp_den**(alpha/2)
def gaussian_field(N, M, alpha):
    '''Builds a correlated gaussian field on a surface NxM'''
    spectral_density = build_spden(N, M, alpha)
    # FFT of gaussian noise:
    noise_real = np.random.normal(0, 1, size=(N, M))
    noise_fourier = np.fft.fft2(noise_real)
    # Add correlations by Fourier Filtering Method:
    convolution = noise_fourier*np.sqrt(spectral_density)
    # Take IFFT and exclude residual complex part
    correlated_noise = np.fft.ifft2(convolution).real
    # Return normalized field
    return correlated_noise * (np.sqrt(N*M)/np.sqrt(np.sum(spectral_density)))
#PLOT
N = 2**5
alpha = .75
a = -.1985
surf = gaussian_field(N,N,alpha)
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
x = np.outer(np.arange(0, N), np.ones(N))
y = x.copy().T # transpose
z = surf
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.plot_surface(x, y, z,alpha=.4) #plot the surface
z2 = a*np.ones((N,N))
ax.plot_surface(x, y, z2, alpha=0.9) #plot a plane z = a.
plt.show()
The output is:
I would therefore like the surface to be white above the plane and black below.
Many thanks!
You can define a custom colormap and pass it to plot_surface:
from matplotlib.colors import ListedColormap, BoundaryNorm
cmap = ListedColormap(['r', 'b'])
norm = BoundaryNorm([z.min(), a, z.max()], cmap.N)
ax.plot_surface(x, y, z, cmap=cmap, norm=norm, alpha=.4) #plot the surface
z2 = a*np.ones((N,N))
ax.plot_surface(x, y, z2, alpha=0.9) #plot a plane z = a.
plt.show()
Output:
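To get the white-above/black-below colouring the question actually asks for, the same BoundaryNorm trick works with 'k' and 'w' in that order, since the first colour is used for values below a (a small variation on the answer above):
# Black below the plane z = a, white above it
cmap = ListedColormap(['k', 'w'])
norm = BoundaryNorm([z.min(), a, z.max()], cmap.N)
ax.plot_surface(x, y, z, cmap=cmap, norm=norm, alpha=.4)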

How to take into account the data's uncertainty (standard deviation) when fitting with scipy.linalg.lstsq?

I am trying to fit a surface to 3D data (z is a function of x and y). I have asymmetric error bars for each point. I would like the fit to take this uncertainty into account.
I am using scipy.linalg.lstsq(). It does not have any option for uncertainties in its arguments.
I am trying to adapt some code found on this page.
import numpy as np
import scipy.linalg
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
# Create data with x and y random over [-2, 2], and z a Gaussian function of x and y.
np.random.seed(12345)
x = 2 * (np.random.random(500) - 0.5)
y = 2 * (np.random.random(500) - 0.5)
def f(x, y):
    return np.exp(-(x + y ** 2))
z = f(x, y)
data = np.c_[x,y,z]
# regular grid covering the domain of the data
mn = np.min(data, axis=0)
mx = np.max(data, axis=0)
X,Y = np.meshgrid(np.linspace(mn[0], mx[0], 20), np.linspace(mn[1], mx[1], 20))
XX = X.flatten()
YY = Y.flatten()
# best-fit quadratic curve (2nd-order)
A = np.c_[np.ones(data.shape[0]), data[:,:2], np.prod(data[:,:2], axis=1), data[:,:2]**2]
C,_,_,_ = scipy.linalg.lstsq(A, data[:,2])
# evaluate it on a grid
Z = np.dot(np.c_[np.ones(XX.shape), XX, YY, XX*YY, XX**2, YY**2], C).reshape(X.shape)
# plot points and fitted surface using Matplotlib
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')  # fig.gca(projection='3d') no longer accepts keyword arguments in recent matplotlib
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, alpha=0.2)
ax.scatter(data[:,0], data[:,1], data[:,2], c='r', s=50)
plt.xlabel('X')
plt.ylabel('Y')
ax.set_zlabel('Z')
ax.axis('equal')
ax.axis('tight')
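One standard way to fold per-point uncertainties into a linear least-squares fit (a sketch of weighted least squares, not something from the original post) is to divide each row of A and the corresponding z value by that point's standard deviation before calling lstsq; for asymmetric error bars, averaging the upper and lower errors is a common rough approximation:
# Sketch: weighted least squares by scaling each equation with 1/sigma_i.
# sigma_lo and sigma_hi are hypothetical per-point error arrays; taking their
# mean is only a crude way to handle asymmetric error bars.
sigma_lo = np.full_like(z, 0.05)
sigma_hi = np.full_like(z, 0.10)
sigma = 0.5 * (sigma_lo + sigma_hi)
Aw = A / sigma[:, np.newaxis]   # scale each row of the design matrix
bw = data[:, 2] / sigma         # scale the observations the same way
Cw, _, _, _ = scipy.linalg.lstsq(Aw, bw)
Zw = np.dot(np.c_[np.ones(XX.shape), XX, YY, XX*YY, XX**2, YY**2], Cw).reshape(X.shape)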

putting limits to x,y,z interpolated heatmap in matplotlib

I'm plotting a heatmap for which I have the value z (the heatmap colour) at each pair of spatial x, y coordinates, but I only want to show the z values in the range [z0, z1], with z0 = 0.0 and z1 = 0.4, while some of the interpolated z values fall below and above those boundaries.
from numpy.random import uniform, seed
from matplotlib.mlab import griddata
import matplotlib.pyplot as plt
import numpy as np
# make up data.
#npts = int(raw_input('enter # of random points to plot:'))
seed(0)
npts = 200
x = uniform(-2, 2, npts)
y = uniform(-2, 2, npts)
z = x*np.exp(-x**2 - y**2)
# define grid.
xi = np.linspace(0, 1, 1000)
yi = np.linspace(0, 1, 1000)
# grid the data.
zi = griddata(x, y, z, xi, yi, interp='linear')
# contour the gridded data, plotting dots at the nonuniform data points.
CS = plt.contourf(xi, yi, zi, 15, cmap=plt.cm.rainbow,
                  vmax=abs(zi).max(), vmin=-abs(zi).max())
plt.colorbar() # draw colorbar
# plot data points.
plt.show()
I would like to restrict the colorbar and the heatmap colours to the range 0.0 to 0.4 (i.e. avoid values below 0.0 and above 0.4 in both the heatmap and the colorbar).
How to do that? Thanks
You can set the out-of-range values in a NumPy float array to None (which stores them as NaN) so that they are left unplotted. For example,
zmin = 0.0
zmax = 0.4
zi[(zi<zmin) | (zi>zmax)] = None
CS = plt.contourf(xi, yi, zi, 15, cmap=plt.cm.rainbow,
                  vmax=zmax, vmin=zmin)
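An alternative that keeps the out-of-range cells explicitly masked rather than overwritten (a small variation on the answer above) is to use a NumPy masked array:
# Variation: mask values outside [0.0, 0.4] instead of assigning None/NaN
zi_m = np.ma.masked_outside(zi, zmin, zmax)
CS = plt.contourf(xi, yi, zi_m, 15, cmap=plt.cm.rainbow, vmin=zmin, vmax=zmax)
plt.colorbar()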

Matplotlib contour from xyz data: griddata invalid index

I'm trying to do a contour plot using matplotlib of a file with the following format:
x1 y1 z1
x2 y2 z2
etc
I can load it with numpy.loadtxt to get the vectors. So far, no trouble.
I read this to learn how to plot, and I can reproduce it by copy-paste, so I'm sure nothing is wrong with my installation:
http://matplotlib.org/examples/pylab_examples/griddata_demo.html
I understand I have to input x and y as vectors and z as an array, which can be done with griddata. This is also what I find on this site.
The documentation says:
zi = griddata(x, y, z, xi, yi) fits a surface of the form z = f(x, y) to the data in the (usually) nonuniformly spaced vectors (x, y, z). griddata() interpolates this surface at the points specified by (xi, yi) to produce zi. xi and yi must describe a regular grid, can be either 1D or 2D, but must be monotonically increasing.
For the sake of the example, I have written this code:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as ml
x=np.linspace(1.,10.,20)
y=np.linspace(1.,10.,20)
z=np.linspace(1.,2.,20)
xi=np.linspace(1.,10.,10)
yi=np.linspace(1.,10.,10)
zi = ml.griddata(x,y,z,xi,yi)
However, I get the following error when it comes to the griddata:
IndexError: invalid index
So, I tried to modify the example from the docs a bit, as follows:
from matplotlib.mlab import griddata
import matplotlib.pyplot as plt
import numpy as np
x = np.linspace(-2.1,2.1,300)
y = np.linspace(-2.1,2.1,300)
z = x*np.exp(-x**2-y**2)
# define grid.
xi = np.linspace(-2.1,2.1,100)
yi = np.linspace(-2.1,2.1,200)
# grid the data.
zi = griddata(x,y,z,xi,yi,interp='linear')
And I get the same error. I don't understand what's going wrong.
Thanks for your help.
Consider:
x = np.linspace(1., 10., 20)
y = np.linspace(1., 10., 20)
z = np.linspace(1., 2., 20)
This means we know the z-values at certain points along the line x=y.
From there,
zi = ml.griddata(x,y,z,xi,yi)
is asking mlab.griddata to extrapolate the values of z for all points in a rectangular grid.
We've given a lot of information about how z varies along this line, but no information about how z varies in the perpendicular direction (away from the x = y line). An error is being raised because mlab.griddata refuses to guess.
You'll get better results if your initial x, y data are distributed more randomly:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as ml
ndata = 10
ny, nx = 100, 200
xmin, xmax = 1, 10
ymin, ymax = 1, 10
# x = np.linspace(1, 10, ndata)
# y = np.linspace(1, 10, ndata)
x = np.random.randint(xmin, xmax, ndata)
y = np.random.randint(ymin, ymax, ndata)
z = np.random.random(ndata)
xi = np.linspace(xmin, xmax, nx)
yi = np.linspace(ymin, ymax, ny)
zi = ml.griddata(x, y, z, xi, yi)
plt.contour(xi, yi, zi, 15, linewidths = 0.5, colors = 'k')
plt.pcolormesh(xi, yi, zi, cmap = plt.get_cmap('rainbow'))
plt.colorbar()
plt.scatter(x, y, marker = 'o', c = 'b', s = 5, zorder = 10)
plt.xlim(xmin, xmax)
plt.ylim(ymin, ymax)
plt.show()
If you want mlab.griddata to extrapolate data along the line x=y to the entire grid in an arbitrary way, you could add two extra boundary points (xmin, ymax, z[0]) and (xmax,ymin,z[-1]):
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as ml
np.random.seed(8)
ndata = 10
ny, nx = 100, 200
xmin, xmax = 1, 10
ymin, ymax = 1, 10
x = np.linspace(1, 10, ndata)
y = np.linspace(1, 10, ndata)
z = np.random.random(ndata)
x = np.r_[x,xmin,xmax]
y = np.r_[y,ymax,ymin]
z = np.r_[z,z[0],z[-1]]
xi = np.linspace(xmin, xmax, nx)
yi = np.linspace(ymin, ymax, ny)
# Requires installation of natgrid
# http://sourceforge.net/projects/matplotlib/files/matplotlib-toolkits/
zi = ml.griddata(x, y, z, xi, yi, interp='nn')
# Or, without natgrid:
# zi = ml.griddata(x, y, z, xi, yi, interp='linear')
plt.contour(xi, yi, zi, 15, linewidths = 0.5, colors = 'k')
plt.pcolormesh(xi, yi, zi, cmap = plt.get_cmap('rainbow'))
plt.colorbar()
plt.scatter(x, y, marker = 'o', c = 'b', s = 10, zorder = 10)
plt.xlim(xmin, xmax)
plt.ylim(ymin, ymax)
plt.show()
OK, I finally found the solution. For those interested, here is the trick: use griddata from SciPy with the 'nearest' method.
from scipy.interpolate import griddata
import numpy as np
import matplotlib.pyplot as plt
x=np.linspace(1.,10.,20)
y=np.linspace(1.,10.,20)
z = np.random.random(20)
xi=np.linspace(1.,10.,10)
yi=np.linspace(1.,10.,10)
X,Y= np.meshgrid(xi,yi)
Z = griddata((x, y), z, (X, Y),method='nearest')
plt.contourf(X,Y,Z)
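For what it's worth, 'nearest' is the method that fills the whole grid here because it simply copies the value of the closest sample into every cell; 'linear' and 'cubic' only interpolate inside the convex hull of the input points, which is degenerate when every point lies on the line x = y, so they either fail or leave the grid as NaN. A quick check, using the same arrays:
# For comparison: linear interpolation needs a non-degenerate convex hull
try:
    Z_lin = griddata((x, y), z, (X, Y), method='linear')
    print(np.isnan(Z_lin).mean())   # fraction of the grid left uninterpolated
except Exception as exc:            # Qhull rejects exactly collinear input
    print('linear interpolation failed:', exc)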
