Plotting multiple Gaussians from a pandas dataframe - python

I am trying to plot multiple Gaussians on one plot, with different heights, widths and centers, from this type of dataframe:
height(y)    fwhm(width)    center(x)
24.122348    1.827472        98
24.828252    4.333549       186
26.810812    1.728494       276
25.997897    1.882424       373
24.503944    2.222210       471
27.488572    1.750039       604
31.556823    3.844592       683
27.920951    0.891394       792
27.009054    1.917744       897
Any idea on how to go about it?
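Each row parameterizes one peak of the form y(x) = height * exp(-4 ln 2 * (x - center)^2 / fwhm^2), the same FWHM-based formula used in the answer below. As a minimal, self-contained sketch of that relationship, using just the first two rows of the table and an assumed shared x grid:
import numpy as np
import matplotlib.pyplot as plt

# (height, fwhm, center) for the first two rows of the table above
params = [(24.122348, 1.827472, 98), (24.828252, 4.333549, 186)]

x = np.arange(0, 300)
fig, ax = plt.subplots()
for height, fwhm, center in params:
    # FWHM-parameterized Gaussian, scaled to the requested peak height
    y = height * np.exp(-4 * np.log(2) * (x - center) ** 2 / fwhm ** 2)
    ax.plot(x, y)
plt.show()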

We will reuse the Gaussian plotter as defined in
Creating gaussians of fixed width and std
(The code is repeated here)
Data
The following code generates the above dataframe.
import pandas as pd

data = [
(24.122348, 1.827472, 98),
(24.828252, 4.333549, 186),
(26.810812, 1.728494, 276),
(25.997897, 1.882424, 373),
(24.503944, 2.222210, 471),
(27.488572, 1.750039, 604),
(31.556823, 3.844592, 683),
(27.920951, 0.891394, 792),
(27.009054, 1.917744, 897),
]
df = pd.DataFrame(data, columns=["height", "fwhm", "center"])
Gaussian
Taken from the reference post above.
import matplotlib.cm as mpl_cm
import matplotlib.colors as mpl_colors
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import cdist
class Gaussian:
    def __init__(self, size):
        self.size = size
        self.center = np.array(self.size) / 2
        self.axis = self._calculate_axis()

    def _calculate_axis(self):
        """
        Generate a list of rows, columns over multiple axis.

        Example:
            Input: size=(5, 3)
            Output: [array([0, 1, 2, 3, 4]), array([[0], [1], [2]])]
        """
        axis = [np.arange(size).reshape(-1, *np.ones(idx, dtype=np.uint8))
                for idx, size in enumerate(self.size)]
        return axis

    def update_size(self, size):
        """ Update the size and calculate new centers and axis. """
        self.size = size
        self.center = np.array(self.size) / 2
        self.axis = self._calculate_axis()

    def create(self, dim=1, fwhm=3, center=None):
        """ Generate a gaussian distribution on the center of a certain width. """
        center = center if center is not None else self.center[:dim]
        distance = sum((ax - ax_center) ** 2 for ax_center, ax in zip(center, self.axis))
        distribution = np.exp(-4 * np.log(2) * distance / fwhm ** 2)
        return distribution

    def creates(self, dim=2, fwhm=3, centers: np.ndarray = None):
        """ Combines multiple gaussian distributions based on multiple centers. """
        centers = np.array(centers or np.array([self.center]).T).T
        indices = np.indices(self.size).reshape(dim, -1).T

        distance = np.min(cdist(indices, centers, metric='euclidean'), axis=1)
        distance = np.power(distance.reshape(self.size), 2)

        distribution = np.exp(-4 * np.log(2) * distance / fwhm ** 2)
        return distribution

    @staticmethod
    def plot(distribution, show=True):
        """ Plotter, in case you do not know the dimensions of your distribution, or want the same interface. """
        if len(distribution.shape) == 1:
            return Gaussian.plot1d(distribution, show)
        if len(distribution.shape) == 2:
            return Gaussian.plot2d(distribution, show)
        if len(distribution.shape) == 3:
            return Gaussian.plot3d(distribution, show)
        raise ValueError(f"Trying to plot {len(distribution.shape)}-dimensional data, "
                         f"Only 1D, 2D, and 3D distributions are valid.")

    @staticmethod
    def plot1d(distribution, show=True, vmin=None, vmax=None, cmap=None):
        norm = mpl_colors.Normalize(
            vmin=vmin if vmin is not None else distribution.min(),
            vmax=vmax if vmax is not None else distribution.max()
        )
        cmap = mpl_cm.ScalarMappable(norm=norm, cmap=cmap or mpl_cm.get_cmap('jet'))
        cmap.set_array(distribution)
        c = [cmap.to_rgba(value) for value in distribution]  # defines the color

        fig, ax = plt.subplots()
        ax.scatter(np.arange(len(distribution)), distribution, c=c)
        ax.plot(distribution)

        fig.colorbar(cmap)
        if show: plt.show()
        return fig

    @staticmethod
    def plot2d(distribution, show=True):
        fig, ax = plt.subplots()
        img = ax.imshow(distribution, cmap='jet')
        fig.colorbar(img)
        if show: plt.show()
        return fig

    @staticmethod
    def plot3d(distribution, show=True):
        m, n, c = distribution.shape
        x, y, z = np.mgrid[:m, :n, :c]
        out = np.column_stack((x.ravel(), y.ravel(), z.ravel(), distribution.ravel()))
        x, y, z, values = np.array(list(zip(*out)))

        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')

        # Standalone colorbar, directly creating colorbar on fig results in strange artifacts.
        img = ax.scatter([0, 0], [0, 0], [0, 0], c=[0, 1], cmap=mpl_cm.get_cmap('jet'))
        img.set_visible(False)
        fig.colorbar(img)

        ax.scatter(x, y, z, c=values, cmap=mpl_cm.get_cmap('jet'))

        if show: plt.show()
        return fig
Solution
Since it is unclear to me how you want the Gaussian distributions to interact where they overlap, I will assume that you want the maximum value.
The main logic is then: generate a Gaussian distribution for every center with the given full width at half maximum (fwhm), and take the pointwise maximum over all the distributions.
distribution = np.zeros((1200,))

df = pd.DataFrame(data, columns=["height", "fwhm", "center"])
gaussian = Gaussian(size=distribution.shape)

for idx, row in df.iterrows():
    distribution = np.maximum(distribution, gaussian.create(fwhm=row.fwhm, center=[row.center]))

gaussian.plot(distribution, show=True)
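Note that create only uses fwhm and center; if the peaks should also take their heights from the dataframe, a small variation (a sketch, not part of the original answer) is to scale each distribution before taking the maximum:
distribution = np.zeros((1200,))
for idx, row in df.iterrows():
    # scale the unit-height Gaussian by the row's height before combining
    g = row.height * gaussian.create(fwhm=row.fwhm, center=[row.center])
    distribution = np.maximum(distribution, g)
gaussian.plot(distribution, show=True)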
Result
Edit
Since the question now asks for a different distribution, you can adjust the code in the create (and creates) method with the following to get different types of distributions:
def create(self, dim=1, fwhm=3, center=None):
    """ Generate a gaussian distribution on the center of a certain width. """
    center = center if center is not None else self.center[:dim]
    distance = sum((ax - ax_center) for ax_center, ax in zip(center, self.axis))
    distribution = sps.beta.pdf(distance / max(distance), a=3, b=100)
    return distribution
Here sps.beta comes from import scipy.stats as sps, and it can be swapped for a gamma distribution as well, e.g. distribution = sps.gamma.pdf(distance, 10, 40).
Note that the distance is no longer squared, and that the argument fwhm could be replaced by whatever parameters the chosen distribution requires.

Related

How to find point inside an ellipse?

I'm trying to find the points inside an ellipse. It is not an 'ordinary' ellipse; it is based on the mean and standard deviation, which is much easier than calculating eigenvalues in order to find the confidence interval.
The function is not written by me; here are the sources:
https://matplotlib.org/devdocs/gallery/statistics/confidence_ellipse.html
https://carstenschelp.github.io/2018/09/14/Plot_Confidence_Ellipse_001.html
Here is the code:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
x = np.array([21.5,16.3,13.7,20.0,17.4,10.4,16.9,7.0,13.8,15.2,13.8,8.2,18.0,9.4,13.2,7.2,21.2,30.2,13.5,29.8,18.3,20.2,31.1,21.5,29.8,18.0,13.1,24.1,32.5,15.4,16.1,15.0,25.9,3.0,17.0,23.6,17.6,-11.8,22.2,26.6,17.8,20.6,23.0,28.0,25.3,22.1,22.4,16.3,22.0,12.1])
y = np.array([92.4,98.2,97.6,95.9,96.5,92.1,89.6,89.4,89.2,89.4,90.2,86.7,89.5,89.9,90.2,87.6,104.0,87.3,99.4,85.4,92.8,92.0,87.9,96.2,94.1,95.2,95.6,86.3,87.6,89.5,95.0,97.1,93.0,87.8,98.9,98.2,100.1,45.4,92.1,91.6,94.7,93.9,91.4,91.1,95.7,93.8,96.4,94.1,94.0,89.1])
#function obtained from matplotlib documentation
#https://matplotlib.org/devdocs/gallery/statistics/confidence_ellipse.html
def confidence_ellipse(x, y, ax, n_std=3.0, facecolor='none', **kwargs):
    """
    Create a plot of the covariance confidence ellipse of *x* and *y*.

    Parameters
    ----------
    x, y : array-like, shape (n, )
        Input data.
    ax : matplotlib.axes.Axes
        The axes object to draw the ellipse into.
    n_std : float
        The number of standard deviations to determine the ellipse's radiuses.
    **kwargs
        Forwarded to `~matplotlib.patches.Ellipse`

    Returns
    -------
    matplotlib.patches.Ellipse
    """
    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    cov = np.cov(x, y)
    pearson = cov[0, 1]/np.sqrt(cov[0, 0] * cov[1, 1])
    # Using a special case to obtain the eigenvalues of this
    # two-dimensional dataset.
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)
    ellipse = Ellipse((0, 0), width=ell_radius_x * 2, height=ell_radius_y * 2,
                      facecolor=facecolor, **kwargs)

    # Calculating the standard deviation of x from
    # the square root of the variance and multiplying
    # with the given number of standard deviations.
    scale_x = np.sqrt(cov[0, 0]) * n_std
    mean_x = np.mean(x)

    # calculating the standard deviation of y ...
    scale_y = np.sqrt(cov[1, 1]) * n_std
    mean_y = np.mean(y)

    transf = transforms.Affine2D() \
        .rotate_deg(45) \
        .scale(scale_x, scale_y) \
        .translate(mean_x, mean_y)

    ellipse.set_transform(transf + ax.transData)
    return ax.add_patch(ellipse)
#implementation
fig, ax = plt.subplots(1, 1, figsize=(8, 4))
ax.scatter(x,y,s=5)
ellipse = confidence_ellipse(x,y,ax,n_std=2,edgecolor='red')
plt.show()
Afterwards I tried to get the center coordinates and the points inside the ellipse as below:
ellipse.get_center()
Out:(0,0)
ellipse.contains_point([21.5,92.4])#first points in x,y arrays
Out:False
ellipse.contains_point([0,0])#get_center() result
Out:False
The ellipse plot is working fine, but I need the coordinates of every point inside the ellipse.
What am I doing wrong? I already checked similar questions but they didn't work either.
The confidence_ellipse example function only returns an object for drawing, and contains_point will only tell you if the point is on the ellipse.
What you probably want is something like:
import math

class distribution():
    def __init__(self, x, y):
        self.cov = np.cov(x, y)
        self.mean = np.matrix([np.mean(x), np.mean(y)])

    def dist(self, x, y):
        tmp = np.matrix([x, y])
        diff = self.mean - tmp
        dist = diff * np.linalg.inv(self.cov) * diff.T
        return math.sqrt(dist)

    def is_inside(self, x, y, nstd=2.0):
        if (self.dist(x, y) < nstd):
            return True
        else:
            return False
Then you can do :
d = distribution(x,y)
d.is_inside(24.1,86.3)
This returns True.
Then, for all the points:
points = np.array(list(zip(x, y)))
points_in = list(filter(lambda p: d.is_inside(p[0],p[1]), points))
points_out = list(filter(lambda p: not d.is_inside(p[0],p[1]), points))
x_in = [ x[0] for x in points_in]
y_in = [ x[1] for x in points_in]
x_out = [ x[0] for x in points_out]
y_out = [ x[1] for x in points_out]
fig2, ax2 = plt.subplots(1, 1, figsize=(8, 8))
ax2.scatter(x_in,y_in,s=5, facecolor="green")
ax2.scatter(x_out,y_out, s=5, facecolor="red")
ellipse = confidence_ellipse(x,y,ax2,n_std=2,edgecolor='red') # this presupposes that you still have the confidence_ellipse still defined
plt.show()
And your output should look something like this:
Where the red points are more than 2 standard deviations away, and the green ones are inside.
You can plot all the (x, y) labels on the plot.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
x = np.array([21.5,16.3,13.7,20.0,17.4,10.4,16.9,7.0,13.8,15.2,13.8,8.2,18.0,9.4,13.2,7.2,21.2,30.2,13.5,29.8,18.3,20.2,31.1,21.5,29.8,18.0,13.1,24.1,32.5,15.4,16.1,15.0,25.9,3.0,17.0,23.6,17.6,-11.8,22.2,26.6,17.8,20.6,23.0,28.0,25.3,22.1,22.4,16.3,22.0,12.1])
y = np.array([92.4,98.2,97.6,95.9,96.5,92.1,89.6,89.4,89.2,89.4,90.2,86.7,89.5,89.9,90.2,87.6,104.0,87.3,99.4,85.4,92.8,92.0,87.9,96.2,94.1,95.2,95.6,86.3,87.6,89.5,95.0,97.1,93.0,87.8,98.9,98.2,100.1,45.4,92.1,91.6,94.7,93.9,91.4,91.1,95.7,93.8,96.4,94.1,94.0,89.1])
# confidence_ellipse(x, y, ax, n_std=3.0, facecolor='none', **kwargs)
# is the same function as defined above (from the matplotlib confidence-ellipse example).
# implementation
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
ax.scatter(x, y, s=15)
ellipse = confidence_ellipse(x, y, ax, n_std=2, edgecolor='red')

# zip joins x and y coordinates in pairs
for x, y in zip(x, y):
    label = f"({x},{y})"
    # label = "{:.2f}".format(y)  # plot just the y-value of the point
    # print(label)  # uncomment if you want to print the points for reference

    plt.annotate(label,  # this is the text
                 (x, y),  # this is the point to label
                 textcoords="offset points",  # how to position the text
                 xytext=(0, 10),  # distance from text to points (x,y)
                 ha='center')  # horizontal alignment can be left, right or center

plt.show()
P.S.: You may need to adjust xytext accordingly; it is just the offset at which each label is drawn in the figure.
You can also print those values for reference. Just uncomment print(label) and it will print all the points for you:
(21.5,92.4)
(16.3,98.2)
(13.7,97.6)
(20.0,95.9)
(17.4,96.5)
(10.4,92.1)
(16.9,89.6)
(7.0,89.4)
(13.8,89.2)
(15.2,89.4)
(13.8,90.2)
(8.2,86.7)
(18.0,89.5)
(9.4,89.9)
(13.2,90.2)
(7.2,87.6)
(21.2,104.0)
(30.2,87.3)
(13.5,99.4)
(29.8,85.4)
(18.3,92.8)
(20.2,92.0)
(31.1,87.9)
(21.5,96.2)
(29.8,94.1)
(18.0,95.2)
(13.1,95.6)
(24.1,86.3)
(32.5,87.6)
(15.4,89.5)
(16.1,95.0)
(15.0,97.1)
(25.9,93.0)
(3.0,87.8)
(17.0,98.9)
(23.6,98.2)
(17.6,100.1)
(-11.8,45.4)
(22.2,92.1)
(26.6,91.6)
(17.8,94.7)
(20.6,93.9)
(23.0,91.4)
(28.0,91.1)
(25.3,95.7)
(22.1,93.8)
(22.4,96.4)
(16.3,94.1)
(22.0,94.0)
(12.1,89.1)

How to combine two outputs of KernelDensity from sklearn (Python)

I want to multiply two Kernel Density Estimates to get a composite likelihood ... The multiplication operator doesn't exist for sklearn.
I have KDEs for data from 3 independent sources and I want to multiply the KDE from each source. The code below is from https://scikit-learn.org/stable/auto_examples/neighbors/plot_species_kde.html#sphx-glr-auto-examples-neighbors-plot-species-kde-py and should run easily.
I made some small modifications to get two KDEs and then multiply them, which fails.
Can someone help? The code below should run without error except for the multiplication part.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_species_distributions
from sklearn.neighbors import KernelDensity
# if basemap is available, we'll use it.
# otherwise, we'll improvise later...
try:
    from mpl_toolkits.basemap import Basemap
    basemap = True
except ImportError:
    basemap = False

def construct_grids(batch):
    """Construct the map grid from the batch object

    Parameters
    ----------
    batch : Batch object
        The object returned by :func:`fetch_species_distributions`

    Returns
    -------
    (xgrid, ygrid) : 1-D arrays
        The grid corresponding to the values in batch.coverages
    """
    # x,y coordinates for corner cells
    xmin = batch.x_left_lower_corner + batch.grid_size
    xmax = xmin + (batch.Nx * batch.grid_size)
    ymin = batch.y_left_lower_corner + batch.grid_size
    ymax = ymin + (batch.Ny * batch.grid_size)

    # x coordinates of the grid cells
    xgrid = np.arange(xmin, xmax, batch.grid_size)
    # y coordinates of the grid cells
    ygrid = np.arange(ymin, ymax, batch.grid_size)

    return (xgrid, ygrid)

# Get matrices/arrays of species IDs and locations
data = fetch_species_distributions()
species_names = ['Bradypus Variegatus', 'Microryzomys Minutus']

Xtrain = np.vstack([data['train']['dd lat'], data['train']['dd long']]).T
ytrain = np.array([d.decode('ascii').startswith('micro')
                   for d in data['train']['species']], dtype='int')
Xtrain *= np.pi / 180. # Convert lat/long to radians
# Set up the data grid for the contour plot
xgrid, ygrid = construct_grids(data)
X, Y = np.meshgrid(xgrid[::5], ygrid[::5][::-1])
land_reference = data.coverages[6][::5, ::5]
land_mask = (land_reference > -9999).ravel()
xy = np.vstack([Y.ravel(), X.ravel()]).T
xy = xy[land_mask]
xy *= np.pi / 180.
# Plot map of South America with distributions of each species
fig = plt.figure()
fig.subplots_adjust(left=0.05, right=0.95, wspace=0.05)
kde0 = KernelDensity(bandwidth=0.04, metric='haversine',
                     kernel='gaussian', algorithm='ball_tree')
kde0.fit(Xtrain[ytrain == 0])

kde1 = KernelDensity(bandwidth=0.04, metric='haversine',
                     kernel='gaussian', algorithm='ball_tree')
kde1.fit(Xtrain[ytrain == 1])
kde01=kde0*kde1
plt.subplot(1, 1, 1)
# evaluate only on the land: -9999 indicates ocean
Z = np.full(land_mask.shape[0], -9999, dtype='int')
Z[land_mask] = np.exp(kde01.score_samples(xy))
Z = Z.reshape(X.shape)
# plot contours of the density
levels = np.linspace(0, Z.max(), 25)
plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)
plt.show()
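For the multiplication itself: KernelDensity objects cannot be multiplied, but score_samples returns log-densities, so the pointwise product of two KDEs corresponds to the sum of their log-densities. A hedged sketch of how the failing kde01 line and the evaluation above could be replaced (the float dtype is my assumption, and the result is an unnormalized composite likelihood):
# instead of kde01 = kde0 * kde1:
Z = np.full(land_mask.shape[0], -9999, dtype='float64')
log_dens = kde0.score_samples(xy) + kde1.score_samples(xy)  # log(p0 * p1)
Z[land_mask] = np.exp(log_dens)
Z = Z.reshape(X.shape)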

Dividing matplotlib histogram by maximum bin value

I want to plot multiple histograms on the same plot and I need to compare the spread of the data. I want to do this by dividing each histogram by its maximum value so all the distributions have the same scale. However, the way matplotlib's histogram function works, I have not found an easy way to do this.
This is because n in
n, bins, patches = ax1.hist(y, bins = 20, histtype = 'step', color = 'k')
is the number of counts in each bin, but I cannot pass this back to hist since it will recalculate.
I have tried the normed and density arguments, but these normalise the area of the distribution rather than its height. I could duplicate n and then repeat the bin edges using the bins output, but this is tedious. Surely the hist function must allow the bin values to be divided by a constant?
Example code is below, demonstrating the problem.
import numpy as np
import matplotlib.pyplot as plt

y1 = np.random.randn(100)
y2 = 2*np.random.randn(50)
x1 = np.linspace(1,101,100)
x2 = np.linspace(1,51,50)
gs = plt.GridSpec(1,2, wspace = 0, width_ratios = [3,1])
ax = plt.subplot(gs[0])
ax1 = plt.subplot(gs[1])
ax1.yaxis.set_ticklabels([]) # remove the major tick labels
ax.scatter(x1, y1, marker='+',color = 'k')#, c=SNR, cmap=plt.cm.Greys)
ax.scatter(x2, y2, marker='o',color = 'k')#, c=SNR, cmap=plt.cm.Greys)
n1, bins1, patches1 = ax1.hist(y1, bins = 20, histtype = 'step', color = 'k',linewidth = 2, orientation = 'horizontal')
n2, bins2, patched2 = ax1.hist(y2, bins = 20, histtype = 'step', linestyle = 'dashed', color = 'k', orientation = 'horizontal')
I do not know whether matplotlib allows this normalisation by default but I wrote a function to do it myself.
It takes the output of n and bins from plt.hist (as above) and then passes this through the function below.
def hist_norm_height(n, bins, const):
    ''' Function to normalise bin height by a constant.
        Needs n and bins from np.histogram or ax.hist. '''
    n = np.repeat(n, 2)
    n = np.float32(n) / const
    new_bins = [bins[0]]
    new_bins.extend(np.repeat(bins[1:], 2))
    return n, new_bins[:-1]
To plot now (I like step histograms), you pass it to plt.step.
Such as plt.step(new_bins,n). This will give you a histogram with height normalised by a constant.
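For instance, a hedged usage sketch with y1 from the question:
# normalise the histogram's height to 1 and redraw it as a step plot
n, bins = np.histogram(y1, bins=20)
n_scaled, new_bins = hist_norm_height(n, bins, n.max())
plt.step(new_bins, n_scaled, color='k')
plt.show()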
You can assign the argument bins equal to a list of values. Use np.arange() or np.linspace() to generate the values. http://matplotlib.org/api/axes_api.html?highlight=hist#matplotlib.axes.Axes.hist
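For example, a sketch reusing y1 and y2 from the question: passing the same explicit bin edges to every hist call keeps the two histograms directly comparable:
# shared, explicit bin edges for both datasets
bin_edges = np.linspace(min(y1.min(), y2.min()), max(y1.max(), y2.max()), 21)
ax1.hist(y1, bins=bin_edges, histtype='step', color='k', orientation='horizontal')
ax1.hist(y2, bins=bin_edges, histtype='step', linestyle='dashed', color='k', orientation='horizontal')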
A slightly different approach, set up for comparisons. It could be adapted to the step style:
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
y = []
y.append(np.random.normal(2, 2, size=40))
y.append(np.random.normal(3, 1.5, size=40))
y.append(np.random.normal(4,4,size=40))
ls = ['dashed','dotted','solid']
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3)
for l, data in zip(ls, y):
    n, b, p = ax1.hist(data, normed=False,
                       #histtype='step', #step's too much of a pain to get the bins
                       #color='k', linestyle=l,
                       alpha=0.2
                       )
    ax2.hist(data, normed=True,
             #histtype = 'step', color='k', linestyle=l,
             alpha=0.2
             )
    n, b, p = ax3.hist(data, normed=False,
                       #histtype='step', #step's too much of a pain to get the bins
                       #color='k', linestyle=l,
                       alpha=0.2
                       )
    high = float(max([r.get_height() for r in p]))
    for r in p:
        r.set_height(r.get_height()/high)
        ax3.add_patch(r)
    ax3.set_ylim(0, 1)
ax1.set_title('hist')
ax2.set_title('area==1')
ax3.set_title('fix height')
plt.show()
a couple outputs:
This can be accomplished using numpy to obtain a priori histogram values, and then plotting them with a bar plot.
import numpy as np
import matplotlib.pyplot as plt
# Define random data and number of bins to use
x = np.random.randn(1000)
bins = 10
plt.figure()
# Obtain the bin values and edges using numpy
hist, bin_edges = np.histogram(x, bins=bins, density=True)
# Plot bars with the proper positioning, height, and width.
plt.bar(
    (bin_edges[1:] + bin_edges[:-1]) * .5, hist / hist.max(),
    width=(bin_edges[1] - bin_edges[0]), color="blue")
plt.show()

python: scatter plot with median and CI

I am looking for a python plot along the lines of http://www.r-bloggers.com/visually-weighted-watercolor-plots-new-variants-please-vote/
This gives the equivalent of the standard deviation bands:
# generate random variables
x,y = generate_random()
# bin the values and determine the envelopes
df = bin_by(x, y, nbins=25, bins = None)
###
# Plot 1
###
# determine the colors
cols = ['#EE7550', '#F19463', '#F6B176']
with plt.style.context('fivethirtyeight'):
    # plot the 3rd stdv
    plt.fill_between(df.x, df['5th'], df['95th'], alpha=0.7, color=cols[2])
    plt.fill_between(df.x, df['10th'], df['90th'], alpha=0.7, color=cols[1])
    plt.fill_between(df.x, df['25th'], df['75th'], alpha=0.7, color=cols[0])

    # plot the line
    plt.plot(df.x, df['median'], color='1', alpha=0.7, linewidth=1)

    # plot the points
    plt.scatter(x, y, facecolors='white', edgecolors='0', s=5, lw=0.7)
plt.savefig('fig1.png', facecolor='white', edgecolor='none')
plt.show()
def bin_by(x, y, nbins=30, bins=None):
    """
    Divide the x axis into sections and return groups of y based on its x value
    """
    if bins is None:
        bins = np.linspace(x.min(), x.max(), nbins)

    bin_space = (bins[-1] - bins[0]) / (len(bins) - 1) / 2
    indicies = np.digitize(x, bins + bin_space)
There is a bit of a discussion and a link to my GitHub on my blog.
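The bin_by helper is only shown partially above; one hedged way to finish it (my assumption, not the original author's code) is to group y by x-bin and compute the median and percentile envelopes per bin:
import numpy as np
import pandas as pd

def bin_by(x, y, nbins=30, bins=None):
    """ Sketch: return per-bin median and percentile envelopes of y. """
    if bins is None:
        bins = np.linspace(x.min(), x.max(), nbins)
    bin_space = (bins[-1] - bins[0]) / (len(bins) - 1) / 2
    indices = np.clip(np.digitize(x, bins + bin_space), 0, len(bins) - 1)
    df = pd.DataFrame({'x': bins[indices], 'y': y})
    out = df.groupby('x')['y'].quantile([.05, .10, .25, .50, .75, .90, .95]).unstack()
    out.columns = ['5th', '10th', '25th', 'median', '75th', '90th', '95th']
    return out.reset_index()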
This is cut and pasted from my larger piece of code. It does not give what I want; I am posting it per Evert's suggestion.
fig = plt.figure(figsize=(8, 8))
plt.plot(xlist, ylist, 'b,')
plt.plot([0.0,0.8],[0.0,0.8],'y-')
data2d=zip(xlist,ylist)
bins = np.linspace(0.0, 0.2, 21)
medianlist=binpercentile(data2d,bins)
c10list=binpercentile(data2d,bins,0.1)
c90list=binpercentile(data2d,bins,0.9)
centerbins=[(x+y)/2.0 for x,y in zip(bins[:-1],bins[1:])]
centerbins.insert(0,0)
medianlist.insert(0,0)
c10list.insert(0,0)
c90list.insert(0,0)
plt.plot(centerbins,c10list,'r--')
plt.plot(centerbins,c90list,'r--')
plt.plot(centerbins,medianlist,'r-')
imagefilename='%s.%s'%('.'.join(infile.split('.')[0:-1]),'diffmed.pdf')
plt.savefig(imagefilename)

Generate a heatmap using a scatter data set

I have a set of X,Y data points (about 10k) that are easy to plot as a scatter plot but that I would like to represent as a heatmap.
I looked through the examples in Matplotlib and they all seem to already start with heatmap cell values to generate the image.
Is there a method that converts a bunch of x, y, all different, to a heatmap (where zones with higher frequency of x, y would be "warmer")?
If you don't want hexagons, you can use numpy's histogram2d function:
import numpy as np
import numpy.random
import matplotlib.pyplot as plt
# Generate some test data
x = np.random.randn(8873)
y = np.random.randn(8873)
heatmap, xedges, yedges = np.histogram2d(x, y, bins=50)
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.clf()
plt.imshow(heatmap.T, extent=extent, origin='lower')
plt.show()
This makes a 50x50 heatmap. If you want, say, 512x384, you can put bins=(512, 384) in the call to histogram2d.
Example:
In Matplotlib lexicon, i think you want a hexbin plot.
If you're not familiar with this type of plot, it's just a bivariate histogram in which the xy-plane is tessellated by a regular grid of hexagons.
So from a histogram, you can just count the number of points falling in each hexagon, discretize the plotting region as a set of windows, assign each point to one of these windows; finally, map the windows onto a color array, and you've got a hexbin diagram.
Though less commonly used than, e.g., circles or squares, it is intuitive that hexagons are a better choice for the geometry of the binning container:
hexagons have nearest-neighbor symmetry (square bins don't:
the distance from a point on a square's border to a point
inside that square is not everywhere equal) and
a hexagon is the highest-n polygon that gives a regular plane
tessellation (i.e., you can safely re-model your kitchen floor with hexagonal tiles because you won't have any void space between the tiles when you are finished -- not true for any other higher-n polygon, n >= 7).
(Matplotlib uses the term hexbin plot; so do (AFAIK) all of the plotting libraries for R; still I don't know if this is the generally accepted term for plots of this type, though I suspect it's likely, given that hexbin is short for hexagonal binning, which describes the essential step in preparing the data for display.)
from matplotlib import pyplot as PLT
from matplotlib import cm as CM
from matplotlib import mlab as ML
import numpy as NP
n = 1e5
x = y = NP.linspace(-5, 5, 100)
X, Y = NP.meshgrid(x, y)
Z1 = ML.bivariate_normal(X, Y, 2, 2, 0, 0)
Z2 = ML.bivariate_normal(X, Y, 4, 1, 1, 1)
ZD = Z2 - Z1
x = X.ravel()
y = Y.ravel()
z = ZD.ravel()
gridsize=30
PLT.subplot(111)
# if 'bins=None', then color of each hexagon corresponds directly to its count
# 'C' is optional--it maps values to x-y coordinates; if 'C' is None (default) then
# the result is a pure 2D histogram
PLT.hexbin(x, y, C=z, gridsize=gridsize, cmap=CM.jet, bins=None)
PLT.axis([x.min(), x.max(), y.min(), y.max()])
cb = PLT.colorbar()
cb.set_label('mean value')
PLT.show()
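Note that mlab.bivariate_normal has been removed from recent matplotlib releases; a hedged equivalent for Z1 and Z2, reusing X, Y and the NP alias from the snippet above, is:
# replacement for the removed mlab.bivariate_normal, using scipy
from scipy.stats import multivariate_normal
pos = NP.dstack((X, Y))
Z1 = multivariate_normal([0, 0], [[2**2, 0], [0, 2**2]]).pdf(pos)   # sigmax=2, sigmay=2, mu=(0, 0)
Z2 = multivariate_normal([1, 1], [[4**2, 0], [0, 1**2]]).pdf(pos)   # sigmax=4, sigmay=1, mu=(1, 1)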
Edit: For a better approximation of Alejandro's answer, see below.
I know this is an old question, but I wanted to add something to Alejandro's answer: if you want a nice smoothed image without using py-sphviewer, you can instead use np.histogram2d and apply a gaussian filter (from scipy.ndimage.filters) to the heatmap:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.ndimage.filters import gaussian_filter
def myplot(x, y, s, bins=1000):
    heatmap, xedges, yedges = np.histogram2d(x, y, bins=bins)
    heatmap = gaussian_filter(heatmap, sigma=s)

    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    return heatmap.T, extent
fig, axs = plt.subplots(2, 2)
# Generate some test data
x = np.random.randn(1000)
y = np.random.randn(1000)
sigmas = [0, 16, 32, 64]
for ax, s in zip(axs.flatten(), sigmas):
    if s == 0:
        ax.plot(x, y, 'k.', markersize=5)
        ax.set_title("Scatter plot")
    else:
        img, extent = myplot(x, y, s)
        ax.imshow(img, extent=extent, origin='lower', cmap=cm.jet)
        ax.set_title("Smoothing with $\sigma$ = %d" % s)
plt.show()
Produces:
The scatter plot and s=16 plotted on top of each other for Agape Gal'lo (click for a better view):
One difference I noticed between my gaussian filter approach and Alejandro's approach was that his method shows local structures much better than mine. Therefore I implemented a simple nearest-neighbour method at pixel level. This method calculates, for each pixel, the inverse sum of the distances of the n closest points in the data. This method is pretty computationally expensive at high resolution, and I think there's a quicker way, so let me know if you have any improvements.
Update: As I suspected, there's a much faster method using Scipy's scipy.cKDTree. See Gabriel's answer for the implementation.
Anyway, here's my code:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
def data_coord2view_coord(p, vlen, pmin, pmax):
    dp = pmax - pmin
    dv = (p - pmin) / dp * vlen
    return dv

def nearest_neighbours(xs, ys, reso, n_neighbours):
    im = np.zeros([reso, reso])
    extent = [np.min(xs), np.max(xs), np.min(ys), np.max(ys)]

    xv = data_coord2view_coord(xs, reso, extent[0], extent[1])
    yv = data_coord2view_coord(ys, reso, extent[2], extent[3])
    for x in range(reso):
        for y in range(reso):
            xp = (xv - x)
            yp = (yv - y)

            d = np.sqrt(xp**2 + yp**2)
            im[y][x] = 1 / np.sum(d[np.argpartition(d.ravel(), n_neighbours)[:n_neighbours]])

    return im, extent
n = 1000
xs = np.random.randn(n)
ys = np.random.randn(n)
resolution = 250
fig, axes = plt.subplots(2, 2)
for ax, neighbours in zip(axes.flatten(), [0, 16, 32, 64]):
    if neighbours == 0:
        ax.plot(xs, ys, 'k.', markersize=2)
        ax.set_aspect('equal')
        ax.set_title("Scatter Plot")
    else:
        im, extent = nearest_neighbours(xs, ys, resolution, neighbours)
        ax.imshow(im, origin='lower', extent=extent, cmap=cm.jet)
        ax.set_title("Smoothing over %d neighbours" % neighbours)
        ax.set_xlim(extent[0], extent[1])
        ax.set_ylim(extent[2], extent[3])
plt.show()
Result:
Instead of using np.histogram2d, which in general produces quite ugly histograms, I would like to recycle py-sphviewer, a python package for rendering particle simulations using an adaptive smoothing kernel, which can be easily installed from pip (see webpage documentation). Consider the following code, which is based on the example:
import numpy as np
import numpy.random
import matplotlib.pyplot as plt
import sphviewer as sph
def myplot(x, y, nb=32, xsize=500, ysize=500):
    xmin = np.min(x)
    xmax = np.max(x)
    ymin = np.min(y)
    ymax = np.max(y)

    x0 = (xmin+xmax)/2.
    y0 = (ymin+ymax)/2.

    pos = np.zeros([len(x), 3])
    pos[:, 0] = x
    pos[:, 1] = y
    w = np.ones(len(x))

    P = sph.Particles(pos, w, nb=nb)
    S = sph.Scene(P)
    S.update_camera(r='infinity', x=x0, y=y0, z=0,
                    xsize=xsize, ysize=ysize)
    R = sph.Render(S)
    R.set_logscale()
    img = R.get_image()
    extent = R.get_extent()
    for i, j in zip(range(4), [x0, x0, y0, y0]):
        extent[i] += j
    print(extent)
    return img, extent
fig = plt.figure(1, figsize=(10,10))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)
# Generate some test data
x = np.random.randn(1000)
y = np.random.randn(1000)
#Plotting a regular scatter plot
ax1.plot(x,y,'k.', markersize=5)
ax1.set_xlim(-3,3)
ax1.set_ylim(-3,3)
heatmap_16, extent_16 = myplot(x,y, nb=16)
heatmap_32, extent_32 = myplot(x,y, nb=32)
heatmap_64, extent_64 = myplot(x,y, nb=64)
ax2.imshow(heatmap_16, extent=extent_16, origin='lower', aspect='auto')
ax2.set_title("Smoothing over 16 neighbors")
ax3.imshow(heatmap_32, extent=extent_32, origin='lower', aspect='auto')
ax3.set_title("Smoothing over 32 neighbors")
#Make the heatmap using a smoothing over 64 neighbors
ax4.imshow(heatmap_64, extent=extent_64, origin='lower', aspect='auto')
ax4.set_title("Smoothing over 64 neighbors")
plt.show()
which produces the following image:
As you see, the images look pretty nice, and we are able to identify different substructures in them. These images are constructed by spreading a given weight for every point over a certain domain, defined by the smoothing length, which in turn is given by the distance to the closest nb neighbor (I've chosen 16, 32 and 64 for the examples). So, higher-density regions are typically spread over smaller areas compared to lower-density regions.
The function myplot is just a very simple function that I've written in order to give the x,y data to py-sphviewer to do the magic.
If you are using matplotlib 1.2.x or newer:
import numpy as np
import matplotlib.pyplot as plt
x = np.random.randn(100000)
y = np.random.randn(100000)
plt.hist2d(x,y,bins=100)
plt.show()
Seaborn now has the jointplot function which should work nicely here:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Generate some test data
x = np.random.randn(8873)
y = np.random.randn(8873)
sns.jointplot(x=x, y=y, kind='hex')
plt.show()
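A smoothed variant of the same data is available via the kde kind (assuming a reasonably recent seaborn):
sns.jointplot(x=x, y=y, kind='kde')
plt.show()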
Here's Jurgy's great nearest neighbour approach but implemented using scipy.cKDTree. In my tests it's about 100x faster.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.spatial import cKDTree
def data_coord2view_coord(p, resolution, pmin, pmax):
    dp = pmax - pmin
    dv = (p - pmin) / dp * resolution
    return dv
n = 1000
xs = np.random.randn(n)
ys = np.random.randn(n)
resolution = 250
extent = [np.min(xs), np.max(xs), np.min(ys), np.max(ys)]
xv = data_coord2view_coord(xs, resolution, extent[0], extent[1])
yv = data_coord2view_coord(ys, resolution, extent[2], extent[3])
def kNN2DDens(xv, yv, resolution, neighbours, dim=2):
    """
    """
    # Create the tree
    tree = cKDTree(np.array([xv, yv]).T)
    # Find the closest nnmax-1 neighbors (first entry is the point itself)
    grid = np.mgrid[0:resolution, 0:resolution].T.reshape(resolution**2, dim)
    dists = tree.query(grid, neighbours)
    # Inverse of the sum of distances to each grid point.
    inv_sum_dists = 1. / dists[0].sum(1)

    # Reshape
    im = inv_sum_dists.reshape(resolution, resolution)
    return im
fig, axes = plt.subplots(2, 2, figsize=(15, 15))
for ax, neighbours in zip(axes.flatten(), [0, 16, 32, 63]):
    if neighbours == 0:
        ax.plot(xs, ys, 'k.', markersize=5)
        ax.set_aspect('equal')
        ax.set_title("Scatter Plot")
    else:
        im = kNN2DDens(xv, yv, resolution, neighbours)
        ax.imshow(im, origin='lower', extent=extent, cmap=cm.Blues)
        ax.set_title("Smoothing over %d neighbours" % neighbours)
        ax.set_xlim(extent[0], extent[1])
        ax.set_ylim(extent[2], extent[3])
plt.savefig('new.png', dpi=150, bbox_inches='tight')
And the initial question was... how to convert scatter values to grid values, right?
histogram2d does count the frequency per cell; however, if you have other data per cell than just the frequency, you need some additional work.
x = data_x # between -10 and 4, log-gamma of an svc
y = data_y # between -4 and 11, log-C of an svc
z = data_z #between 0 and 0.78, f1-values from a difficult dataset
So, I have a dataset with Z-results for X and Y coordinates. However, I was calculating a few points outside the area of interest (large gaps), and heaps of points in a small area of interest.
Yes, here it becomes more difficult, but also more fun. Some libraries (sorry):
from matplotlib import pyplot as plt
from matplotlib import cm
import numpy as np
from scipy.interpolate import griddata
pyplot is my graphics engine today,
cm is a range of color maps with some interesting choices,
numpy is for the calculations,
and griddata is for attaching values to a fixed grid.
The last one is especially important, because the frequency of xy points is not equally distributed in my data. First, let's start with some boundaries fitting my data and an arbitrary grid size. The original data has datapoints also outside those x and y boundaries.
#determine grid boundaries
gridsize = 500
x_min = -8
x_max = 2.5
y_min = -2
y_max = 7
So we have defined a grid with 500 pixels between the min and max values of x and y.
In my data, there are lots more than the 500 values available in the area of high interest, whereas in the low-interest area there are not even 200 values in the total grid; between the graphic boundaries of x_min and x_max there are even fewer.
So for getting a nice picture, the task is to get an average for the high interest values and to fill the gaps elsewhere.
I define my grid now. For each xx-yy pair, i want to have a color.
xx = np.linspace(x_min, x_max, gridsize) # array of x values
yy = np.linspace(y_min, y_max, gridsize) # array of y values
grid = np.array(np.meshgrid(xx, yy.T))
grid = grid.reshape(2, grid.shape[1]*grid.shape[2]).T
Why the strange shape? scipy.griddata wants a shape of (n, D).
Griddata calculates one value per point in the grid, by a predefined method.
I choose "nearest" - empty grid points will be filled with values from the nearest neighbor. This looks as if the areas with less information have bigger cells (even if it is not the case). One could choose to interpolate "linear", then areas with less information look less sharp. Matter of taste, really.
points = np.array([x, y]).T # because griddata wants it that way
z_grid2 = griddata(points, z, grid, method='nearest')
# you get a 1D vector as result. Reshape to picture format!
z_grid2 = z_grid2.reshape(xx.shape[0], yy.shape[0])
And hop, we hand over to matplotlib to display the plot
fig = plt.figure(1, figsize=(10, 10))
ax1 = fig.add_subplot(111)
ax1.imshow(z_grid2, extent=[x_min, x_max, y_min, y_max],
           origin='lower', cmap=cm.magma)
ax1.set_title("SVC: empty spots filled by nearest neighbours")
ax1.set_xlabel('log gamma')
ax1.set_ylabel('log C')
plt.show()
Around the pointy part of the V-Shape, you see I did a lot of calculations during my search for the sweet spot, whereas the less interesting parts almost everywhere else have a lower resolution.
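If interpolation is not needed and a per-cell mean of z is enough, np.histogram2d's weights argument can also do the "additional work" mentioned above. A minimal sketch:
# per-cell mean of z = (sum of z per cell) / (count per cell)
z_sum, xedges, yedges = np.histogram2d(x, y, bins=50, weights=z)
counts, _, _ = np.histogram2d(x, y, bins=50)
z_mean = np.divide(z_sum, counts, out=np.full_like(z_sum, np.nan), where=counts > 0)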
Make a 2-dimensional array that corresponds to the cells in your final image, called say heatmap_cells and instantiate it as all zeroes.
Choose two scaling factors that define the difference between each array element in real units, for each dimension, say x_scale and y_scale. Choose these such that all your datapoints will fall within the bounds of the heatmap array.
For each raw datapoint with x_value and y_value:
heatmap_cells[floor(x_value/x_scale),floor(y_value/y_scale)]+=1
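A runnable sketch of that scheme (the scale factors and random test data are assumptions):
import numpy as np
import matplotlib.pyplot as plt

x = np.random.randn(10000)
y = np.random.randn(10000)
x_scale, y_scale = 0.1, 0.1

# shift by the minimum so every index is non-negative, then bin by flooring
ix = np.floor((x - x.min()) / x_scale).astype(int)
iy = np.floor((y - y.min()) / y_scale).astype(int)
heatmap_cells = np.zeros((iy.max() + 1, ix.max() + 1))
np.add.at(heatmap_cells, (iy, ix), 1)  # increment each datapoint's cell

plt.imshow(heatmap_cells, origin='lower')
plt.show()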
Very similar to @Piti's answer, but using 1 call instead of 2 to generate the points:
import numpy as np
import matplotlib.pyplot as plt
pts = 1000000
mean = [0.0, 0.0]
cov = [[1.0,0.0],[0.0,1.0]]
x,y = np.random.multivariate_normal(mean, cov, pts).T
plt.hist2d(x, y, bins=50, cmap=plt.cm.jet)
plt.show()
Output:
Here's one I made on a 1 Million point set with 3 categories (colored Red, Green, and Blue). Here's a link to the repository if you'd like to try the function. Github Repo
histplot(
    X,
    Y,
    labels,
    bins=2000,
    range=((-3,3),(-3,3)),
    normalize_each_label=True,
    colors=[
        [1,0,0],
        [0,1,0],
        [0,0,1]],
    gain=50)
I'm afraid I'm a little late to the party, but I had a similar question a while ago. The accepted answer (by @ptomato) helped me out, but I'd also like to post this in case it's of use to someone.
''' I wanted to create a heatmap resembling a football pitch which would show the different actions performed '''
import numpy as np
import matplotlib.pyplot as plt
import random
#fixing random state for reproducibility
np.random.seed(1234324)
fig = plt.figure(12)
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
#Ratio of the pitch with respect to UEFA standards
hmap= np.full((6, 10), 0)
#print(hmap)
xlist = np.random.uniform(low=0.0, high=100.0, size=(20))
ylist = np.random.uniform(low=0.0, high =100.0, size =(20))
#UEFA Pitch Standards are 105m x 68m
xlist = (xlist/100)*10.5
ylist = (ylist/100)*6.5
ax1.scatter(xlist,ylist)
#int of the co-ordinates to populate the array
xlist_int = xlist.astype (int)
ylist_int = ylist.astype (int)
#print(xlist_int, ylist_int)
for i, j in zip(xlist_int, ylist_int):
    # this populates the array according to the x,y co-ordinate values it encounters
    hmap[j][i] = hmap[j][i] + 1
#Reversing the rows is necessary
hmap = hmap[::-1]
#print(hmap)
im = ax2.imshow(hmap)
Here's the result
None of these solutions worked for my application, so this is what I came up with. Essentially I am placing a 2D Gaussian at every single point:
import cv2
import numpy as np
import matplotlib.pyplot as plt
def getGaussian2D(ksize, sigma, norm=True):
    oneD = cv2.getGaussianKernel(ksize=ksize, sigma=sigma)
    twoD = np.outer(oneD.T, oneD)
    return twoD / np.sum(twoD) if norm else twoD

def pts2heat(pts, shape, kernel=16, sigma=5):
    heat = np.zeros(shape)
    k = getGaussian2D(kernel, sigma)
    for y, x in pts:
        x, y = int(x), int(y)
        for i in range(-kernel//2, kernel//2):
            for j in range(-kernel//2, kernel//2):
                if 0 <= x+i < shape[0] and 0 <= y+j < shape[1]:
                    heat[x+i, y+j] = heat[x+i, y+j] + k[i+kernel//2, j+kernel//2]
    return heat

# pts (iterable of (y, x) points) and img (the associated image) are assumed to be defined
heat = pts2heat(pts, img.shape[:2])
plt.imshow(heat, cmap='hot')
Here are the points overlaid on top of the associated image, along with the resulting heat map:
