I'm trying to do a little bit of distribution plotting and fitting in Python using SciPy for stats and matplotlib for the plotting. I'm having good luck with some things like creating a histogram:
# density=True replaces the `normed` keyword, which was removed in Matplotlib 3.1
myHist = hist(data, 100, density=True)
I can even take the same gamma parameters and plot the line function of the probability distribution function (after some googling):
# frozen gamma distribution; the three positional values are shape, loc and scale
rv = ss.gamma(5,100,22)
# evaluation grid on [0, 600] (np.linspace uses its default number of samples)
x = np.linspace(0,600)
# draw the PDF as a line; plt.plot returns the list of Line2D objects
h = plt.plot(x, rv.pdf(x))
How would I go about plotting the histogram myHist with the PDF line h superimposed on top of the histogram? I'm hoping this is trivial, but I have been unable to figure it out.
Just put both pieces together:
import scipy.stats as ss
import numpy as np
import matplotlib.pyplot as plt
# gamma parameters: shape (alpha), location and scale (beta)
alpha, loc, beta = 5, 100, 22
# density=True scales the bars to unit area so they are comparable with the PDF;
# it replaces the `normed` keyword, which was removed in Matplotlib 3.1
myHist = plt.hist(data, 100, density=True)
rv = ss.gamma(alpha, loc, beta)
x = np.linspace(0, 600)
# both calls draw on the current Axes, so the line is superimposed on the bars
h = plt.plot(x, rv.pdf(x), lw=2)
To make sure you get what you want in any specific plot instance, try to create a figure object first:
import scipy.stats as ss
import numpy as np
import matplotlib.pyplot as plt
# setting up the axes explicitly, so both artists are drawn on the same Axes
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
# now plot
alpha, loc, beta = 5, 100, 22
# density=True replaces the `normed` keyword, which was removed in Matplotlib 3.1
myHist = ax.hist(data, 100, density=True)
rv = ss.gamma(alpha, loc, beta)
x = np.linspace(0, 600)
h = ax.plot(x, rv.pdf(x), lw=2)
# show
plt.show()
One may also be interested in plotting the distribution function of an arbitrary histogram.
This can be done using seaborn's KDE function:
import numpy as np # for random data
import pandas as pd # for convinience
import matplotlib.pyplot as plt # for graphics
import seaborn as sns # for nicer graphics
# two samples: v1 is normal noise, v2 depends linearly on v1 plus extra noise
v1 = pd.Series(np.random.normal(0, 10, 1000), name='v1')
v2 = pd.Series(2 * v1 + np.random.normal(60, 15, 1000), name='v2')
# plot a kernel density estimation over a stacked barchart
# (density=True replaces the `normed` keyword, which was removed in Matplotlib 3.1)
plt.hist([v1, v2], histtype='barstacked', density=True)
v3 = np.concatenate((v1, v2))
# KDE of the pooled sample on the same Axes; without this call v3 was never used
sns.kdeplot(v3)
(From a Coursera course on data visualization with Python.)
Expanding on Malik's answer, and trying to stick with vanilla NumPy, SciPy and Matplotlib. I've pulled in Seaborn, but it's only used to provide nicer defaults and small visual tweaks:
import numpy as np
import scipy.stats as sps
import matplotlib.pyplot as plt
import seaborn as sns
# parameterise our distributions
d1 = sps.norm(0, 10)
d2 = sps.norm(60, 15)

# sample values from above distributions
y1 = d1.rvs(300)
y2 = d2.rvs(200)
# combine mixture
ys = np.concatenate([y1, y2])

# create new figure with size given explicitly
plt.figure(figsize=(10, 6))

# add histogram showing individual components
plt.hist([y1, y2], 31, histtype='barstacked', density=True, alpha=0.4, edgecolor='none')

# get X limits and fix them, so the curves added below cannot rescale the axis
mn, mx = plt.xlim()
plt.xlim(mn, mx)

# add our distributions to figure; each PDF is weighted by its share of the
# pooled sample so it matches the stacked, jointly normalised histogram
x = np.linspace(mn, mx, 301)
plt.plot(x, d1.pdf(x) * (len(y1) / len(ys)), color='C0', ls='--', label='d1')
plt.plot(x, d2.pdf(x) * (len(y2) / len(ys)), color='C1', ls='--', label='d2')

# estimate Kernel Density and plot
kde = sps.gaussian_kde(ys)
plt.plot(x, kde.pdf(x), label='KDE')

# finish up: the label= arguments above are only shown once a legend is drawn
plt.legend()
plt.ylabel('Probability density')
gives us the following plot:
I've tried to stick with a minimal feature set while producing relatively nice output, notably using SciPy to estimate the KDE is very easy.
I am trying to plot Gaussian kernel density curves together with the histograms of two data sets in Python.
However, in my script, finding the 95% limit (data1, marked by the red vertical line) and the 5% limit (data2, marked by the black vertical line) is very time-consuming: I have to try different upper limits by hand (see the comments in the code, where I change the upper limit) until the integrated probability under the kernel density curve reaches 95% or 5%.
Could someone suggest a way to fix this, or another approach to plot the kernel density curve along with its 95% and 5% probability limits?
Thank you!
My script is here.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
# NOTE(review): `result` is not defined in this snippet; presumably a table loaded
# earlier that is indexable by column name — confirm against the full script
data1 = result['95_24'] # data 1
data2 = result['5_24'] # data 2
def plot_prob_density(data1, data2, x_start1, x_end1):
    """Plot histograms and Gaussian KDE curves for two data sets on one Axes.

    Parameters
    ----------
    data1, data2 : array-like
        Samples passed to both ``Axes.hist`` and ``KernelDensity.fit``; the
        caller in this script reshapes them to a single column first.
    x_start1 : float
        x position of the black dashed vertical line (5 % marker).
    x_end1 : float
        x position of the red dashed vertical line (95 % marker).

    Returns
    -------
    tuple
        ``(kd_data1, kd_data2)``, the two fitted ``KernelDensity`` estimators.
    """
    fig, (ax1) = plt.subplots(1, 1, figsize=(6, 5), sharey=False)
    # evaluation grid as a column vector, the shape score_samples is given here
    x = np.linspace(-20, 20, 1000)[:, np.newaxis]
    # both histograms share the same bin edges
    bins = np.linspace(-20, 20, 40)

    # Histogram plot of data
    ax1.hist(data1, bins=bins, density=True, color='r', alpha=0.4)
    ax1.hist(data2, bins=bins, density=True, color='k', alpha=0.4)

    # kernel density estimation
    kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
    kd_data2 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data2)
    # score_samples returns log-density, so exponentiate to get the density
    kd_vals_data1 = np.exp(kd_data1.score_samples(x))
    kd_vals_data2 = np.exp(kd_data2.score_samples(x))

    # density plot
    ax1.plot(x, kd_vals_data1, color='r', label='$Na$', linewidth=2)
    ax1.plot(x, kd_vals_data2, color='k', label='$Λ$', linewidth=2)

    # vertical markers for the probability limits; raw strings keep the
    # backslash in "\%" without relying on an invalid escape sequence
    ax1.axvline(x=x_end1, color='red', linestyle='dashed', linewidth=3, label=r'$β_{95\%}$')
    ax1.axvline(x=x_start1, color='k', linestyle='dashed', linewidth=3, label=r'$β_{5\%}$')

    # Show the plots
    ax1.set_ylabel('Probability density', fontsize=12)
    ax1.set_xlabel('Beta', fontsize=12)
    ax1.set_xlim([-20, 20])
    ax1.set_ylim(0, 0.3)
    ax1.set_yticks([0, 0.1, 0.2, 0.3])
    ax1.set_xticks([-20, 20, -10, 10, 0])
    ax1.legend(fontsize=12, loc='upper left', frameon=False)
    return kd_data1, kd_data2
# Calculation of 95% and 5 % for data1 and data2 Kernel density curve
def get_probability(start_value, end_value, eval_points, kd):
    """Approximate the probability mass of a fitted KDE on an interval.

    Parameters
    ----------
    start_value, end_value : float
        Lower and upper integration limits.
    eval_points : int
        Number of evenly spaced evaluation points (at least 2).
    kd : object
        Fitted estimator exposing ``score_samples(x)``, which returns the
        log-density for a column vector ``x``.

    Returns
    -------
    float
        Integral of the density over the interval, rounded to 4 decimals.

    Raises
    ------
    ValueError
        If ``eval_points`` is smaller than 2 (the step size would be undefined).
    """
    if eval_points < 2:
        raise ValueError("eval_points must be at least 2")
    step = (end_value - start_value) / (eval_points - 1)  # Step size
    x = np.linspace(start_value, end_value, eval_points)[:, np.newaxis]
    kd_vals = np.exp(kd.score_samples(x))  # PDF value for each x
    # Trapezoidal rule: the two end points carry half weight. Summing all N
    # samples times `step` would integrate over one extra step.
    probability = step * (kd_vals.sum() - 0.5 * (kd_vals[0] + kd_vals[-1]))
    return np.round(probability, 4)
# reshape each data set to a single column before fitting the estimators
data1 = np.array(data1).reshape(-1, 1)
data2 = np.array(data2).reshape(-1, 1)
kd_data1, kd_data2= plot_prob_density(data1, data2, x_start1=-2.2, x_end1=5.3)
# ##############################
print('Beta-95%: {}'
.format(get_probability(start_value = -20,
end_value = 5.3,
eval_points = 1000,
kd = kd_data1)))
# Here I modify end_value by hand and re-run until the printed value reaches 95%,
# then take that end_value as the 95% limit. This is tedious and confusing; I want
# to compute the 95% limit directly, and likewise the 5% limit computed below:
print('Beta-5%: {}\n'
.format(get_probability(start_value = -20,
end_value = -2.2,
eval_points = 1000,
kd = kd_data2)))
The pictorial representation is also attached here.
Histogram and kernel density plot along with its 95% and 5% probability limits highlighted with red and black vertical bold lines:
Here is a possible way to fix this issue. Note, however, that the proposed method has an error in its percentile calculation, so I do not recommend using it:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
import seaborn as sns
from sklearn.neighbors import KernelDensity
%matplotlib inline
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.distributions.mixture_rvs import mixture_rvs
from scipy.stats import norm
import numpy as np
fig = plt.figure(figsize=(4, 4), dpi=300)
ax = fig.add_subplot(111)

# One univariate KDE per data set
kde = sm.nonparametric.KDEUnivariate(data8)   # plotted as "Data1"
kde1 = sm.nonparametric.KDEUnivariate(data7)  # plotted as "Data2"

# Plot the KDE for various bandwidths
for bandwidth in [1.8]:
    # Both estimators are fitted here: the original fitted only kde1, yet read
    # .support/.density/.icdf from kde as well.
    kde.fit(bw=bandwidth)
    kde1.fit(bw=bandwidth)
    ax.plot(kde.support, kde.density, "-", lw=2, color="r", zorder=10, alpha=0.6, label="Data1")
    ax.plot(kde1.support, kde1.density, "-", lw=2, color="black", zorder=10, alpha=0.6, label="Data2")
ax.set_xlim([-20, 40])
ax.set_ylim([0, 0.3])

# Probabilities calculation
quantiles_mesh = np.linspace(0, 1, len(kde.density))
fig = plt.figure(figsize=(2, 2), dpi=300)
plt.plot(quantiles_mesh, kde.icdf)
# NOTE(review): the original read kde1.icdf and an undefined `kde2` (NameError).
# Mapping the 95 % limit to Data1 and the 5 % limit to Data2 is an assumption
# taken from the variable names — confirm.
data_1_95 = np.percentile(kde.icdf, 95)
data_2_5 = np.percentile(kde1.icdf, 5)
# the limits are drawn on the first figure's Axes, next to the KDE curves
ax.axvline(x=data_1_95, color='red', linestyle='dashed', linewidth=2)
ax.axvline(x=data_2_5, color='k', linestyle='dashed', linewidth=2)
I have multiple .plx files that contain two column of numbers formatted as strings (1.plx , 2.plx...)
I managed to modify a code to load the data, convert it to floats, and plot it with the appropriate colorbar, but there are two issues I couldn't solve:
The color of the lines does not update
The lines rendering looks wrong (probably due to duplicates)
I want to try to avoid that rendering problem by plotting a numpy matrix, so I want to :
Load the data
store it in a numpy matrix (outside the loop so that I can do other data processing stuff)
create a 2D plot with the colorbar
Here is my attempt and the result:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
# every .plx file in the current working directory
IdVg = [IdVg for IdVg in os.listdir() if IdVg.endswith(".plx")]
n_lines = 20
steps = np.linspace(0.1, 50, n_lines)
norm = mpl.colors.Normalize(vmin=steps.min(), vmax=steps.max())
cmap = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.BuPu)

for k, i in enumerate(IdVg):
    x, y = np.loadtxt(i, delimiter=' ', unpack=True, skiprows=1, dtype=str)
    x = x.astype(np.float64)
    y = y.astype(np.float64)
    # One curve per file, coloured by the step value for that file. The original
    # redrew each curve once per step, so only the last colour was visible, and
    # it indexed the colormap with z+1 instead of the step value.
    # The index wraps around if there are more files than steps.
    plt.plot(x, y, c=cmap.to_rgba(steps[k % n_lines]))

plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
plt.xlabel('$V_{GS}$ (V)', fontsize=14)
plt.ylabel('$I_{DS}$ (A)', fontsize=14)
plt.tick_params(axis='both', labelsize='12')
plt.grid(True, which="both", ls="-")
# the ScalarMappable is not attached to any Axes, so tell colorbar where to go
plt.colorbar(cmap, ax=plt.gca(), ticks=steps)
Thanks !
Since you didn't provide data, I'm going to generate my own. I assume you want to obtain the following result:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
n_lines = 20
steps = np.linspace(0.1, 50, n_lines)
norm = mpl.colors.Normalize(vmin=steps.min(), vmax=steps.max())
# step values mapped to [0, 1], ready to be fed to the colormap
norm_steps = norm(steps)
cmap = mpl.cm.BuPu

x = np.linspace(0, np.pi / 2)
for i in range(n_lines):
    # synthetic family of curves: a sine scaled by the line index
    y = i / n_lines * np.sin(x)
    plt.plot(x, y, c=cmap(norm_steps[i]))

plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
plt.xlabel('$V_{GS}$ (V)', fontsize=14)
plt.ylabel('$I_{DS}$ (A)', fontsize=14)
plt.tick_params(axis='both', labelsize='12')
plt.grid(True, which="both", ls="-")
# The ScalarMappable is not attached to any Axes; recent Matplotlib releases
# require ax= (or cax=) to know where to place the colorbar.
plt.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmap), ax=plt.gca(), ticks=steps)
Obviously, you would have to change the colormap to something more readable in the lower values!
Following simple code:
import numpy as np
import seaborn as sns
dist = np.random.normal(loc=0, scale=1, size=1000)
# fill=True shades the area under the curve; it supersedes the deprecated `shade`
ax = sns.kdeplot(dist, fill=True)
Yields the following image:
I would like to shade only the area to the right (or to the left) of some x value. What is the simplest way? I am ready to use something other than Seaborn.
After calling ax = sns.kdeplot(dist, shade=True), the last line in ax.get_lines() corresponds to the kde density curve:
# NOTE(review): this relies on kdeplot adding a Line2D to the Axes; newer seaborn
# releases appear to draw only a filled polygon when the KDE is shaded — confirm
ax = sns.kdeplot(dist, shade=True)
# last line artist on the Axes
line = ax.get_lines()[-1]
You can extract the data corresponding to that curve using line.get_data:
# x and y data of the extracted line
x, y = line.get_data()
Once you have the data, you can, for instance, shade the region corresponding to x > 0 by selecting those points and calling ax.fill_between:
# keep only the part of the curve to the right of x = 0
mask = x > 0
x, y = x[mask], y[mask]
# shade that region in red, using y as the upper boundary
ax.fill_between(x, y1=y, alpha=0.5, facecolor='red')
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
dist = np.random.normal(loc=0, scale=1, size=1000)
# Draw the KDE as a plain line. A filled KDE (shade=True / fill=True) adds no
# Line2D in seaborn >= 0.11, so ax.get_lines()[-1] would raise IndexError.
ax = sns.kdeplot(dist)
line = ax.get_lines()[-1]
x, y = line.get_data()
# light fill under the whole curve, reproducing what shade=True used to draw
ax.fill_between(x, y1=y, alpha=0.25)
# stronger red fill restricted to x > 0
mask = x > 0
x, y = x[mask], y[mask]
ax.fill_between(x, y1=y, alpha=0.5, facecolor='red')
Using seaborn is often fine for standard plots, but when some customized requirements come into play, falling back to matplotlib is often easier.
So one may first calculate the kernel density estimate and then plot it in the region of interest.
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
dist = np.random.normal(loc=0, scale=1, size=1000)
kde = stats.gaussian_kde(dist)
lo, hi = dist.min(), dist.max()

# full KDE curve across the sample range, drawn as a line
pos = np.linspace(lo, hi, 101)
density = kde(pos)
plt.plot(pos, density)

# shaded KDE restricted to the region right of x = 0.5
shade = np.linspace(0.5, hi, 101)
shade_density = kde(shade)
plt.fill_between(shade, shade_density, alpha=0.5)
I'm using Matplotlib's function hist2d() and I want to unpack the output in order to further use it. Here's what I do: I simply load with numpy a 2-column file containing my data and use the following code
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np
# load the trajectory and take its first two columns as coordinates
traj = np.loadtxt('trajectory.txt')
x, y = traj[:, 0], traj[:, 1]
# hist2d draws the histogram and returns (counts, x edges, y edges, image)
M, xe, ye, img = plt.hist2d(x, y, bins=80, norm=LogNorm())
The result I get is the following:
Instead, if I try to directly plot the hist2d results without unpacking them:
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np
# load the trajectory and take its first two columns as coordinates
traj = np.loadtxt('trajectory.txt')
x, y = traj[:, 0], traj[:, 1]
# draw the 2D histogram on a log colour scale; the return value is discarded
plt.hist2d(x, y, bins=80, norm=LogNorm())
I get the whole plot without the strange blue box. What am I doing wrong?
You can create a histogram plot directly with plt.hist2d. This calculates the histogram and plots it to the current axes. There is no need to show it yet another time using imshow.
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np; np.random.seed(9)
# two independent Rayleigh samples of equal size
x = np.random.rayleigh(size=9900)
y = np.random.rayleigh(size=9900)
# compute and draw the histogram in one call, on a log colour scale
M, xe, ye, img = plt.hist2d(x, y, bins=80, norm=LogNorm())
Or, you may first calculate the histogram and afterwards plot the result as an image to the current axes. Note that the histogram produced by numpy is transposed, see Matplotlib 2D histogram seems transposed, making it necessary to call imshow(M.T). Also note that in order to obtain the correct axes labeling, you need to set the imshow's extent to the extremal values of the xe and ye edge arrays.
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np; np.random.seed(9)
# two independent Rayleigh samples of equal size
x = np.random.rayleigh(size=9900)
y = np.random.rayleigh(size=9900)
# compute the histogram only; nothing is drawn yet
M, xe, ye = np.histogram2d(x, y, bins=80)
# image extent runs from the first to the last bin edge on each axis
extent = [xe[0], xe[-1], ye[0], ye[-1]]
# transpose so the first histogram axis runs along the image's x direction
image = M.T
plt.imshow(image, extent=extent, norm=LogNorm(), origin="lower")
I'd like to make a scatter plot where each point is colored by the spatial density of nearby points.
I've come across a very similar question, which shows an example of this using R:
R Scatter Plot: symbol color represents number of overlapping points
What's the best way to accomplish something similar in python using matplotlib?
In addition to hist2d or hexbin, as @askewchan suggested, you can use the same method as the accepted answer to the question you linked to.
If you want to do that:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
# Generate fake data
# synthetic, linearly correlated sample
x = np.random.normal(size=1000)
y = 3 * x + np.random.normal(size=1000)

# evaluate a Gaussian KDE at every sample location
xy = np.vstack([x, y])
estimator = gaussian_kde(xy)
z = estimator(xy)

# colour each point by its estimated density
fig, ax = plt.subplots()
ax.scatter(x, y, c=z, s=100)
If you'd like the points to be plotted in order of density so that the densest points are always on top (similar to the linked example), just sort them by the z-values. I'm also going to use a smaller marker size here as it looks a bit better:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
# Generate fake data
# synthetic, linearly correlated sample
x = np.random.normal(size=1000)
y = 3 * x + np.random.normal(size=1000)

# evaluate a Gaussian KDE at every sample location
xy = np.vstack([x, y])
estimator = gaussian_kde(xy)
z = estimator(xy)

# reorder by ascending density, so that the densest points are drawn last
idx = z.argsort()
x = x[idx]
y = y[idx]
z = z[idx]

fig, ax = plt.subplots()
ax.scatter(x, y, c=z, s=50)
Plotting >100k data points?
The accepted answer, using gaussian_kde() will take a lot of time. On my machine, 100k rows took about 11 minutes. Here I will add two alternative methods (mpl-scatter-density and datashader) and compare the given answers with same dataset.
In the following, I used a test data set of 100k rows:
import matplotlib.pyplot as plt
import numpy as np
# Fake data for testing
# 100k-point test set: y is three times x plus standard normal noise
x = np.random.normal(size=100000)
y = 3 * x + np.random.normal(size=100000)
Output & computation time comparison
Below is a comparison of different methods.
1: mpl-scatter-density
pip install mpl-scatter-density
Example code
import mpl_scatter_density # adds projection='scatter_density'
from matplotlib.colors import LinearSegmentedColormap
# "Viridis-like" colormap with white background
# (position, colour) stops: pure white at 0, then viridis-like colours from
# just above 0 up to 1
_white_viridis_stops = [
    (0, '#ffffff'),
    (1e-20, '#440053'),
    (0.2, '#404388'),
    (0.4, '#2a788e'),
    (0.6, '#21a784'),
    (0.8, '#78d151'),
    (1, '#fde624'),
]
white_viridis = LinearSegmentedColormap.from_list('white_viridis', _white_viridis_stops, N=256)
def using_mpl_scatter_density(fig, x, y):
    """Draw x/y on a new 'scatter_density' subplot of `fig` and add a colorbar."""
    density_ax = fig.add_subplot(1, 1, 1, projection='scatter_density')
    artist = density_ax.scatter_density(x, y, cmap=white_viridis)
    fig.colorbar(artist, label='Number of points per pixel')


fig = plt.figure()
using_mpl_scatter_density(fig, x, y)
Drawing this took 0.05 seconds:
And the zoom-in looks quite nice:
2: datashader
Datashader is an interesting project. It has added support for matplotlib in datashader 0.12.
pip install datashader
Code (source & parameter listing for dsshow):
import datashader as ds
from datashader.mpl_ext import dsshow
import pandas as pd
def using_datashader(ax, x, y):
    """Render x/y as a datashader point-count image on `ax` and add a colorbar.

    NOTE(review): the dsshow(...) call was truncated in the source after the
    glyph argument; the arguments from ds.count() onwards are reconstructed —
    confirm against the datashader documentation.
    """
    df = pd.DataFrame(dict(x=x, y=y))
    dsartist = dsshow(
        df,
        ds.Point("x", "y"),
        ds.count(),  # aggregate: number of points per pixel
        norm="linear",
        aspect="auto",
        ax=ax,
    )
    plt.colorbar(dsartist)


fig, ax = plt.subplots()
using_datashader(ax, x, y)
It took 0.83 s to draw this:
There is also possibility to colorize by third variable. The third parameter for dsshow controls the coloring. See more examples here and the source for dsshow here.
3: scatter_with_gaussian_kde
def scatter_with_gaussian_kde(ax, x, y):
    """Scatter x/y on `ax`, colouring each point by its Gaussian KDE density.

    Parameters
    ----------
    ax : object
        Axes-like object providing ``scatter``.
    x, y : array-like
        Point coordinates of equal length.
    """
    # https://stackoverflow.com/a/20107592/3015186
    # Answer by Joel Kington
    xy = np.vstack([x, y])
    # density estimate evaluated at the sample points themselves
    z = gaussian_kde(xy)(xy)
    # 'none' disables marker edges; the empty string is not a valid colour
    ax.scatter(x, y, c=z, s=100, edgecolor='none')
It took 11 minutes to draw this:
4: using_hist2d
import matplotlib.pyplot as plt
def using_hist2d(ax, x, y, bins=(50, 50)):
    """Draw a 2D histogram of x/y on `ax` with the jet colormap."""
    # https://stackoverflow.com/a/20105673/3015186
    # Answer by askewchan
    colormap = plt.cm.jet
    ax.hist2d(x, y, bins, cmap=colormap)
It took 0.021 s to draw this bins=(50,50):
It took 0.173 s to draw this bins=(1000,1000):
Cons: The zoomed-in data does not look as good as with mpl-scatter-density or datashader. Also, you have to determine the number of bins yourself.
5: density_scatter
The code is as in the answer by Guillaume.
It took 0.073 s to draw this with bins=(50,50):
It took 0.368 s to draw this with bins=(1000,1000):
Also, if the number of points makes the KDE calculation too slow, the color can be interpolated from np.histogram2d [Update in response to comments: if you wish to show the colorbar, use plt.scatter() instead of ax.scatter(), followed by plt.colorbar()]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize
from scipy.interpolate import interpn
def density_scatter(x, y, ax=None, sort=True, bins=20, **kwargs):
    """Scatter plot colored by 2d histogram.

    Parameters
    ----------
    x, y : array-like
        Point coordinates of equal length.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure and Axes are created when omitted.
    sort : bool
        If true, draw the densest points last so they end up on top.
    bins : int or sequence
        Bin specification forwarded to ``np.histogram2d``.
    **kwargs
        Extra keyword arguments forwarded to ``ax.scatter``.

    Returns
    -------
    The Axes that was drawn on.
    """
    # fancy indexing below requires arrays, not plain sequences
    x = np.asarray(x)
    y = np.asarray(y)
    if ax is None:
        _, ax = plt.subplots()
    data, x_e, y_e = np.histogram2d(x, y, bins=bins, density=True)
    # interpolate the binned density from the bin centres to every point
    x_centres = 0.5 * (x_e[1:] + x_e[:-1])
    y_centres = 0.5 * (y_e[1:] + y_e[:-1])
    z = interpn(
        (x_centres, y_centres),
        data,
        np.vstack([x, y]).T,
        method="splinef2d",
        bounds_error=False,
    )

    # To be sure to plot all data: points that got NaN receive density 0
    z[np.isnan(z)] = 0.0

    # Sort the points by density, so that the densest points are plotted last
    if sort:
        idx = z.argsort()
        x, y, z = x[idx], y[idx], z[idx]

    points = ax.scatter(x, y, c=z, **kwargs)

    # Use the Axes' own figure: `fig` was undefined whenever `ax` was passed in.
    # Passing the scatter artist keeps the colorbar consistent with any
    # cmap/norm given through kwargs.
    ax.figure.colorbar(points, ax=ax)

    return ax
if __name__ == "__main__":
    # demo: 100k correlated points coloured from a 30 x 30 histogram
    x = np.random.normal(size=100000)
    y = 3 * x + np.random.normal(size=100000)
    density_scatter(x, y, bins=[30, 30])
You could make a histogram:
import numpy as np
import matplotlib.pyplot as plt
# fake data:
# synthetic sample: b is three times a plus standard normal noise
a = np.random.normal(size=1000)
noise = np.random.normal(size=1000)
b = 3 * a + noise
# 50 x 50 bins, jet colormap
plt.hist2d(a, b, bins=(50, 50), cmap=plt.cm.jet)