How to shade a region under a curve - python

I would like to shade a region under a curve. This is my attempt:
from scipy.stats import lognorm
import matplotlib.pyplot as plt
import numpy as np
xpoints = np.linspace(0,10,100)
plt.vlines(2, 0, lognorm.pdf(2,1), color='r', linestyles='solid')
plt.vlines(3, 0, lognorm.pdf(3,1), color='r', linestyles='solid')
plt.fill_between([2,3], [lognorm.pdf(2,1), lognorm.pdf(3,1)], color='red')
plt.plot(xpoints, lognorm.pdf(xpoints,1))
However this doesn't shade under the curve properly.
How do you do this properly?

Using where
Using the where argument of fill_between allows to select the range over which the filling should occur.
from scipy.stats import lognorm
import matplotlib.pyplot as plt
import numpy as np
x = np.linspace(0, 10, 100)
y = lognorm.pdf(x, 1)
plt.vlines(2, 0, lognorm.pdf(2, 1), color='r', linestyles='solid')
plt.vlines(3, 0, lognorm.pdf(3, 1), color='r', linestyles='solid')
plt.fill_between(x, y, where=((x >= 2) & (x <= 3)), color='red')
plt.plot(x, y)
plt.show()
A problem with this may be that the point of the original curve which is closest to the boundary chosen may still be too far away, such that gaps may occur.
Worthwhile to note that if you choose the points dense enough, or for that matter, just intelligently enough, such problems would be circumvented. I.e. using 101 points, such that 2.0 and 3.0 are actually part of the data,
x = np.linspace(0, 10, 101)
would result in a nice picture:
Plotting a refined version of the curve.
It may hence make sense to reevaluate your function on a denser grid and plot it independently.
from scipy.stats import lognorm
import matplotlib.pyplot as plt
import numpy as np
x = np.linspace(0, 10, 100)
y = lognorm.pdf(x, 1)
plt.vlines(2, 0, lognorm.pdf(2, 1), color='r', linestyles='solid')
plt.vlines(3, 0, lognorm.pdf(3, 1), color='r', linestyles='solid')
xf = np.linspace(2, 3, 301)
yf = lognorm.pdf(xf, 1)
plt.fill_between(xf, yf, color='red')
plt.plot(x, y)
plt.show()

you are filling based on 2 points only, try this instead:
plt.fill_between(xpoints[20:31], [lognorm.pdf(i,1) for i in xpoints[20:31]], color='red')

Related

How to compute the probability (e.g. 5%, 10%, 90%) of the Kernel density function?

I attempted to plot the kernel density distribution (Gaussian) curve along with the histogram plot of two data set in python.
However, in my script the estimation of 95% (data1: marked by red color vertical line) and 5% (data2: marked by black color vertical line) is very time-consuming, e.g. I need to test different limits [detail explanation in code, where I need to change the upper limited] to get the 95% and 5% probability of the kernel density curve.
May someone help out me here and suggest possible way out fixed this issue or another approach to plot the kernel density curve along with its 95% and 5% probability.
Thank you!
My script is here.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
data1 = result['95_24'] # data 1
data2 = result['5_24'] # data 2
def plot_prob_density(data1, data2, x_start1, x_end1):
fig, (ax1) = plt.subplots(1, 1, figsize=(6,5), sharey=False)
unit = 1.5
x = np.linspace(-20, 20, 1000)[:, np.newaxis]
# Hisogram plot of data
ax1.hist(data1, bins=np.linspace(-20,20,40), density=True, color='r', alpha=0.4)
ax1.hist(data2, bins=np.linspace(-20,20,40), density=True, color='k', alpha=0.4)
# kernel density estimation
kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
kd_data2 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data2)
kd_vals_data1 = np.exp(kd_data1.score_samples(x))
kd_vals_data2 = np.exp(kd_data2.score_samples(x))
# density plot
ax1.plot(x, kd_vals_data1, color='r', label='$Na$', linewidth=2)
ax1.plot(x, kd_vals_data2, color='k', label='$Λ$', linewidth = 2)
# using the function get probability)
ax1.axvline(x=x_end1,color='red',linestyle='dashed', linewidth = 3, label='$β_{95\%}$')
ax1.axvline(x=x_start1,color='k',linestyle='dashed', linewidth = 3, label='$β_{5\%}$')
# Show the plots
ax1.set_ylabel('Probability density', fontsize=12)
ax1.set_xlabel('Beta', fontsize=12)
ax1.set_xlim([-20, 20])
ax1.set_ylim(0, 0.3)
ax1.set_yticks([0, 0.1, 0.2, 0.3])
ax1.set_xticks([-20, 20, -10, 10, 0])
ax1.legend(fontsize=12, loc='upper left', frameon=False)
fig.tight_layout()
gc.collect()
return kd_data1, kd_data2,
# Calculation of 95% and 5 % for data1 and data2 Kernel density curve
def get_probability(start_value, end_value, eval_points, kd):
# Number of evaluation points
N = eval_points
step = (end_value - start_value) / (N - 1) # Step size
x = np.linspace(start_value, end_value, N)[:, np.newaxis] # Generate values in the range
kd_vals = np.exp(kd.score_samples(x)) # Get PDF values for each x
probability = np.sum(kd_vals * step) # Approximate the integral of the PDF
return probability.round(4)
data1 = np.array(data1).reshape(-1, 1)
data2 = np.array(data2).reshape(-1, 1)
kd_data1, kd_data2= plot_prob_density(data1, data2, x_start1=-2.2, x_end1=5.3)
# ##############################
print('Beta-95%: {}'
.format(get_probability(start_value = -20,
end_value = 5.3,
eval_points = 1000,
kd = kd_data1)))
# here, I modify the end-value every time and then see teh output #value, when it reached to 95% then i took taht values as 95% #confidence, however this is very confsuing, i want to compute this 95% directly and same for 5% probbaility, computed below:
print('Beta-5%: {}\n'
.format(get_probability(start_value = -20,
end_value = -2.2,
eval_points = 1000,
kd = kd_data2)))
####################################################################
plt.savefig("Ev_test.png")
The pictorial representation is also attached here.
Histogram and kernel density plot along with its 95% and 5% probability limits highlighted with red and black vertical bold lines:
Here is the possible way out to fix this issue. Additionally, the proposed method it has error in percentile calculation, therefore i recommend not to use that:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
import seaborn as sns
from sklearn.neighbors import KernelDensity
%matplotlib inline
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.distributions.mixture_rvs import mixture_rvs
from scipy.stats import norm
import numpy as np
fig = plt.figure(figsize=(4, 4), dpi=300)
ax = fig.add_subplot(111)
# Plot the histogram
ax.hist(data8,bins=20,zorder=1,color="r",density=True,alpha=0.6,)
ax.hist(data7,bins=20,zorder=1,color="black",density=True,alpha=0.6,)
# kde.fit()
kde = sm.nonparametric.KDEUnivariate(data8)
kde1 = sm.nonparametric.KDEUnivariate(data7)
# Plot the KDE for various bandwidths
for bandwidth in [1.8]:
kde.fit(bw=bandwidth)
kde1.fit(bw=bandwidth)# Estimate the densities
ax.plot(kde.support, kde.density,"-",lw=2,color="r",zorder=10, alpha=0.6, label="Data1")
ax.plot(kde1.support, kde1.density,"-",lw=2,color="black",zorder=10, alpha=0.6, label="Data2")
ax.legend(loc="best")
ax.set_xlim([-20, 40])
ax.set_ylim([0, 0.3])
ax.grid(False)
# Probabilities calculation
quantiles_mesh = np.linspace(0,1,len(kde.density))
fig = plt.figure(figsize=(2, 2), dpi=300)
plt.plot(quantiles_mesh, kde.icdf)
data_1_95= np.percentile(kde1.icdf, 95)
data_2_5= np.percentile(kde2.icdf, 5)
ax.axvline(x=data_1_95,color='red',linestyle='dashed', linewidth = 2)
ax.axvline(x=data_2_5,color='k',linestyle='dashed', linewidth = 2)
#plt.savefig("KDE_Plot.png")

Subplots won't display together + griddata() returning "nan" values

I have a 9x9 matrix in which I know 8 values, at the positions contained in array points, with corresponding values in array values. I want to interpolate the unknown values and paste the result on a 200x200 image.
I'm a beginner at python and I'm struggling with the following:
My subplots won't display together. Subplot(121) and subplot(122) should create a single image with both plots next to each other (horizontally), but instead I always get two separate images. What am I doing wrong, and why?
The code below is my attempt at using griddata() to interpolate the known values on a 200x200 grid. The result, grid_z, is however completely filled with nan values, and I don't understand why or how I can solve this.
import numpy as np
from scipy.interpolate import griddata
import matplotlib.pyplot as plt
X, Y = np.mgrid[0:1:200j, 0:1:200j]
points = np.array([(3, 8),(5, 8),(4, 6),(4, 4),(2, 4),(6, 4),(3, 0),(5, 0)])
values = [ 1.82907198, 1.69794981, 1.30089053, -0.00452952, 2.32777365, 0.69508469, 2.06540834, 2.1184028 ]
grid_z = griddata(points, values, (X, Y), method='cubic')
plt.figure
plt.subplot(121)
plt.plot(points[:,0], points[:,1], 'ro', markersize=10)
plt.xlim(0, 8)
plt.ylim(0, 8)
plt.gca().set_aspect('equal', adjustable='box')
plt.title('Original')
plt.subplots(122)
plt.imshow(grid_z.T, extent=(0,8,0,8), origin='lower')
plt.title('Cubic')
plt.gcf().set_size_inches(6, 6)
plt.show()
Any help would be greatly appreciated! I've consulted dozens of similar posts online but am unable to figure out what I'm doing wrong.
To work with subplots you need to use Matplotlib's object-oriented approach (more info here).
Regarding grid_z containing NaN values, it happens because you selected the wrong discretization domain. In the code below I used X, Y = np.mgrid[2:6:200j, 0:8:200j]: here x will start from 2 and goes to 6, and y will go from 0 to 8. Note that this range covers the known points. Outside of these known points you will get Nan.
import numpy as np
from scipy.interpolate import griddata
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
X, Y = np.mgrid[2:6:200j, 0:8:200j]
points = np.array([(3, 8),(5, 8),(4, 6),(4, 4),(2, 4),(6, 4),(3, 0),(5, 0)])
values = np.array([ 1.82907198, 1.69794981, 1.30089053, -0.00452952, 2.32777365, 0.69508469, 2.06540834, 2.1184028 ])
grid_z = griddata(points, values, (X, Y), method='linear')
f, axs = plt.subplots(1, 2, tight_layout=True)
cmap = "jet"
norm = Normalize(vmin=values.min(), vmax=values.max())
scatter = axs[0].scatter(points[:,0], points[:,1], c=values, norm=norm, cmap=cmap)
axs[0].set_xlim(0, 8)
axs[0].set_ylim(0, 8)
axs[0].set_aspect('equal', adjustable='box')
axs[0].set_title('Original')
axs[1].imshow(grid_z.T, extent=(0,8,0,8), origin='lower', cmap=cmap)
axs[1].set_title('Cubic')
f.colorbar(scatter)
f.axes[2].set_ylabel("values")
plt.show()

Frequency Distribution Plot: change x-axis to interval

Dear People of the Internet
I have calculated a frequency distribution and I would now like to plot it in a certain manner. So far I have calculated and plotted the frequency distribution, but I couldn't find a solution for the endproduct I am looking for. My code with an example dataset for now is:
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
import pandas as pd
# example data
rng = np.random.RandomState(seed=12345)
a1 = stats.norm.rvs(size=1000, random_state=rng)
res = stats.relfreq(a1, numbins=34)
x = res.lowerlimit + np.linspace(0, res.binsize*res.frequency.size, res.frequency.size)
# plotting
fig = plt.figure(figsize=(6, 3))
ax = fig.add_subplot(1, 1, 1)
ax.bar(x, res.frequency, width=res.binsize)
ax.set_title('Frequency Distribution of 1D Vix Returns')
ax.set_xlim([x.min(), x.max()])
ax.set_xticks(ax.get_xticks()[::1])
plt.show()
As a last step, I would like to plot the x-Axis just as in the attached picture. Instead of single number I would like to have the interval. I couldn't find a source in which this matter is resolved. Has anyone encountered the same problem or knows any source which has a solution to it? Thanks in advance
Have a look at this nice answer:
https://stackoverflow.com/a/6353051/10372616.
I added the code to your current plot.
import matplotlib.pyplot as plt
from scipy import stats # ????
import numpy as np
import pandas as pd # ????
# example data
rng = np.random.RandomState(seed=12345)
a1 = stats.norm.rvs(size=1000, random_state=rng)
res = stats.relfreq(a1, numbins=34)
x = res.lowerlimit + np.linspace(0, res.binsize*res.frequency.size, res.frequency.size)
# plotting
fig = plt.figure(figsize=(6, 3))
ax = fig.add_subplot(1, 1, 1)
ax.bar(x, res.frequency, width=res.binsize)
ax.set_title('Frequency Distribution of 1D Vix Returns')
ax.set_xlim([x.min(), x.max()])
ax.set_xticks(ax.get_xticks()[::1])
# Change traditional tick labels to range labels
# ----------------------------------------------------------------
ax.set_xticklabels([]) # hide your previous x tick labels
bins = ax.get_xticks()[::1]
bin_centers = 0.5 * np.diff(bins) + bins[:-1]
for a, b, x in zip(bins, bins[1:], bin_centers):
label = '{:0.0f} to {:0.0f}'.format(a, b)
ax.annotate(label, xy=(x, 0), xycoords=('data', 'axes fraction'),
xytext=(0, -10), textcoords='offset points', va='top', ha='center', rotation=90)
plt.show()
Before:
After:

Using drawstyle "steps-mid" together with x-log-scale causes step points to be non-centered

Matplotlib offers various options for the drawstyle. steps-mid does the following:
The steps variants connect the points with step-like lines, i.e. horizontal lines with vertical steps. [...]
'steps-mid': The step is halfway between the points.
This works fine when the x-scale is linear however when using a log-scale it still seems to compute the step points by averaging in data-space rather than log-space. This leads to data points not being centered between the steps.
import matplotlib.pyplot as plt
import numpy as np
x = np.logspace(0, 10, num=10)
y = np.arange(x.size) % 2
fig, ax = plt.subplots()
ax.set_xscale('log')
ax.plot(x, y, drawstyle='steps-mid', marker='s')
Is there a way to use step-like plotting together with x-log-scale such that the steps are centered between data points in log-space?
I don't know of a way other than building the steps correctly in log space yourself:
import matplotlib.pyplot as plt
import numpy as np
x = np.logspace(0, 10, num=10)
y = np.arange(x.size) % 2
def log_steps_mid(x, y, **kwargs):
x_log = np.log10(x)
x_log_mid = x_log[:-1] + np.diff(x_log)/2
x_mid = 10 ** x_log_mid
x_mid = np.hstack([x[0],
np.repeat(x_mid, 2),
x[-1]])
y_mid = np.repeat(y, 2)
ax.plot(x_mid, y_mid, **kwargs)
fig, ax = plt.subplots()
ax.set_xscale('log')
ax.plot(x, y, ls='', marker='s', color='b')
log_steps_mid(x, y, color='b')

Plotting Circular contour lines in matplotlib

I am trying to circular contour lines around an array of random values of radius. The result should be a bunch of concentric circles with different radius. However I am not too sure how to plot the theta so that for each radius, all values of theta is plotted to form a line.
import random
import numpy as np
r= sort(np.array([ random.random()*5 for i in arange(100) ]))
len(r)
theta = [t for t in linspace(0,2*pi,100)]
ax = plt.subplot(111, polar=True)
ax.plot(theta, r, 'o',color='r', linewidth=3)
ax.set_rmax(2.0)
ax.grid(True)
Thank you.
Here is a one-line addition that I think does what you want:
import random
import numpy as np
import matplotlib.pyplot as plt
r= np.sort(np.array([ random.random()*5 for i in np.arange(100) ]))
len(r)
theta = [t for t in np.linspace(0,2*np.pi,100)]
ax = plt.subplot(111, polar=True)
ax.plot(theta, r, 'o',color='r', linewidth=3)
ax.set_rmax(2.0)
ax.grid(True)
[ax.plot(theta, rcirc*np.ones(100)) for rcirc in r.max()*np.random.rand(5)]
plt.show()
A quick-and-dirty way to do it would be to use np.linspace to effectively draw a polygon (as I think you were attempting to do):
import numpy as np
from matplotlib import pyplot as plt
# some random radii
r = np.random.rand(10)
# 1000 angles linearly spaced between 0 and 2pi
t = np.linspace(0, 2 * np.pi, 1000)
# broadcast r against t to make each a (1000, 10) array
r, t = np.broadcast_arrays(r[None, :], t[:, None])
# plot the lines
fig, ax = plt.subplots(1, 1, subplot_kw={'polar':True})
ax.plot(t, r, '-')
I'm sure there must be a more elegant way to do this, though.

Categories

Resources