Calculating PDF given a histogram

Calculating PDF given a histogram - python

I have a heavily right-skewed histogram and would like to calculate the probabilities for a range of Lifetimevalues (Area under the curve, the PDF). For instance, the probability that the Lifetime value is in (0-0.01)
Dataframe consisting of LTV calculated by cumulative revenue/ cumulative installs:
df['LTV'] is
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.208125,0.0558879,0.608348,0.212553,0.0865896,
0.728542,0,0.609512,0,0,0,0,0,0,0,0.0801339,0.140657,0.0194118,0,0,0.0634682,
0.339545,0.875902,0.8325,0.0260526,0.0711905,0.169894,0.202969,0.0761538,0,0.342055,
0.42781,0,0,0.192115,0,0,0,0,0,0,0,0,0,0,0,1.6473,0,0.232329,0,2.21329,0.748,0.0424286,
0.455439,0.210282,5.56453,0.427959,0,0.352059,0,0,0.567059,0,0,0,0.384462,1.29476,
0.0103125,0,0.0126923,1.03356,0,0,0.289785,0,0)
I have tried utilizing SKlearn's KernelDensity, however, after fitting it to the histogram it does not capture the over-represented 0s.
import gc
from sklearn.neighbors import KernelDensity
def plot_prob_density(df_lunch, field, x_start, x_end):
plt.figure(figsize = (10, 7))
unit = 0
x = np.linspace(df_lunch.min() - unit, df_lunch.max() + unit, 1000)[:, np.newaxis]
# Plot the data using a normalized histogram
plt.hist(df_lunch, bins=200, density=True, label='LTV', color='blue', alpha=0.2)
# Do kernel density estimation
kd_lunch = KernelDensity(kernel='gaussian', bandwidth=0.00187).fit(df_lunch) #0.00187
# Plot the estimated densty
kd_vals_lunch = np.exp(kd_lunch.score_samples(x))
plt.plot(x, kd_vals_lunch, color='orange')
plt.axvline(x=x_start,color='red',linestyle='dashed')
plt.axvline(x=x_end,color='red',linestyle='dashed')
# Show the plots
plt.xlabel(field, fontsize=15)
plt.ylabel('Probability Density', fontsize=15)
plt.legend(fontsize=15)
plt.show()
gc.collect()
return kd_lunch
kd_lunch = plot_prob_density(final_df['LTV'].values.reshape(-1,1), 'LTV', x_start=0, x_end=0.01)
Then finding the probabilities like this:
def get_probability(start_value, end_value, eval_points, kd):
# Number of evaluation points
N = eval_points
step = (end_value - start_value) / (N - 1) # Step size
x = np.linspace(start_value, end_value, N)[:, np.newaxis] # Generate values in the range
kd_vals = np.exp(kd.score_samples(x)) # Get PDF values for each x
probability = np.sum(kd_vals * step) # Approximate the integral of the PDF
return probability.round(4)
print('Probability of LTV 0-3 tips during LUNCH time: {}\n'
.format(get_probability(start_value = 0,
end_value = 0.01,
eval_points = 100,
kd = kd_lunch)))
However, this method does not yield the appropriate PDF values we were aiming for.
Any suggestions for alternative methods would be appreciated.
PLot:

I have used more or less similar script for my work, here is my script may be it will be helpful for you.
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
data1 = beta_95[0]
def plot_prob_density(data1, x_start, x_end):
plt.figure(figsize = (4, 3.5))
unit = 1.5
x = np.linspace(-20, 20, 1000)[:, np.newaxis]
# Plot the data using a normalized histogram
plt.hist(data1, bins=np.linspace(-20,20,40), density=True, color='r', alpha=0.4)
#plt.show
# Do kernel density estimation
kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
# Plot the estimated densty
kd_vals_data1 = np.exp(kd_data1.score_samples(x))
plt.plot(x, kd_vals_data1, color='r', label='$N_a$', linewidth = 2)
plt.axvline(x=9.95,color='green',linestyle='dashed', linewidth = 2.0, label='$β_o$')
plt.axvline(x=1.9,color='black',linestyle='dashed', linewidth = 2.0, label='$β_b$')
plt.axvline(x=x_end,color='red',linestyle='dashed', linewidth = 2, label='$β_{95\%}$')
# Show the plots
plt.xlabel('Beta', fontsize=10)
plt.ylabel('Probability Density', fontsize=10)
plt.title('02 hours window', fontsize=12)
plt.xlim(-20, 20)
plt.ylim(0, 0.3)
plt.yticks([0, 0.1, 0.2, 0.3])
plt.legend(fontsize=12, loc='upper left', frameon=False)
plt.show()
gc.collect()
return kd_data1
def get_probability(start_value, end_value, eval_points, kd):
# Number of evaluation points
N = eval_points
step = (end_value - start_value) / (N - 1) # Step size
x = np.linspace(start_value, end_value, N)[:, np.newaxis] # Generate values in the range
kd_vals = np.exp(kd.score_samples(x)) # Get PDF values for each x
probability = np.sum(kd_vals * step) # Approximate the integral of the PDF
return probability.round(4)
data1 = np.array(data1).reshape(-1, 1)
kd_data1 = plot_prob_density(data1, x_start=3.0, x_end=13)
print('Beta-95%: {}\n'
.format(get_probability(start_value = -10,
end_value = 13,
eval_points = 1000,
kd = kd_data1)))

Related

How to compute the probability (e.g. 5%, 10%, 90%) of the Kernel density function?

I attempted to plot the kernel density distribution (Gaussian) curve along with the histogram plot of two data set in python.
However, in my script the estimation of 95% (data1: marked by red color vertical line) and 5% (data2: marked by black color vertical line) is very time-consuming, e.g. I need to test different limits [detail explanation in code, where I need to change the upper limited] to get the 95% and 5% probability of the kernel density curve.
May someone help out me here and suggest possible way out fixed this issue or another approach to plot the kernel density curve along with its 95% and 5% probability.
Thank you!
My script is here.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
data1 = result['95_24'] # data 1
data2 = result['5_24'] # data 2
def plot_prob_density(data1, data2, x_start1, x_end1):
fig, (ax1) = plt.subplots(1, 1, figsize=(6,5), sharey=False)
unit = 1.5
x = np.linspace(-20, 20, 1000)[:, np.newaxis]
# Hisogram plot of data
ax1.hist(data1, bins=np.linspace(-20,20,40), density=True, color='r', alpha=0.4)
ax1.hist(data2, bins=np.linspace(-20,20,40), density=True, color='k', alpha=0.4)
# kernel density estimation
kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
kd_data2 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data2)
kd_vals_data1 = np.exp(kd_data1.score_samples(x))
kd_vals_data2 = np.exp(kd_data2.score_samples(x))
# density plot
ax1.plot(x, kd_vals_data1, color='r', label='$Na$', linewidth=2)
ax1.plot(x, kd_vals_data2, color='k', label='$Λ$', linewidth = 2)
# using the function get probability)
ax1.axvline(x=x_end1,color='red',linestyle='dashed', linewidth = 3, label='$β_{95\%}$')
ax1.axvline(x=x_start1,color='k',linestyle='dashed', linewidth = 3, label='$β_{5\%}$')
# Show the plots
ax1.set_ylabel('Probability density', fontsize=12)
ax1.set_xlabel('Beta', fontsize=12)
ax1.set_xlim([-20, 20])
ax1.set_ylim(0, 0.3)
ax1.set_yticks([0, 0.1, 0.2, 0.3])
ax1.set_xticks([-20, 20, -10, 10, 0])
ax1.legend(fontsize=12, loc='upper left', frameon=False)
fig.tight_layout()
gc.collect()
return kd_data1, kd_data2,
# Calculation of 95% and 5 % for data1 and data2 Kernel density curve
def get_probability(start_value, end_value, eval_points, kd):
# Number of evaluation points
N = eval_points
step = (end_value - start_value) / (N - 1) # Step size
x = np.linspace(start_value, end_value, N)[:, np.newaxis] # Generate values in the range
kd_vals = np.exp(kd.score_samples(x)) # Get PDF values for each x
probability = np.sum(kd_vals * step) # Approximate the integral of the PDF
return probability.round(4)
data1 = np.array(data1).reshape(-1, 1)
data2 = np.array(data2).reshape(-1, 1)
kd_data1, kd_data2= plot_prob_density(data1, data2, x_start1=-2.2, x_end1=5.3)
# ##############################
print('Beta-95%: {}'
.format(get_probability(start_value = -20,
end_value = 5.3,
eval_points = 1000,
kd = kd_data1)))
# here, I modify the end-value every time and then see teh output #value, when it reached to 95% then i took taht values as 95% #confidence, however this is very confsuing, i want to compute this 95% directly and same for 5% probbaility, computed below:
print('Beta-5%: {}\n'
.format(get_probability(start_value = -20,
end_value = -2.2,
eval_points = 1000,
kd = kd_data2)))
####################################################################
plt.savefig("Ev_test.png")
The pictorial representation is also attached here.
Histogram and kernel density plot along with its 95% and 5% probability limits highlighted with red and black vertical bold lines:

Here is the possible way out to fix this issue. Additionally, the proposed method it has error in percentile calculation, therefore i recommend not to use that:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
import seaborn as sns
from sklearn.neighbors import KernelDensity
%matplotlib inline
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.distributions.mixture_rvs import mixture_rvs
from scipy.stats import norm
import numpy as np
fig = plt.figure(figsize=(4, 4), dpi=300)
ax = fig.add_subplot(111)
# Plot the histogram
ax.hist(data8,bins=20,zorder=1,color="r",density=True,alpha=0.6,)
ax.hist(data7,bins=20,zorder=1,color="black",density=True,alpha=0.6,)
# kde.fit()
kde = sm.nonparametric.KDEUnivariate(data8)
kde1 = sm.nonparametric.KDEUnivariate(data7)
# Plot the KDE for various bandwidths
for bandwidth in [1.8]:
kde.fit(bw=bandwidth)
kde1.fit(bw=bandwidth)# Estimate the densities
ax.plot(kde.support, kde.density,"-",lw=2,color="r",zorder=10, alpha=0.6, label="Data1")
ax.plot(kde1.support, kde1.density,"-",lw=2,color="black",zorder=10, alpha=0.6, label="Data2")
ax.legend(loc="best")
ax.set_xlim([-20, 40])
ax.set_ylim([0, 0.3])
ax.grid(False)
# Probabilities calculation
quantiles_mesh = np.linspace(0,1,len(kde.density))
fig = plt.figure(figsize=(2, 2), dpi=300)
plt.plot(quantiles_mesh, kde.icdf)
data_1_95= np.percentile(kde1.icdf, 95)
data_2_5= np.percentile(kde2.icdf, 5)
ax.axvline(x=data_1_95,color='red',linestyle='dashed', linewidth = 2)
ax.axvline(x=data_2_5,color='k',linestyle='dashed', linewidth = 2)
#plt.savefig("KDE_Plot.png")

How to fill intervals under KDE curve with different colors

I am looking for a way to color the intervals below the curve with different colors; on the interval x < 0, I would like to fill the area under the curve with one color and on the interval x >= 0 with another color, like the following image:
This is the code for basic kde plot:
fig, (ax1) = plt.subplots(1, 1, figsize = ((plot_size + 1.5) * 1,(plot_size + 1.5)))
sns.kdeplot(data=pd.DataFrame(w_contrast, columns=['contrast']), x="contrast", ax=ax1);
ax1.set_xlabel(f"Dry Yield Posterior Contrast (kg)");
Is there a way to fill the area under the curve with different colors using seaborn?

seaborn is a high level api for matplotlib, so the curve will have to be calculated; similar to, but simpler than this answer.
Calculate the values for the kde curve with scipy.stats.gaussian_kde
Use matplotlib.pyplot.fill_between to fill the areas.
Use scipy.integrate.simpson to calculate the area under the curve, which will be passed to matplotlib.pyplot.annotate to annotate.
import seaborn as sns
from scipy.stats import gaussian_kde
from scipy.integrate import simps
import numpy as np
# load sample data
df = sns.load_dataset('planets')
# create the kde model
kde = gaussian_kde(df.mass.dropna())
# plot
fig, ax = plt.subplots(figsize=(9, 6))
g = sns.kdeplot(data=df.mass, ax=ax, c='k')
# remove margins; optional
g.margins(x=0, y=0)
# get the min and max of the x-axis
xmin, xmax = g.get_xlim()
# create points between the min and max
x = np.linspace(xmin, xmax, 1000)
# calculate the y values from the model
kde_y = kde(x)
# select x values below 0
x0 = x[x < 0]
# get the len, which will be used for slicing the other arrays
x0_len = len(x0)
# slice the arrays
y0 = kde_y[:x0_len]
x1 = x[x0_len:]
y1 = kde_y[x0_len:]
# calculate the area under the curves
area0 = np.round(simps(y0, x0, dx=1) * 100, 0)
area1 = np.round(simps(y1, x1, dx=1) * 100, 0)
# fill the areas
g.fill_between(x=x0, y1=y0, color='r', alpha=.5)
g.fill_between(x=x1, y1=y1, color='b', alpha=.5)
# annotate
g.annotate(f'{area0:.0f}%', xy=(-1, 0.075), xytext=(10, 0.150), arrowprops=dict(arrowstyle="->", color='r', alpha=.5))
g.annotate(f'{area1:.0f}%', xy=(1, 0.05), xytext=(10, 0.125), arrowprops=dict(arrowstyle="->", color='b', alpha=.5))

plot power spectrum of a signal with wavelet transform

Using this dataset https://philharmonia.co.uk/resources/sound-samples/ I'm trying to plot the power spectrum of note played by a specific instrument.
I'm using librosa to load the audio file and get some information with this code
import librosa
import numpy as np
import matplotlib.pyplot as plt
import pywt
y, sr = librosa.load(file_path)
duration = librosa.get_duration(y=y, sr=sr)
delta_t = duration / len(y)
t0=0
time = np.arange(0, len(y)) * delta_t + t0
I'm also following this https://ataspinar.com/2018/12/21/a-guide-for-using-the-wavelet-transform-in-machine-learning/ guide to plot the power spectrum and I'm using pywavelet library.
The problem that I have with this code is RuntimeWarning: divide by zero encountered in log2 and the plot is not shown.
def plot_wavelet(time, signal, scales,
waveletname = 'cmor',
cmap = plt.cm.seismic,
title = 'Wavelet Transform (Power Spectrum) of signal',
ylabel = 'Period (years)',
xlabel = 'Time'):
dt = time[1] - time[0]
print("dt ", dt)
[coefficients, frequencies] = pywt.cwt(signal, scales, waveletname, dt)
power = (abs(coefficients)) ** 2
period = 1. / frequencies
levels = [0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8]
contourlevels = np.log2(levels)
fig, ax = plt.subplots(figsize=(15, 10))
im = ax.contourf(time, np.log2(period), np.log2(power), contourlevels, extend='both',cmap=cmap)
ax.set_title(title, fontsize=20)
ax.set_ylabel(ylabel, fontsize=18)
ax.set_xlabel(xlabel, fontsize=18)
yticks = 2**np.arange(np.ceil(np.log2(period.min())), np.ceil(np.log2(period.max())))
ax.set_yticks(np.log2(yticks))
ax.set_yticklabels(yticks)
ax.invert_yaxis()
ylim = ax.get_ylim()
ax.set_ylim(ylim[0], -1)
cbar_ax = fig.add_axes([0.95, 0.5, 0.03, 0.25])
fig.colorbar(im, cax=cbar_ax, orientation="vertical")
plt.show()
scales = np.arange(1, 128)
plot_wavelet(time=time, signal=y, scales=scales, waveletname='gaus5')
Noting that some values in the power array are at -inf.
How can I solve?

What does `log = True` actually do in matplotlib?

My MCVE:
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
#normal sample
mu, sigma = 100, 15
x = mu + sigma*np.random.randn(10000)
#histogram
n, bins, patches = plt.hist(x, 50)
plt.axis([40, 160, 0, 800])
n2, bins2, patches2 = plt.hist(x,50,log=True)
n1 = np.log(n)
now why n2 is different from n1? I thought that the log = True scaled everything logarithmically.. but it doesn't. So what is it doing? Same thing happens for bins2 and bins1 = np.log(bins).
edit
This
mu, sigma = 100, 15
x = mu + sigma*np.random.randn(10000)
hist, bin_edges = np.histogram(x,50)
nwidth = bin_edges[7]-bin_edges[6] #just difference between two random bins
plt.bar(np.delete(bin_edges,len(bin_edges)-1),hist,nwidth) #so I have the right number of bins
plt.show()
gives me
then by doing
histsq = np.sqrt(np.log(hist))
plt.bar(np.delete(bin_edges,len(bin_edges)-1),histsq,nwidth)
plt.show()
I obtain

When you set log = True, the histogram axis (not return parameters) is in log scale. The return parameters (n,bins), ie the values of the bins and the edges of the bins are the same for log = True and log = False. What that means is n==n2 and bins == bins2 are both True.
See the code below to see if that is true
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
#normal sample
mu, sigma = 100, 15
x = mu + sigma*np.random.randn(10000)
#histogram
plt.subplot(3, 1, 1)
n, bins, patches = plt.hist(x, 50)
# plt.axis([40, 160, 0, 800])
plt.subplot(3, 1, 2)
n2, bins2, patches2 = plt.hist(x,50,log=True)
if (all(n==n2) and all(bins==bins2)):
print 'Return parameters of hist with log option or without log option are always the same'
#if you want to apply two transformation on the return data and visualize it.
n1 = np.log(n)
plt.subplot(3, 1, 3)
xx = bins[1:]
yy = np.sqrt(n1)
plt.bar(xx,yy, width = xx[1]-xx[0])
# square root of inverted parabola is not a linear line, but has some curvature
x = np.arange(-1,1.1,.1)
y = 10-(x)**2 # inverted parabola
y1 = np.sqrt(y) # square of inverted parabola
fig, ax1 = plt.subplots()
ax1.plot(x, y, 'b-')
# Make the y-axis label, ticks and tick labels match the line color.
ax1.set_ylabel('inverted parabola', color='b')
ax1.tick_params('y', colors='b')
ax2 = ax1.twinx()
ax2.plot(x, y1, 'r.')
ax2.set_ylabel('sqrt of inverted parabola', color='r')
ax2.tick_params('y', colors='r')
fig.tight_layout()
plt.show()
will return
Return parameters of hist method with log option or without log option are always the same

How to smoothen data in Python?

I am trying to smoothen a scatter plot shown below using SciPy's B-spline representation of 1-D curve. The data is available here.
The code I used is:
import matplotlib.pyplot as plt
import numpy as np
from scipy import interpolate
data = np.genfromtxt("spline_data.dat", delimiter = '\t')
x = 1000 / data[:, 0]
y = data[:, 1]
x_int = np.linspace(x[0], x[-1], 100)
tck = interpolate.splrep(x, y, k = 3, s = 1)
y_int = interpolate.splev(x_int, tck, der = 0)
fig = plt.figure(figsize = (5.15,5.15))
plt.subplot(111)
plt.plot(x, y, marker = 'o', linestyle='')
plt.plot(x_int, y_int, linestyle = '-', linewidth = 0.75, color='k')
plt.xlabel("X")
plt.ylabel("Y")
plt.show()
I tried changing the order of the spline and the smoothing condition, but I am not getting a smooth plot.
B-spline interpolation should be able to smoothen the data but what is wrong? Any alternate method to smoothen this data?

Use a larger smoothing parameter. For example, s=1000:
tck = interpolate.splrep(x, y, k=3, s=1000)
This produces:

Assuming we are dealing with noisy observations of some phenomena, Gaussian Process Regression might also be a good choice. Knowledge about the variance of the noise can be included into the parameters (nugget) and other parameters can be found using Maximum Likelihood estimation. Here's a simple example of how it could be applied:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.gaussian_process import GaussianProcess
data = np.genfromtxt("spline_data.dat", delimiter='\t')
x = 1000 / data[:, 0]
y = data[:, 1]
x_pred = np.linspace(x[0], x[-1], 100)
# <GP regression>
gp = GaussianProcess(theta0=1, thetaL=0.00001, thetaU=1000, nugget=0.000001)
gp.fit(np.atleast_2d(x).T, y)
y_pred = gp.predict(np.atleast_2d(x_pred).T)
# </GP regression>
fig = plt.figure(figsize=(5.15, 5.15))
plt.subplot(111)
plt.plot(x, y, marker='o', linestyle='')
plt.plot(x_pred, y_pred, linestyle='-', linewidth=0.75, color='k')
plt.xlabel("X")
plt.ylabel("Y")
plt.show()
which will give:

In your specific case, you could also try changing the last argument of the np.linspace function to a smaller number, np.linspace(x[0], x[-1], 10), for example.
Demo code:
import matplotlib.pyplot as plt
import numpy as np
from scipy import interpolate
data = np.random.rand(100,2)
tempx = list(data[:, 0])
tempy = list(data[:, 1])
x = np.array(sorted([point*10 + tempx.index(point) for point in tempx]))
y = np.array([point*10 + tempy.index(point) for point in tempy])
x_int = np.linspace(x[0], x[-1], 10)
tck = interpolate.splrep(x, y, k = 3, s = 1)
y_int = interpolate.splev(x_int, tck, der = 0)
fig = plt.figure(figsize = (5.15,5.15))
plt.subplot(111)
plt.plot(x, y, marker = 'o', linestyle='')
plt.plot(x_int, y_int, linestyle = '-', linewidth = 0.75, color='k')
plt.xlabel("X")
plt.ylabel("Y")
plt.show()
You could also smooth the data with a rolling_mean in pandas:
import pandas as pd
data = [...(your data here)...]
smoothendData = pd.rolling_mean(data,5)
the second argument of rolling_mean is the moving average (rolling mean) period. You can also reverse the data 'data.reverse', take a rolling_mean of the data that way, and combine it with the forward rolling mean. Another option is exponentially weighted moving averages:
Pandas: Exponential smoothing function for column
or using bandpass filters:
fft bandpass filter in python
http://docs.scipy.org/doc/scipy/reference/signal.html

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Calculating PDF given a histogram - python

Related

How to compute the probability (e.g. 5%, 10%, 90%) of the Kernel density function?

How to fill intervals under KDE curve with different colors

plot power spectrum of a signal with wavelet transform

What does `log = True` actually do in matplotlib?

How to smoothen data in Python?

Categories

Resources