Related
I attempted to plot the kernel density distribution (Gaussian) curve along with the histogram plot of two data set in python.
However, in my script the estimation of 95% (data1: marked by red color vertical line) and 5% (data2: marked by black color vertical line) is very time-consuming, e.g. I need to test different limits [detail explanation in code, where I need to change the upper limited] to get the 95% and 5% probability of the kernel density curve.
May someone help out me here and suggest possible way out fixed this issue or another approach to plot the kernel density curve along with its 95% and 5% probability.
Thank you!
My script is here.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
data1 = result['95_24'] # data 1
data2 = result['5_24'] # data 2
def plot_prob_density(data1, data2, x_start1, x_end1):
fig, (ax1) = plt.subplots(1, 1, figsize=(6,5), sharey=False)
unit = 1.5
x = np.linspace(-20, 20, 1000)[:, np.newaxis]
# Hisogram plot of data
ax1.hist(data1, bins=np.linspace(-20,20,40), density=True, color='r', alpha=0.4)
ax1.hist(data2, bins=np.linspace(-20,20,40), density=True, color='k', alpha=0.4)
# kernel density estimation
kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
kd_data2 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data2)
kd_vals_data1 = np.exp(kd_data1.score_samples(x))
kd_vals_data2 = np.exp(kd_data2.score_samples(x))
# density plot
ax1.plot(x, kd_vals_data1, color='r', label='$Na$', linewidth=2)
ax1.plot(x, kd_vals_data2, color='k', label='$Λ$', linewidth = 2)
# using the function get probability)
ax1.axvline(x=x_end1,color='red',linestyle='dashed', linewidth = 3, label='$β_{95\%}$')
ax1.axvline(x=x_start1,color='k',linestyle='dashed', linewidth = 3, label='$β_{5\%}$')
# Show the plots
ax1.set_ylabel('Probability density', fontsize=12)
ax1.set_xlabel('Beta', fontsize=12)
ax1.set_xlim([-20, 20])
ax1.set_ylim(0, 0.3)
ax1.set_yticks([0, 0.1, 0.2, 0.3])
ax1.set_xticks([-20, 20, -10, 10, 0])
ax1.legend(fontsize=12, loc='upper left', frameon=False)
fig.tight_layout()
gc.collect()
return kd_data1, kd_data2,
# Calculation of 95% and 5 % for data1 and data2 Kernel density curve
def get_probability(start_value, end_value, eval_points, kd):
# Number of evaluation points
N = eval_points
step = (end_value - start_value) / (N - 1) # Step size
x = np.linspace(start_value, end_value, N)[:, np.newaxis] # Generate values in the range
kd_vals = np.exp(kd.score_samples(x)) # Get PDF values for each x
probability = np.sum(kd_vals * step) # Approximate the integral of the PDF
return probability.round(4)
data1 = np.array(data1).reshape(-1, 1)
data2 = np.array(data2).reshape(-1, 1)
kd_data1, kd_data2= plot_prob_density(data1, data2, x_start1=-2.2, x_end1=5.3)
# ##############################
print('Beta-95%: {}'
.format(get_probability(start_value = -20,
end_value = 5.3,
eval_points = 1000,
kd = kd_data1)))
# here, I modify the end-value every time and then see teh output #value, when it reached to 95% then i took taht values as 95% #confidence, however this is very confsuing, i want to compute this 95% directly and same for 5% probbaility, computed below:
print('Beta-5%: {}\n'
.format(get_probability(start_value = -20,
end_value = -2.2,
eval_points = 1000,
kd = kd_data2)))
####################################################################
plt.savefig("Ev_test.png")
The pictorial representation is also attached here.
Histogram and kernel density plot along with its 95% and 5% probability limits highlighted with red and black vertical bold lines:
Here is the possible way out to fix this issue. Additionally, the proposed method it has error in percentile calculation, therefore i recommend not to use that:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
import seaborn as sns
from sklearn.neighbors import KernelDensity
%matplotlib inline
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.distributions.mixture_rvs import mixture_rvs
from scipy.stats import norm
import numpy as np
fig = plt.figure(figsize=(4, 4), dpi=300)
ax = fig.add_subplot(111)
# Plot the histogram
ax.hist(data8,bins=20,zorder=1,color="r",density=True,alpha=0.6,)
ax.hist(data7,bins=20,zorder=1,color="black",density=True,alpha=0.6,)
# kde.fit()
kde = sm.nonparametric.KDEUnivariate(data8)
kde1 = sm.nonparametric.KDEUnivariate(data7)
# Plot the KDE for various bandwidths
for bandwidth in [1.8]:
kde.fit(bw=bandwidth)
kde1.fit(bw=bandwidth)# Estimate the densities
ax.plot(kde.support, kde.density,"-",lw=2,color="r",zorder=10, alpha=0.6, label="Data1")
ax.plot(kde1.support, kde1.density,"-",lw=2,color="black",zorder=10, alpha=0.6, label="Data2")
ax.legend(loc="best")
ax.set_xlim([-20, 40])
ax.set_ylim([0, 0.3])
ax.grid(False)
# Probabilities calculation
quantiles_mesh = np.linspace(0,1,len(kde.density))
fig = plt.figure(figsize=(2, 2), dpi=300)
plt.plot(quantiles_mesh, kde.icdf)
data_1_95= np.percentile(kde1.icdf, 95)
data_2_5= np.percentile(kde2.icdf, 5)
ax.axvline(x=data_1_95,color='red',linestyle='dashed', linewidth = 2)
ax.axvline(x=data_2_5,color='k',linestyle='dashed', linewidth = 2)
#plt.savefig("KDE_Plot.png")
I want to know what is wrong with the code.
I just want to made a fourier transform graph and change the values by sliders.
But this is what happened to my graph.
I just want to made a Parametric EQ graph interface like this, only the graph part with the sliders
Here the source code:
import matplotlib as mpl
import matplotlib.pyplot as plt
from numpy import pi, sin
import numpy as np
from matplotlib.widgets import Slider, Button, RadioButtons
import scipy.fftpack
"""import warnings
warnings.simplefilter("ignore",np.ComplexWarning)"""
# Eq many times calcule
def fermi(A1, F1, A2, F2):
peak1 = A1 * sin(2.0 * pi * F1 * x)
pFl = A2 * sin(2.0 * pi * F2 * x)
y = peak1 + pFl
yf = scipy.fftpack.fft(y)
return yf
fig = plt.figure(figsize=(5, 5))
# Create main axis
ax = fig.add_subplot(111)
ax.set_xlim([0,30])
ax.set_ylim([-2, 10])
fig.subplots_adjust(bottom=0.5, top=0.95)
# Create axes for sliders
ax_a1 = fig.add_axes([0.3, 0.10, 0.4, 0.05])
ax_a1.spines['top'].set_visible(True)
ax_a1.spines['right'].set_visible(True)
ax_f1 = fig.add_axes([0.3, 0.01, 0.4, 0.05])
ax_f1.spines['top'].set_visible(True)
ax_f1.spines['right'].set_visible(True)
ax_a2 = fig.add_axes([0.3, 0.20, 0.4, 0.05])
ax_a2.spines['top'].set_visible(True)
ax_a2.spines['right'].set_visible(True)
ax_f2 = fig.add_axes([0.3, 0.30, 0.4, 0.05])
ax_f2.spines['top'].set_visible(True)
ax_f2.spines['right'].set_visible(True)
# Create sliders
s_a1 = Slider(ax=ax_a1, label='amp1 ', valmin=-2, valmax=6, valinit=0, valfmt=' %1.1f eV', facecolor='#cc7000')
s_f1 = Slider(ax=ax_f1, label='f1 ', valmin=0, valmax=30, valinit=1.5, valfmt=' %i K', facecolor='#cc7000')
s_a2 = Slider(ax=ax_a2, label='amp2 ', valmin=-2, valmax=6, valinit=0, valfmt=' %1.1f eV', facecolor='#cc7000')
s_f2 = Slider(ax=ax_f2, label='f2 ', valmin=0, valmax=30, valinit=3.946, valfmt=' %i K', facecolor='#cc7000')
N = 10500
T = 1.0/ 800.0
# Plot default data
x = np.linspace(-0, 30, 1000)
a1_0 = 5
f1_0 = 0
a2_0 = 0
f2_0 = 0
y = fermi(a1_0, f1_0, a2_0, f2_0)
f_d, = ax.plot(x, y, linewidth=0.5, color='#000000')
# Update values
def update(val):
aa1 = s_a1.val
ff1 = s_f1.val
aa2 = s_a2.val
ff2 = s_f2.val
f_d.set_data(x.real, (fermi(aa1, ff1, aa2, ff2)).imag)
fig.canvas.draw_idle()
s_a1.on_changed(update)
s_f1.on_changed(update)
s_a2.on_changed(update)
s_f2.on_changed(update)
plt.show()
I suspect the error is in the function fermi.
The function fermi looks fine by itself. However it is important to note that the FFT results are complex numbers. Matplotlib on the other hand plots real-valued data and doesn't know what to do with complex numbers. It then issues the warning that you saw, as it throws away the imaginary part of your data before plotting.
For your specific application it looks like the (real-valued) magnitude of the FFT might be more what you're after. If you are only ever going to need the magnitude of the FFT, you could change your fermi function to only return the computed magnitude:
yf = np.abs(scipy.fftpack.fft(y))
I have a heavily right-skewed histogram and would like to calculate the probabilities for a range of Lifetimevalues (Area under the curve, the PDF). For instance, the probability that the Lifetime value is in (0-0.01)
Dataframe consisting of LTV calculated by cumulative revenue/ cumulative installs:
df['LTV'] is
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.208125,0.0558879,0.608348,0.212553,0.0865896,
0.728542,0,0.609512,0,0,0,0,0,0,0,0.0801339,0.140657,0.0194118,0,0,0.0634682,
0.339545,0.875902,0.8325,0.0260526,0.0711905,0.169894,0.202969,0.0761538,0,0.342055,
0.42781,0,0,0.192115,0,0,0,0,0,0,0,0,0,0,0,1.6473,0,0.232329,0,2.21329,0.748,0.0424286,
0.455439,0.210282,5.56453,0.427959,0,0.352059,0,0,0.567059,0,0,0,0.384462,1.29476,
0.0103125,0,0.0126923,1.03356,0,0,0.289785,0,0)
I have tried utilizing SKlearn's KernelDensity, however, after fitting it to the histogram it does not capture the over-represented 0s.
import gc
from sklearn.neighbors import KernelDensity
def plot_prob_density(df_lunch, field, x_start, x_end):
plt.figure(figsize = (10, 7))
unit = 0
x = np.linspace(df_lunch.min() - unit, df_lunch.max() + unit, 1000)[:, np.newaxis]
# Plot the data using a normalized histogram
plt.hist(df_lunch, bins=200, density=True, label='LTV', color='blue', alpha=0.2)
# Do kernel density estimation
kd_lunch = KernelDensity(kernel='gaussian', bandwidth=0.00187).fit(df_lunch) #0.00187
# Plot the estimated densty
kd_vals_lunch = np.exp(kd_lunch.score_samples(x))
plt.plot(x, kd_vals_lunch, color='orange')
plt.axvline(x=x_start,color='red',linestyle='dashed')
plt.axvline(x=x_end,color='red',linestyle='dashed')
# Show the plots
plt.xlabel(field, fontsize=15)
plt.ylabel('Probability Density', fontsize=15)
plt.legend(fontsize=15)
plt.show()
gc.collect()
return kd_lunch
kd_lunch = plot_prob_density(final_df['LTV'].values.reshape(-1,1), 'LTV', x_start=0, x_end=0.01)
Then finding the probabilities like this:
def get_probability(start_value, end_value, eval_points, kd):
# Number of evaluation points
N = eval_points
step = (end_value - start_value) / (N - 1) # Step size
x = np.linspace(start_value, end_value, N)[:, np.newaxis] # Generate values in the range
kd_vals = np.exp(kd.score_samples(x)) # Get PDF values for each x
probability = np.sum(kd_vals * step) # Approximate the integral of the PDF
return probability.round(4)
print('Probability of LTV 0-3 tips during LUNCH time: {}\n'
.format(get_probability(start_value = 0,
end_value = 0.01,
eval_points = 100,
kd = kd_lunch)))
However, this method does not yield the appropriate PDF values we were aiming for.
Any suggestions for alternative methods would be appreciated.
PLot:
I have used more or less similar script for my work, here is my script may be it will be helpful for you.
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
data1 = beta_95[0]
def plot_prob_density(data1, x_start, x_end):
plt.figure(figsize = (4, 3.5))
unit = 1.5
x = np.linspace(-20, 20, 1000)[:, np.newaxis]
# Plot the data using a normalized histogram
plt.hist(data1, bins=np.linspace(-20,20,40), density=True, color='r', alpha=0.4)
#plt.show
# Do kernel density estimation
kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
# Plot the estimated densty
kd_vals_data1 = np.exp(kd_data1.score_samples(x))
plt.plot(x, kd_vals_data1, color='r', label='$N_a$', linewidth = 2)
plt.axvline(x=9.95,color='green',linestyle='dashed', linewidth = 2.0, label='$β_o$')
plt.axvline(x=1.9,color='black',linestyle='dashed', linewidth = 2.0, label='$β_b$')
plt.axvline(x=x_end,color='red',linestyle='dashed', linewidth = 2, label='$β_{95\%}$')
# Show the plots
plt.xlabel('Beta', fontsize=10)
plt.ylabel('Probability Density', fontsize=10)
plt.title('02 hours window', fontsize=12)
plt.xlim(-20, 20)
plt.ylim(0, 0.3)
plt.yticks([0, 0.1, 0.2, 0.3])
plt.legend(fontsize=12, loc='upper left', frameon=False)
plt.show()
gc.collect()
return kd_data1
def get_probability(start_value, end_value, eval_points, kd):
# Number of evaluation points
N = eval_points
step = (end_value - start_value) / (N - 1) # Step size
x = np.linspace(start_value, end_value, N)[:, np.newaxis] # Generate values in the range
kd_vals = np.exp(kd.score_samples(x)) # Get PDF values for each x
probability = np.sum(kd_vals * step) # Approximate the integral of the PDF
return probability.round(4)
data1 = np.array(data1).reshape(-1, 1)
kd_data1 = plot_prob_density(data1, x_start=3.0, x_end=13)
print('Beta-95%: {}\n'
.format(get_probability(start_value = -10,
end_value = 13,
eval_points = 1000,
kd = kd_data1)))
I am trying to plot both a circular histogram and a vector (overlapping) on the same polar plot, but cannot get the vector to show up.
Basically, my data set consists of the times at which unitary events occur during a repeating cycle. This data is in the array "all_phases", which is just a list of degree values for each of these events. I want to show (1) the overall distribution of events w/ a circular histogram (bins corresponding to degree ranges) and (2) a vector sum as a measure of the coherence of all of these values (treating each event as a unit vector).
I can plot either one of these things individually on the subplot titled "histo", but when I try to plot both, only the histogram shows up. I have tried playing with the z-indexes of both objects to no use. The code is:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import math
array = np.array
all_phases = [array([-38.24240218]), array([-120.51570738]), array([-23.70224663]),
array([114.9540152]), array([ 2.94523445]), array([-2.16112692]), array([-18.72274284]),
array([13.2292216]), array([-95.5659992]), array([15.69046269]), array([ 51.12022047]),
array([-89.10567276]), array([ 41.77283949]), array([-9.92584921]), array([-7.59680678]),
array([166.71824996]), array([-178.94642752]), array([-23.75819463]), array([38.69481261]),
array([-52.26651244]), array([-57.40976514]), array([33.68226762]), array([-122.1818295]),
array([ 10.17007425]), array([-38.03726335]),array([44.9504975]), array([ 134.63972923]),
array([ 63.02516932]),array([-106.54049292]), array([-25.6527599])]
number_bins = 60
bin_size = 360/number_bins
cluster_num = 1
counts, theta = np.histogram(all_phases, np.arange(-180, 180 + bin_size, bin_size), density=True)
theta = theta[:-1]+ bin_size/2.
theta = theta * np.pi / 180
a_deg = map(lambda x: np.ndarray.item(x), all_phases)
a_rad = map(lambda x: math.radians(x), a_deg)
a_cos = map(lambda x: math.cos(x), a_rad)
a_sin = map(lambda x: math.sin(x), a_rad)
uv_x = sum(a_cos)/len(a_cos)
uv_y = sum(a_sin)/len(a_sin)
uv_radius = np.sqrt((uv_x*uv_x) + (uv_y*uv_y))
uv_phase = np.angle(complex(uv_x, uv_y))
"""
plot histogram and vector sum
"""
fig = plt.figure()
ax1 = fig.add_axes([0.1, 0.16, 0.05, 0.56])
histo = fig.add_subplot(111, polar=True)
histo.yaxis.set_ticks(())
histo.arrow(0,0,0.11, 1, head_width=.01, zorder=2)
plt.suptitle("Phase distribution for Neuron #" + str(cluster_num), fontsize=15, y=.94)
plt.subplots_adjust(bottom=0.12, right=0.95, top=0.78, wspace=0.4)
width = (2*np.pi) / number_bins
bars = histo.bar(theta, counts, width = width, bottom=0.002)
for r, bar in zip(counts, bars):
bar.set_facecolor(plt.cm.jet(r / max(counts)))
bar.set_alpha(0.7)
bar.set_zorder(1)
norm = matplotlib.colors.Normalize(vmin (counts.min())*len(all_phases)*bin_size, vmax=(counts.max())*len(all_phases)*bin_size)
cb1 = matplotlib.colorbar.ColorbarBase(ax1, cmap=plt.cm.jet,
orientation='vertical', norm=norm, alpha=0.4,
ticks=np.arange(0, (counts.max())*len(all_phases)*bin_size)+1, )
cb1.ax.tick_params(labelsize=9)
cb1.solids.set_rasterized(True)
cb1.set_label("# spikes")
cb1.ax.yaxis.set_label_position('left')
plt.show()
cluster_num = cluster_num + 1
vs_radius and vs_phase are the parameters for the vector sum arrow I want to put on the polar plot, which I end up calling w/ histo.arrow().
My suspicion is that it might have something to do with trying to put two things on a subplot object?
Any help or thoughts would be very much appreciated!!
The problem is that the FancyArrow that is used by Axes.arrow() does not play well with polar plots.
Instead, you could use the annotate() function to draw a simple arrow that works better in the case of polar plots.
for example:
# Compute pie slices
N = 20
theta = np.linspace(0.0, 2 * np.pi, N, endpoint=False)
radii = 10 * np.random.rand(N)
width = np.pi / 4 * np.random.rand(N)
ax = plt.subplot(111, projection='polar')
bars = ax.bar(theta, radii, width=width, bottom=0.0)
# Use custom colors and opacity
for r, bar in zip(radii, bars):
bar.set_facecolor(plt.cm.viridis(r / 10.))
bar.set_alpha(0.5)
v_angle = 0.275*np.pi
v_length = 4
ax.annotate('',xy=(v_angle, v_length), xytext=(v_angle,0), xycoords='data', arrowprops=dict(width=5, color='red'))
plt.show()
As a general rule, when you deal with polar plot, you have to work just as if you were working with a linear plot. That is to say, you shouldn't try to draw your arrow from (0,0) but rather from (uv_phase, 0)
fig, ax = plt.subplots()
bars = ax.bar(theta, radii, width=width, bottom=0.0)
# Use custom colors and opacity
for r, bar in zip(radii, bars):
bar.set_facecolor(plt.cm.viridis(r / 10.))
bar.set_alpha(0.5)
ax.annotate('',xy=(v_angle, v_length), xytext=(v_angle,0), xycoords='data', arrowprops=dict(width=5, color='red'))
I am trying to smoothen a scatter plot shown below using SciPy's B-spline representation of 1-D curve. The data is available here.
The code I used is:
import matplotlib.pyplot as plt
import numpy as np
from scipy import interpolate
data = np.genfromtxt("spline_data.dat", delimiter = '\t')
x = 1000 / data[:, 0]
y = data[:, 1]
x_int = np.linspace(x[0], x[-1], 100)
tck = interpolate.splrep(x, y, k = 3, s = 1)
y_int = interpolate.splev(x_int, tck, der = 0)
fig = plt.figure(figsize = (5.15,5.15))
plt.subplot(111)
plt.plot(x, y, marker = 'o', linestyle='')
plt.plot(x_int, y_int, linestyle = '-', linewidth = 0.75, color='k')
plt.xlabel("X")
plt.ylabel("Y")
plt.show()
I tried changing the order of the spline and the smoothing condition, but I am not getting a smooth plot.
B-spline interpolation should be able to smoothen the data but what is wrong? Any alternate method to smoothen this data?
Use a larger smoothing parameter. For example, s=1000:
tck = interpolate.splrep(x, y, k=3, s=1000)
This produces:
Assuming we are dealing with noisy observations of some phenomena, Gaussian Process Regression might also be a good choice. Knowledge about the variance of the noise can be included into the parameters (nugget) and other parameters can be found using Maximum Likelihood estimation. Here's a simple example of how it could be applied:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.gaussian_process import GaussianProcess
data = np.genfromtxt("spline_data.dat", delimiter='\t')
x = 1000 / data[:, 0]
y = data[:, 1]
x_pred = np.linspace(x[0], x[-1], 100)
# <GP regression>
gp = GaussianProcess(theta0=1, thetaL=0.00001, thetaU=1000, nugget=0.000001)
gp.fit(np.atleast_2d(x).T, y)
y_pred = gp.predict(np.atleast_2d(x_pred).T)
# </GP regression>
fig = plt.figure(figsize=(5.15, 5.15))
plt.subplot(111)
plt.plot(x, y, marker='o', linestyle='')
plt.plot(x_pred, y_pred, linestyle='-', linewidth=0.75, color='k')
plt.xlabel("X")
plt.ylabel("Y")
plt.show()
which will give:
In your specific case, you could also try changing the last argument of the np.linspace function to a smaller number, np.linspace(x[0], x[-1], 10), for example.
Demo code:
import matplotlib.pyplot as plt
import numpy as np
from scipy import interpolate
data = np.random.rand(100,2)
tempx = list(data[:, 0])
tempy = list(data[:, 1])
x = np.array(sorted([point*10 + tempx.index(point) for point in tempx]))
y = np.array([point*10 + tempy.index(point) for point in tempy])
x_int = np.linspace(x[0], x[-1], 10)
tck = interpolate.splrep(x, y, k = 3, s = 1)
y_int = interpolate.splev(x_int, tck, der = 0)
fig = plt.figure(figsize = (5.15,5.15))
plt.subplot(111)
plt.plot(x, y, marker = 'o', linestyle='')
plt.plot(x_int, y_int, linestyle = '-', linewidth = 0.75, color='k')
plt.xlabel("X")
plt.ylabel("Y")
plt.show()
You could also smooth the data with a rolling_mean in pandas:
import pandas as pd
data = [...(your data here)...]
smoothendData = pd.rolling_mean(data,5)
the second argument of rolling_mean is the moving average (rolling mean) period. You can also reverse the data 'data.reverse', take a rolling_mean of the data that way, and combine it with the forward rolling mean. Another option is exponentially weighted moving averages:
Pandas: Exponential smoothing function for column
or using bandpass filters:
fft bandpass filter in python
http://docs.scipy.org/doc/scipy/reference/signal.html