Related
I have a heavily right-skewed histogram and would like to calculate the probabilities for a range of Lifetimevalues (Area under the curve, the PDF). For instance, the probability that the Lifetime value is in (0-0.01)
Dataframe consisting of LTV calculated by cumulative revenue/ cumulative installs:
df['LTV'] is
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.208125,0.0558879,0.608348,0.212553,0.0865896,
0.728542,0,0.609512,0,0,0,0,0,0,0,0.0801339,0.140657,0.0194118,0,0,0.0634682,
0.339545,0.875902,0.8325,0.0260526,0.0711905,0.169894,0.202969,0.0761538,0,0.342055,
0.42781,0,0,0.192115,0,0,0,0,0,0,0,0,0,0,0,1.6473,0,0.232329,0,2.21329,0.748,0.0424286,
0.455439,0.210282,5.56453,0.427959,0,0.352059,0,0,0.567059,0,0,0,0.384462,1.29476,
0.0103125,0,0.0126923,1.03356,0,0,0.289785,0,0)
I have tried utilizing SKlearn's KernelDensity, however, after fitting it to the histogram it does not capture the over-represented 0s.
import gc
from sklearn.neighbors import KernelDensity
def plot_prob_density(df_lunch, field, x_start, x_end):
plt.figure(figsize = (10, 7))
unit = 0
x = np.linspace(df_lunch.min() - unit, df_lunch.max() + unit, 1000)[:, np.newaxis]
# Plot the data using a normalized histogram
plt.hist(df_lunch, bins=200, density=True, label='LTV', color='blue', alpha=0.2)
# Do kernel density estimation
kd_lunch = KernelDensity(kernel='gaussian', bandwidth=0.00187).fit(df_lunch) #0.00187
# Plot the estimated densty
kd_vals_lunch = np.exp(kd_lunch.score_samples(x))
plt.plot(x, kd_vals_lunch, color='orange')
plt.axvline(x=x_start,color='red',linestyle='dashed')
plt.axvline(x=x_end,color='red',linestyle='dashed')
# Show the plots
plt.xlabel(field, fontsize=15)
plt.ylabel('Probability Density', fontsize=15)
plt.legend(fontsize=15)
plt.show()
gc.collect()
return kd_lunch
kd_lunch = plot_prob_density(final_df['LTV'].values.reshape(-1,1), 'LTV', x_start=0, x_end=0.01)
Then finding the probabilities like this:
def get_probability(start_value, end_value, eval_points, kd):
# Number of evaluation points
N = eval_points
step = (end_value - start_value) / (N - 1) # Step size
x = np.linspace(start_value, end_value, N)[:, np.newaxis] # Generate values in the range
kd_vals = np.exp(kd.score_samples(x)) # Get PDF values for each x
probability = np.sum(kd_vals * step) # Approximate the integral of the PDF
return probability.round(4)
print('Probability of LTV 0-3 tips during LUNCH time: {}\n'
.format(get_probability(start_value = 0,
end_value = 0.01,
eval_points = 100,
kd = kd_lunch)))
However, this method does not yield the appropriate PDF values we were aiming for.
Any suggestions for alternative methods would be appreciated.
PLot:
I have used more or less similar script for my work, here is my script may be it will be helpful for you.
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
data1 = beta_95[0]
def plot_prob_density(data1, x_start, x_end):
plt.figure(figsize = (4, 3.5))
unit = 1.5
x = np.linspace(-20, 20, 1000)[:, np.newaxis]
# Plot the data using a normalized histogram
plt.hist(data1, bins=np.linspace(-20,20,40), density=True, color='r', alpha=0.4)
#plt.show
# Do kernel density estimation
kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
# Plot the estimated densty
kd_vals_data1 = np.exp(kd_data1.score_samples(x))
plt.plot(x, kd_vals_data1, color='r', label='$N_a$', linewidth = 2)
plt.axvline(x=9.95,color='green',linestyle='dashed', linewidth = 2.0, label='$β_o$')
plt.axvline(x=1.9,color='black',linestyle='dashed', linewidth = 2.0, label='$β_b$')
plt.axvline(x=x_end,color='red',linestyle='dashed', linewidth = 2, label='$β_{95\%}$')
# Show the plots
plt.xlabel('Beta', fontsize=10)
plt.ylabel('Probability Density', fontsize=10)
plt.title('02 hours window', fontsize=12)
plt.xlim(-20, 20)
plt.ylim(0, 0.3)
plt.yticks([0, 0.1, 0.2, 0.3])
plt.legend(fontsize=12, loc='upper left', frameon=False)
plt.show()
gc.collect()
return kd_data1
def get_probability(start_value, end_value, eval_points, kd):
# Number of evaluation points
N = eval_points
step = (end_value - start_value) / (N - 1) # Step size
x = np.linspace(start_value, end_value, N)[:, np.newaxis] # Generate values in the range
kd_vals = np.exp(kd.score_samples(x)) # Get PDF values for each x
probability = np.sum(kd_vals * step) # Approximate the integral of the PDF
return probability.round(4)
data1 = np.array(data1).reshape(-1, 1)
kd_data1 = plot_prob_density(data1, x_start=3.0, x_end=13)
print('Beta-95%: {}\n'
.format(get_probability(start_value = -10,
end_value = 13,
eval_points = 1000,
kd = kd_data1)))
I am trying to plot both a circular histogram and a vector (overlapping) on the same polar plot, but cannot get the vector to show up.
Basically, my data set consists of the times at which unitary events occur during a repeating cycle. This data is in the array "all_phases", which is just a list of degree values for each of these events. I want to show (1) the overall distribution of events w/ a circular histogram (bins corresponding to degree ranges) and (2) a vector sum as a measure of the coherence of all of these values (treating each event as a unit vector).
I can plot either one of these things individually on the subplot titled "histo", but when I try to plot both, only the histogram shows up. I have tried playing with the z-indexes of both objects to no use. The code is:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import math
array = np.array
all_phases = [array([-38.24240218]), array([-120.51570738]), array([-23.70224663]),
array([114.9540152]), array([ 2.94523445]), array([-2.16112692]), array([-18.72274284]),
array([13.2292216]), array([-95.5659992]), array([15.69046269]), array([ 51.12022047]),
array([-89.10567276]), array([ 41.77283949]), array([-9.92584921]), array([-7.59680678]),
array([166.71824996]), array([-178.94642752]), array([-23.75819463]), array([38.69481261]),
array([-52.26651244]), array([-57.40976514]), array([33.68226762]), array([-122.1818295]),
array([ 10.17007425]), array([-38.03726335]),array([44.9504975]), array([ 134.63972923]),
array([ 63.02516932]),array([-106.54049292]), array([-25.6527599])]
number_bins = 60
bin_size = 360/number_bins
cluster_num = 1
counts, theta = np.histogram(all_phases, np.arange(-180, 180 + bin_size, bin_size), density=True)
theta = theta[:-1]+ bin_size/2.
theta = theta * np.pi / 180
a_deg = map(lambda x: np.ndarray.item(x), all_phases)
a_rad = map(lambda x: math.radians(x), a_deg)
a_cos = map(lambda x: math.cos(x), a_rad)
a_sin = map(lambda x: math.sin(x), a_rad)
uv_x = sum(a_cos)/len(a_cos)
uv_y = sum(a_sin)/len(a_sin)
uv_radius = np.sqrt((uv_x*uv_x) + (uv_y*uv_y))
uv_phase = np.angle(complex(uv_x, uv_y))
"""
plot histogram and vector sum
"""
fig = plt.figure()
ax1 = fig.add_axes([0.1, 0.16, 0.05, 0.56])
histo = fig.add_subplot(111, polar=True)
histo.yaxis.set_ticks(())
histo.arrow(0,0,0.11, 1, head_width=.01, zorder=2)
plt.suptitle("Phase distribution for Neuron #" + str(cluster_num), fontsize=15, y=.94)
plt.subplots_adjust(bottom=0.12, right=0.95, top=0.78, wspace=0.4)
width = (2*np.pi) / number_bins
bars = histo.bar(theta, counts, width = width, bottom=0.002)
for r, bar in zip(counts, bars):
bar.set_facecolor(plt.cm.jet(r / max(counts)))
bar.set_alpha(0.7)
bar.set_zorder(1)
norm = matplotlib.colors.Normalize(vmin (counts.min())*len(all_phases)*bin_size, vmax=(counts.max())*len(all_phases)*bin_size)
cb1 = matplotlib.colorbar.ColorbarBase(ax1, cmap=plt.cm.jet,
orientation='vertical', norm=norm, alpha=0.4,
ticks=np.arange(0, (counts.max())*len(all_phases)*bin_size)+1, )
cb1.ax.tick_params(labelsize=9)
cb1.solids.set_rasterized(True)
cb1.set_label("# spikes")
cb1.ax.yaxis.set_label_position('left')
plt.show()
cluster_num = cluster_num + 1
vs_radius and vs_phase are the parameters for the vector sum arrow I want to put on the polar plot, which I end up calling w/ histo.arrow().
My suspicion is that it might have something to do with trying to put two things on a subplot object?
Any help or thoughts would be very much appreciated!!
The problem is that the FancyArrow that is used by Axes.arrow() does not play well with polar plots.
Instead, you could use the annotate() function to draw a simple arrow that works better in the case of polar plots.
for example:
# Compute pie slices
N = 20
theta = np.linspace(0.0, 2 * np.pi, N, endpoint=False)
radii = 10 * np.random.rand(N)
width = np.pi / 4 * np.random.rand(N)
ax = plt.subplot(111, projection='polar')
bars = ax.bar(theta, radii, width=width, bottom=0.0)
# Use custom colors and opacity
for r, bar in zip(radii, bars):
bar.set_facecolor(plt.cm.viridis(r / 10.))
bar.set_alpha(0.5)
v_angle = 0.275*np.pi
v_length = 4
ax.annotate('',xy=(v_angle, v_length), xytext=(v_angle,0), xycoords='data', arrowprops=dict(width=5, color='red'))
plt.show()
As a general rule, when you deal with polar plot, you have to work just as if you were working with a linear plot. That is to say, you shouldn't try to draw your arrow from (0,0) but rather from (uv_phase, 0)
fig, ax = plt.subplots()
bars = ax.bar(theta, radii, width=width, bottom=0.0)
# Use custom colors and opacity
for r, bar in zip(radii, bars):
bar.set_facecolor(plt.cm.viridis(r / 10.))
bar.set_alpha(0.5)
ax.annotate('',xy=(v_angle, v_length), xytext=(v_angle,0), xycoords='data', arrowprops=dict(width=5, color='red'))
Hello I have come across a problem where I need to generate dataset from a distribution given on a scatter plot where datapoints are mostly centred around the centre of the circle and also surrounded within particular radius of the circle.Any ideas of generating such datasets in python ?
One way of producing a distribution over a circular shape is to sample a one dimensional distribution and then stretch it over the 2 Pi circonference of a circle.
One could then decide to use a uniform or a normal distribution.
import matplotlib.pyplot as plt
import numpy as np
def dist(R=4., width=1., num=1000, uniform=True):
if uniform:
r = np.random.rand(num)*width+R
else:
r = np.random.normal(R, width, num)
phi = np.linspace(0,2.*np.pi, len(r))
x= r * np.sin(phi)
y = r* np.cos(phi)
return x,y
fig, ax = plt.subplots(ncols=2, figsize=(9,4))
ax[0].set_title("uniform")
x,y = dist()
ax[0].plot(x,y, linestyle="", marker="o", markersize="2")
x,y = dist(0,1.2, 400)
ax[0].plot(x,y, linestyle="", marker="o", markersize="2")
ax[1].set_title("normal")
x,y = dist(4,0.4, uniform=False)
ax[1].plot(x,y, linestyle="", marker="o", markersize="2")
x,y = dist(0,0.6, uniform=False)
ax[1].plot(x,y, linestyle="", marker="o", markersize="2")
for a in ax:
a.set_aspect("equal")
plt.show()
You can easily generalize random numbers with some distribution centered on a point, for example normal centered on the 0, 0.
x = np.random.normal(size=1000)
y = np.random.normal(size=1000)
plt.plot(x, y, 'o', alpha=0.6)
EDIT:
What we do is generate random points in polar coordinates. First we do a random for the angle (between 0 and 2 pi) and then we give the noise multiplying it by some random number.
n = 300
theta_out = np.random.uniform(low=0, high=2*np.pi, size=n)
noise_out = np.random.uniform(low=0.9, high=1.1, size=n)
x_out = np.cos(theta_out) * noise_out
y_out = np.sin(theta_out) * noise_out
theta_in = np.random.uniform(low=0, high=2*np.pi, size=n)
noise_in = np.random.uniform(low=0, high=0.5, size=n)
x_in = np.cos(theta_in) * noise_in
y_in = np.sin(theta_in) * noise_in
ax = plt.gca()
ax.set_aspect('equal')
plt.plot(x_out, y_out, 'o')
plt.plot(x_in, y_in, 'o')
Note that there is more density of points while the lower the radius.
I'm trying to get a set of histograms plotted, with raw count data (non-normalized to density/pdf) and a fit line. However, I can't seem to figure out how to get a fit line plotted that ISN'T normalized by a pdf function. Is there a way to plot a non-normalized line, or a function to reverse the density calculation? Right now, I've got the below code, which works for the normalized histogram and fit line.
fig, ax = plt.subplots()
x=[13.140,17.520,15.768,10.512,10.512,9.636,10.512, 9.636,11.388,7.884,7.008,7.008,9.636,11.388,7.884,7.88,16.644,42.924,17.520]
n, bins, patches = plt.hist(x, bins=10, normed=False, color='cornflowerblue', alpha=0.75)
(mu, sigma) = norm.fit(x)
y = mlab.normpdf(bins, mu, sigma)
l = plt.plot(bins, y, '-o', linewidth=2)
ax.set_xlabel('Millirems')
This is the graph i have so far, with raw count data and a normalized fit line
You could just do this by multiplying the pdf by the total area of the histogram I think?
import numpy as np
l = plt.plot(bins, y * np.sum(np.diff(bins) * n))
Maybe you want to scale the pdf by the same factor the histogram is scaled with respect to a normed one. This factor would be the area of the histogram sum(n * np.diff(bins)).
fig, ax = plt.subplots()
x = [13.140,17.520,15.768,10.512,10.512,9.636,10.512, 9.636,11.388,7.884,7.008,7.008,9.636,11.388,7.884,7.88,16.644,42.924,17.520]
n, bins, patches = plt.hist(x, bins=10, normed=False, color='cornflowerblue', alpha=0.75)
(mu, sigma) = norm.fit(x)
y = mlab.normpdf(bins, mu, sigma) * sum(n * np.diff(bins))
plt.plot(bins, y, '-o', linewidth=2)
ax.set_xlabel('Millirems')
I have two variables that I have plotted using matplotlib scatter function.
I would like to show the 68% confidence region by highlighting it in the plot. I know to show it in a histogram, but I don't know how to do it for a 2D plot like this (x vs y). In my case, the x is Mass and y is Ngal Mstar+2.
An example image of what I am looking for looks like this:
Here they have showed the 68% confidence region using dark blue and 95% confidence region using light blue.
Can it be achieved using one of thescipy.stats modules?
To plot a region between two curves, you could use pyplot.fill_between().
As for your confidence region, I was not sure what you wanted to achieve, so I exemplified with simultaneous confidence bands, by modifying the code from:
https://en.wikipedia.org/wiki/Confidence_and_prediction_bands#cite_note-2
import numpy as np
import matplotlib.pyplot as plt
import scipy.special as sp
## Sample size.
n = 50
## Predictor values.
XV = np.random.uniform(low=-4, high=4, size=n)
XV.sort()
## Design matrix.
X = np.ones((n,2))
X[:,1] = XV
## True coefficients.
beta = np.array([0, 1.], dtype=np.float64)
## True response values.
EY = np.dot(X, beta)
## Observed response values.
Y = EY + np.random.normal(size=n)*np.sqrt(20)
## Get the coefficient estimates.
u,s,vt = np.linalg.svd(X,0)
v = np.transpose(vt)
bhat = np.dot(v, np.dot(np.transpose(u), Y)/s)
## The fitted values.
Yhat = np.dot(X, bhat)
## The MSE and RMSE.
MSE = ((Y-EY)**2).sum()/(n-X.shape[1])
s = np.sqrt(MSE)
## These multipliers are used in constructing the intervals.
XtX = np.dot(np.transpose(X), X)
V = [np.dot(X[i,:], np.linalg.solve(XtX, X[i,:])) for i in range(n)]
V = np.array(V)
## The F quantile used in constructing the Scheffe interval.
QF = sp.fdtri(X.shape[1], n-X.shape[1], 0.95)
QF_2 = sp.fdtri(X.shape[1], n-X.shape[1], 0.68)
## The lower and upper bounds of the Scheffe band.
D = s*np.sqrt(X.shape[1]*QF*V)
LB,UB = Yhat-D,Yhat+D
D_2 = s*np.sqrt(X.shape[1]*QF_2*V)
LB_2,UB_2 = Yhat-D_2,Yhat+D_2
## Make the plot.
plt.clf()
plt.plot(XV, Y, 'o', ms=3, color='grey')
plt.hold(True)
a = plt.plot(XV, EY, '-', color='black', zorder = 4)
plt.fill_between(XV, LB_2, UB_2, where = UB_2 >= LB_2, facecolor='blue', alpha= 0.3, zorder = 0)
b = plt.plot(XV, LB_2, '-', color='blue', zorder=1)
plt.plot(XV, UB_2, '-', color='blue', zorder=1)
plt.fill_between(XV, LB, UB, where = UB >= LB, facecolor='blue', alpha= 0.3, zorder = 2)
b = plt.plot(XV, LB, '-', color='blue', zorder=3)
plt.plot(XV, UB, '-', color='blue', zorder=3)
d = plt.plot(XV, Yhat, '-', color='red',zorder=4)
plt.ylim([-8,8])
plt.xlim([-4,4])
plt.xlabel("X")
plt.ylabel("Y")
plt.show()
The output looks like this:
First of all thank you #snake_charmer for your answer, but I have found a simpler way of solving the issue using curve_fit from scipy.optimize
I fit my data sample using curve_fit which gives me my best fit parameters. What it also gives me is the estimated covariance of the parameters. The diagonals of the same provide the variance of the parameter estimate. To compute one standard deviation errors on the parameters we can use np.sqrt(np.diag(pcov)) where pcov is the covariance matrix.
def fitfunc(M,p1,p2):
N = p1+( (M)*p2 )
return N
The above is the fit function I use for the data.
Now to fit the data using curve_fit
popt_1,pcov_1 = curve_fit(fitfunc,logx,logn,p0=(10.0,1.0),maxfev=2000)
p1_1 = popt_1[0]
p1_2 = popt_1[1]
sigma1 = [np.sqrt(pcov_1[0,0]),np.sqrt(pcov_1[1,1])] #THE 1 SIGMA CONFIDENCE INTERVALS
residuals1 = (logy) - fitfunc((logx),p1_1,p1_2)
xi_sq_1 = sum(residuals1**2) #THE CHI-SQUARE OF THE FIT
curve_y_1 = fitfunc((logx),p1_1,p1_2)
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.scatter(logx,logy,c='r',label='$0.0<z<0.5$')
ax1.plot(logx,curve_y_1,'y')
ax1.plot(logx,fitfunc(logx,p1_1+sigma1[0],p1_2+sigma1[1]),'m',label='68% conf limits')
ax1.plot(logx,fitfunc(logx,p1_1-sigma1[0],p1_2-sigma1[1]),'m')
So just by using the square root the diagonal elements of the covariance matrix, I can obtain the 1 sigma confidence lines.