Using StatsModels to plot quantile regression for 2nd order polynomial

Using StatsModels to plot quantile regression for 2nd order polynomial - python

I am following the StatsModels example here to plot quantile regression lines. With only slight modification for my data, the example works great, producing this plot (note that I have modified the code to only plot the 0.05, 0.25, 0.5, 0.75, and 0.95 quantiles) :
However, I would like to plot the OLS fit and corresponding quantiles for a 2nd order polynomial fit (instead of linear). For example, here is the 2nd-order OLS line for the same data:
How can I modify the code in the linked example to produce non-linear quantiles?
Here is my relevant code modified from the linked example to produce the 1st plot:
# Collect the raw samples (x = temperature, y = density, defined before this
# snippet) in a DataFrame so the formula API can address them by column name.
d = {'temp': x, 'dens': y}
df = pd.DataFrame(data=d)

# Least Absolute Deviation
#
# The LAD model is a special case of quantile regression where q=0.5
mod = smf.quantreg('dens ~ temp', df)
res = mod.fit(q=.5)
print(res.summary())

# Prepare data for plotting
#
# For convenience, we place the quantile regression results in a Pandas
# DataFrame, and the OLS results in a dictionary.
quantiles = [.05, .25, .50, .75, .95]


def fit_model(q):
    """Fit the module-level quantile model ``mod`` at quantile ``q``.

    Returns a list ``[q, intercept, slope, slope_ci_lower, slope_ci_upper]``.
    """
    res = mod.fit(q=q)
    # `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
    slope_ci = res.conf_int().loc['temp'].tolist()
    return [q, res.params['Intercept'], res.params['temp']] + slope_ci


models = [fit_model(q) for q in quantiles]
models = pd.DataFrame(models, columns=['q', 'a', 'b', 'lb', 'ub'])

ols = smf.ols('dens ~ temp', df).fit()
ols_ci = ols.conf_int().loc['temp'].tolist()
ols = dict(a=ols.params['Intercept'],
           b=ols.params['temp'],
           lb=ols_ci[0],
           ub=ols_ci[1])

print(models)
print(ols)

# Evaluate the fitted lines on an evenly spaced grid. np.linspace always
# yields the requested number of points, whereas np.arange with a step of 50
# collapses to a single point when the temperature span is below 50.
x = np.linspace(df.temp.min(), df.temp.max(), 50)


def get_y(a, b):
    """Return the straight line ``a + b * x`` on the plotting grid ``x``."""
    return a + b * x


# One dotted grey line per fitted quantile.
for a, b in zip(models.a, models.b):
    y = get_y(a, b)
    plt.plot(x, y, linestyle='dotted', color='grey')

# OLS line on top, then the raw observations.
y = get_y(ols['a'], ols['b'])
plt.plot(x, y, color='red', label='OLS')
plt.scatter(df.temp, df.dens, alpha=.2)
plt.xlim((-10, 40))
plt.ylim((0, 0.4))
plt.legend()
plt.xlabel('temp')
plt.ylabel('dens')
plt.show()

After a day of looking into this, I came up with a solution, so I am posting my own answer. Much credit to Josef Perktold at StatsModels for his assistance.
Here is the relevant code and plot:
# Collect the raw samples (x = temperature, y = density, defined before this
# snippet) in a DataFrame so the formula API can address them by column name.
d = {'temp': x, 'dens': y}
df = pd.DataFrame(data=d)

# --- Figure 1: scatter plus 2nd-order polynomial OLS fit ---
x1 = pd.DataFrame({'temp': np.linspace(df.temp.min(), df.temp.max(), 200)})
poly_2 = smf.ols(formula='dens ~ 1 + temp + I(temp ** 2.0)', data=df).fit()
plt.plot(x, y, 'o', alpha=0.2)
plt.plot(x1.temp, poly_2.predict(x1), 'r-',
         label='2nd order poly fit, $R^2$=%.2f' % poly_2.rsquared,
         alpha=0.9)
plt.xlim((-10, 50))
plt.ylim((0, 0.25))
plt.xlabel('mean air temp')
plt.ylabel('density')
plt.legend(loc="upper left")

# --- Quantile regression on the same 2nd-order polynomial ---
# Least Absolute Deviation
# The LAD model is a special case of quantile regression where q=0.5
mod = smf.quantreg('dens ~ temp + I(temp ** 2.0)', df)
res = mod.fit(q=.5)
print(res.summary())

# Quantile regression for 5 quantiles
quantiles = [.05, .25, .50, .75, .95]
# get all result instances in a list
res_all = [mod.fit(q=q) for q in quantiles]
# The formula interface adds the intercept implicitly, so this is the same
# model as `poly_2` above; reuse that fit instead of estimating it again.
res_ols = poly_2

# --- Figure 2: quantile curves, OLS curve and the data ---
plt.figure()
# create x for prediction
x_p = np.linspace(df.temp.min(), df.temp.max(), 50)
df_p = pd.DataFrame({'temp': x_p})

# A separate loop variable keeps `res` bound to the median (q=0.5) fit that
# was summarised above.
for qm, res_q in zip(quantiles, res_all):
    # get prediction for the model and plot
    plt.plot(x_p, res_q.predict(df_p), linestyle='--', lw=1,
             color='k', label='q=%.2F' % qm, zorder=2)

y_ols_predicted = res_ols.predict(df_p)
plt.plot(x_p, y_ols_predicted, color='red', zorder=1)
plt.plot(df.temp, df.dens, 'o', alpha=.2, zorder=0)
plt.xlim((-10, 50))
plt.ylim((0, 0.25))
plt.xlabel('mean air temp')
plt.ylabel('density')
plt.title('')
plt.show()
red line: 2nd order polynomial fit
black dashed lines: 5th, 25th, 50th, 75th, 95th percentiles

Related

Gaussian Fit function anomaly

I have written code to fit a Gaussian function to a dataset using scipy's curve_fit. There are a few different datasets: one set with 19 points and one with 21 points, and each of them includes data in the ranges 0.5-0.7, 1.0-1.2 and 1.5-1.7.
Surprisingly, when I ran the code on the 19-point datasets, all three fits succeeded, but for the 21-point datasets only the data in the 1.5-1.7 range gave the right fit. All the others gave a horribly wrong fit.
Here is the code.
#function declaration
def gauss(x, amp, mu, sigma):
    """Evaluate the Gaussian ``amp * exp(-(x - mu)**2 / (2 * sigma**2))``.

    Parameters
    ----------
    x : scalar, list, tuple or array
        Point(s) at which to evaluate the curve. Plain lists and tuples are
        converted to a float array so element-wise arithmetic works.
    amp : float
        Peak height (value at ``x == mu``).
    mu : float
        Centre of the peak.
    sigma : float
        Width parameter; must be non-zero.

    Returns
    -------
    The curve value(s), with the same shape as ``x``.
    """
    if isinstance(x, (list, tuple)):
        # `list - float` is not defined in Python; promote to an array first.
        x = np.asarray(x, dtype=float)
    return amp * np.exp(-(x - mu) ** 2 / (2 * sigma ** 2))
#fitting
# Without an initial guess curve_fit starts every parameter at 1, which can be
# far from a narrow peak and lets the optimiser settle on a poor fit. Seed the
# fit from the data instead: peak height, peak position and a fraction of the
# sampled x-span as the width.
x_data = np.asarray(x, dtype=float)
y_data = np.asarray(y, dtype=float)
peak_idx = np.argmax(y_data)
sigma_guess = (x_data.max() - x_data.min()) / 4 or 1.0
p0 = (y_data[peak_idx], x_data[peak_idx], sigma_guess)
popt, pcov = curve_fit(f=gauss, xdata=x_data, ydata=y_data, p0=p0)
amp, mu, sigma = popt
print(amp, mu, sigma)

# krypton value: the level amp / e**2 and the two x positions where the
# fitted Gaussian crosses it.
krypton_y = amp / ((math.exp(1)) ** 2)
half_width = math.sqrt((-2 * (sigma ** 2)) * math.log(krypton_y / amp))
krypton_x1 = mu + half_width
krypton_x2 = mu - half_width
print(krypton_x1 - krypton_x2)

# horizontal line at the krypton level, slightly wider than the data range
horizontal_x = np.arange(x_data.min() - 0.01, x_data.max() + 0.02, 0.01)
y_krypton = np.full(len(horizontal_x), krypton_y)

# build fit set: 1000 points give a smooth curve without allocating the
# millions of samples that a 1e-7 step would create.
x_test = np.linspace(x_data.min(), x_data.max(), 1000)
y_test = gauss(x_test, amp, mu, sigma)

# Vertical line through the fitted centre
vertical_y = np.arange(-20, amp + 20, 0.01)
vertical_mean = np.full(len(vertical_y), mu)

# fit data
fig = plt.figure()
plt.scatter(x_data, y_data, label='original data', color='red', marker='x')
plt.plot(x_test, y_test, label='Gaussian fit curve')
plt.plot(horizontal_x, y_krypton, color='#830000', linewidth=1)
plt.plot(vertical_mean, vertical_y, color='#0011ed')
plt.xlabel('Distance in mm')
plt.ylabel('Current in nA')
plt.title(f'Intensity Profile for {gas} laser | Z = {z}cm')

# Mark and annotate the peak and the two krypton-level crossings.
plt.scatter(mu, amp, s=25, color='#0011ed')
plt.annotate(f'({mu:.4f},{amp:.4f})', (mu, amp),
             xytext=(mu + 0.002, amp + 0.5))
for crossing in (krypton_x1, krypton_x2):
    plt.scatter(crossing, krypton_y, s=25, color='#830000')
    plt.annotate(f'({crossing:.4f},{krypton_y:.4f})', (crossing, krypton_y),
                 xytext=(crossing + 0.002, krypton_y + 0.5))

plt.legend()
plt.margins(0)
plt.show()
I am also adding two images, the correct fit and the wrong fit.

In order to make clear the difficulty we will use an elementary regression method.
We see that the fitting involves ln(y), which is infinite at the points k<6 and k>16 (where y = 0). Those points cannot be used in the numerical calculation. The point k=16 is also unreliable because the small value y=0.001 is not accurate enough (only one significant digit). So we use only the points from k=6 to k=15 in the following calculation.
This shows that the non-significant points have to be eliminated. Of course, more sophisticated methods implemented in nonlinear regression packages with iterative calculation give a better fit according to the particular fitting criteria specified in the software.

Quantile regression for 2nd order polynomial using StatsModels

I have followed the examples here by PJW for plotting a 2nd order polynomial quantile regression. The OLS model seems to be a good fit for my data but the quantile lines came out really wacky and I haven't been able to figure out where the code has gone wrong. I have attached my code below and the chart with only OLS regression line and the chart with the funky quantiles. Any help would be appreciated!
Scatter graph with 2nd order polynomial, regression line in red:
Same scatter graph with an OLS 2nd order polynomial regression line (black) and quantile lines (0.05, 0.5, 0.95) that are clearly wrong (red dotted):
def plot_poly_centiles(parameter_df):
    """Plot 2nd-order polynomial quantile lines and the OLS fit over the data.

    The response variable is the frame's second column; the predictor is the
    column named ``age``. Five quantile fits (0.05, 0.25, 0.5, 0.75, 0.95) are
    drawn as red dashed lines and the OLS fit as a black line on top of a
    scatter plot of the data. The summary of the median fit is printed.

    Parameters
    ----------
    parameter_df : pandas.DataFrame
        Must contain an ``age`` column and the response in column index 1.

    Returns
    -------
    pandas.DataFrame
        The input frame, unchanged.
    """
    par_name = parameter_df.columns[1]
    formula = f'{par_name} ~ age + np.power(age, 2)'

    # plot a scatter graph of the data
    plt.subplots(figsize=(10, 6))
    sns.scatterplot(x='age', y=par_name, data=parameter_df,
                    marker='.', color='blue', alpha=0.1)

    model = smf.quantreg(formula, parameter_df)
    print(model.fit(q=0.5).summary())

    # Quantile regression for 5 quantiles
    quantiles = [.05, .25, .50, .75, .95]
    quantile_fits = [model.fit(q=q) for q in quantiles]
    result_ols = smf.ols(formula, parameter_df).fit()
    # NOTE(review): with age measured in days, age**2 is several orders of
    # magnitude larger than age; if the quantile lines look erratic, rescaling
    # age before fitting may be worth trying -- unverified.

    # Prediction grid: a fixed number of evenly spaced ages. np.linspace
    # includes the maximum age and does not depend on the unit of `age`,
    # unlike np.arange with a hard-coded step of 50.
    x = np.linspace(parameter_df.age.min(), parameter_df.age.max(), 200)
    predicted_df = pd.DataFrame({'age': x})

    # plot quantile lines
    for quantile_fit in quantile_fits:
        y_cent = quantile_fit.predict(predicted_df)
        plt.plot(x, y_cent, linestyle='--', linewidth=1, color='red')

    # plot ols line
    y_ols_predicted = result_ols.predict(predicted_df)
    plt.plot(x, y_ols_predicted, color='k', linewidth=1, label='OLS')

    plt.xlabel('age in days')
    plt.ylabel(f'{par_name}')
    plt.title(f'Polynomial regression centiles of {par_name} in children')
    # Show the 'OLS' label that the line above registers.
    plt.legend()
    plt.show()
    return parameter_df

GP regression using Poisson likelihood

I am trying to implement GP regression using Poisson likelihood.
I followed the example in GPy by doing
# Poisson observation model combined with Laplace approximate inference.
poisson_likelihood = GPy.likelihoods.Poisson()
laplace_inf = GPy.inference.latent_function_inference.Laplace()
# Build the GP; X, Y and kernel are defined outside this snippet.
m = GPy.core.GP(X=X, Y=Y, likelihood=poisson_likelihood, inference_method=laplace_inf, kernel=kernel)
m.optimize()
# for plotting: 1000 evenly spaced inputs between 300 and 800, as a column
pred_points = np.linspace(300,800,1000)[:, None]
#Predictive GP for log intensity mean and variance
f_mean, f_var = m._raw_predict(pred_points)
# Band of +/- 2 standard deviations around the latent mean.
f_upper, f_lower = f_mean + 2*np.sqrt(f_var), f_mean - 2.*np.sqrt(f_var)
pb.figure(figsize=(10, 13))
# The latent values are exponentiated before plotting (mean curve and band).
pb.plot(pred_points, np.exp(f_mean), color='blue', lw=2)
pb.fill_between(pred_points[:,0], np.exp(f_lower[:,0]), np.exp(f_upper[:,0]), color='blue', alpha=.1)
# Data points with error bars; Xc, Yc and dyc are defined outside this snippet.
pb.errorbar(Xc.flatten(), Yc.flatten(), dyc, fmt='.', color='k',markersize=8,alpha=1.0, label='Data')
When I tried to do the same using GPflow, I implemented in the following way
# Poisson observation model for the variational GP below.
poisson_likelihood = gpflow.likelihoods.Poisson()
# Build the VGP model; X, Y and the kernel k are defined outside this snippet.
m = gpflow.models.VGP((X, Y), kernel=k, likelihood=poisson_likelihood, num_latent_gps=1)
# Minimise the training loss over the trainable variables; maxiter=100 is
# passed through as an optimizer option.
opt = gpflow.optimizers.Scipy()
opt_logs = opt.minimize(m.training_loss, m.trainable_variables, options=dict(maxiter=100))
# for plotting: 100 evenly spaced inputs between 300 and 800, as a column
xx = np.linspace(300, 800, 100).reshape(100, 1)
# Mean and variance of the latent function at the prediction inputs.
mean, var = m.predict_f(xx)
plt.plot(X, Y, "kx", mew=2)
# The latent mean is exponentiated before plotting.
plt.plot(xx, np.exp(mean), "C0", lw=2)
# Shaded band: exp(mean -/+ 1.96 standard deviations) of the latent function.
plt.fill_between(
xx[:, 0],
np.exp(mean[:, 0] - 1.96 * np.sqrt(var[:, 0])),
np.exp(mean[:, 0] + 1.96 * np.sqrt(var[:, 0])),
color="C0",
alpha=0.2,
)
When I implemented this using GP flow, the hyper parameters did not move from initialized values.
Also, I am getting very different results, am I doing something wrong?
Result with GPflow
Result with GPy

Calculating PDF given a histogram

I have a heavily right-skewed histogram and would like to calculate the probabilities for a range of lifetime values (the area under the PDF curve). For instance, the probability that the lifetime value is in the interval (0, 0.01).
Dataframe consisting of LTV calculated by cumulative revenue/ cumulative installs:
df['LTV'] is
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.208125,0.0558879,0.608348,0.212553,0.0865896,
0.728542,0,0.609512,0,0,0,0,0,0,0,0.0801339,0.140657,0.0194118,0,0,0.0634682,
0.339545,0.875902,0.8325,0.0260526,0.0711905,0.169894,0.202969,0.0761538,0,0.342055,
0.42781,0,0,0.192115,0,0,0,0,0,0,0,0,0,0,0,1.6473,0,0.232329,0,2.21329,0.748,0.0424286,
0.455439,0.210282,5.56453,0.427959,0,0.352059,0,0,0.567059,0,0,0,0.384462,1.29476,
0.0103125,0,0.0126923,1.03356,0,0,0.289785,0,0)
I have tried utilizing SKlearn's KernelDensity, however, after fitting it to the histogram it does not capture the over-represented 0s.
import gc
from sklearn.neighbors import KernelDensity
def plot_prob_density(df_lunch, field, x_start, x_end):
    """Plot a normalised histogram with a Gaussian KDE and return the KDE.

    Parameters
    ----------
    df_lunch : array of shape (n_samples, 1)
        Observations to histogram and to fit the kernel density on.
    field : str
        Label for the x axis.
    x_start, x_end : float
        Positions of the two dashed red vertical marker lines.

    Returns
    -------
    The fitted ``KernelDensity`` estimator.
    """
    plt.figure(figsize=(10, 7))

    # Evaluation grid spanning the data range (optionally padded by `unit`).
    unit = 0
    grid_low = df_lunch.min() - unit
    grid_high = df_lunch.max() + unit
    x = np.linspace(grid_low, grid_high, 1000)[:, np.newaxis]

    # Normalised histogram of the raw data.
    plt.hist(df_lunch, bins=200, density=True, label='LTV', color='blue', alpha=0.2)

    # Fit the kernel density estimate and draw it over the histogram.
    kd_lunch = KernelDensity(kernel='gaussian', bandwidth=0.00187)
    kd_lunch.fit(df_lunch)
    kd_vals_lunch = np.exp(kd_lunch.score_samples(x))
    plt.plot(x, kd_vals_lunch, color='orange')

    # Mark the interval of interest.
    for boundary in (x_start, x_end):
        plt.axvline(x=boundary, color='red', linestyle='dashed')

    # Axis labels, legend, and display.
    plt.xlabel(field, fontsize=15)
    plt.ylabel('Probability Density', fontsize=15)
    plt.legend(fontsize=15)
    plt.show()
    gc.collect()
    return kd_lunch


kd_lunch = plot_prob_density(final_df['LTV'].values.reshape(-1, 1), 'LTV', x_start=0, x_end=0.01)
Then finding the probabilities like this:
def get_probability(start_value, end_value, eval_points, kd):
    """Approximate the probability mass of a fitted density on an interval.

    Parameters
    ----------
    start_value, end_value : float
        Bounds of the integration interval.
    eval_points : int
        Number of evenly spaced evaluation points (at least 2).
    kd : object
        Fitted estimator exposing ``score_samples(X)`` that returns the
        log-density for an ``(n, 1)`` array.

    Returns
    -------
    float
        Integral of the density over the interval, rounded to 4 decimals.

    Raises
    ------
    ValueError
        If ``eval_points`` is smaller than 2.
    """
    if eval_points < 2:
        raise ValueError("eval_points must be at least 2")
    step = (end_value - start_value) / (eval_points - 1)  # Step size
    x = np.linspace(start_value, end_value, eval_points)[:, np.newaxis]
    kd_vals = np.exp(kd.score_samples(x))  # PDF value at each grid point
    # Trapezoidal rule: the two end points carry half weight. Summing all N
    # values times `step` would cover N rectangles over only N - 1 intervals
    # and overshoot the integral by roughly one step.
    probability = step * (kd_vals.sum() - 0.5 * (kd_vals[0] + kd_vals[-1]))
    return np.round(probability, 4)
# Report the estimated probability mass of LTV on the interval [0, 0.01].
# The message now names the interval that is actually integrated.
print('Probability of LTV in [0, 0.01]: {}\n'
      .format(get_probability(start_value=0,
                              end_value=0.01,
                              eval_points=100,
                              kd=kd_lunch)))
However, this method does not yield the appropriate PDF values we were aiming for.
Any suggestions for alternative methods would be appreciated.
Plot:

I have used a more or less similar script for my work; here it is, in case it is helpful for you.
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
# Sample to analyse: the first element of `beta_95`, which is defined outside
# this snippet.
data1 = beta_95[0]
def plot_prob_density(data1, x_start, x_end):
    """Plot a normalised histogram with a Gaussian KDE and return the KDE.

    The x axis is fixed to [-20, 20]. Two reference lines are drawn at the
    hard-coded positions 9.95 and 1.9, and a third at ``x_end``.

    Parameters
    ----------
    data1 : array of shape (n_samples, 1)
        Observations to histogram and to fit the kernel density on.
    x_start : float
        Accepted for interface compatibility; not used by the plot.
    x_end : float
        Position of the dashed red marker line.

    Returns
    -------
    The fitted ``KernelDensity`` estimator.
    """
    plt.figure(figsize=(4, 3.5))
    x = np.linspace(-20, 20, 1000)[:, np.newaxis]

    # Plot the data using a normalized histogram
    plt.hist(data1, bins=np.linspace(-20, 20, 40), density=True, color='r', alpha=0.4)

    # Do kernel density estimation
    kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)

    # Plot the estimated density
    kd_vals_data1 = np.exp(kd_data1.score_samples(x))
    plt.plot(x, kd_vals_data1, color='r', label='$N_a$', linewidth=2)
    plt.axvline(x=9.95, color='green', linestyle='dashed', linewidth=2.0, label='$β_o$')
    plt.axvline(x=1.9, color='black', linestyle='dashed', linewidth=2.0, label='$β_b$')
    # Raw string: '\%' is not a valid Python escape sequence, so a plain
    # literal triggers an invalid-escape warning. The label text is unchanged.
    plt.axvline(x=x_end, color='red', linestyle='dashed', linewidth=2, label=r'$β_{95\%}$')

    # Show the plots
    plt.xlabel('Beta', fontsize=10)
    plt.ylabel('Probability Density', fontsize=10)
    plt.title('02 hours window', fontsize=12)
    plt.xlim(-20, 20)
    plt.ylim(0, 0.3)
    plt.yticks([0, 0.1, 0.2, 0.3])
    plt.legend(fontsize=12, loc='upper left', frameon=False)
    plt.show()
    gc.collect()
    return kd_data1
def get_probability(start_value, end_value, eval_points, kd):
    """Approximate the probability mass of a fitted density on an interval.

    Parameters
    ----------
    start_value, end_value : float
        Bounds of the integration interval.
    eval_points : int
        Number of evenly spaced evaluation points (at least 2).
    kd : object
        Fitted estimator exposing ``score_samples(X)`` that returns the
        log-density for an ``(n, 1)`` array.

    Returns
    -------
    float
        Integral of the density over the interval, rounded to 4 decimals.

    Raises
    ------
    ValueError
        If ``eval_points`` is smaller than 2.
    """
    if eval_points < 2:
        raise ValueError("eval_points must be at least 2")
    step = (end_value - start_value) / (eval_points - 1)  # Step size
    x = np.linspace(start_value, end_value, eval_points)[:, np.newaxis]
    kd_vals = np.exp(kd.score_samples(x))  # PDF value at each grid point
    # Trapezoidal rule: the two end points carry half weight. Summing all N
    # values times `step` would cover N rectangles over only N - 1 intervals
    # and overshoot the integral by roughly one step.
    probability = step * (kd_vals.sum() - 0.5 * (kd_vals[0] + kd_vals[-1]))
    return np.round(probability, 4)
# Reshape the sample into a single column, shape (n_samples, 1), before it is
# passed to the density fit.
data1 = np.array(data1).reshape(-1, 1)
# Draw the histogram/KDE figure and keep the fitted estimator.
kd_data1 = plot_prob_density(data1, x_start=3.0, x_end=13)
# Integrate the estimated density from -10 to 13 using 1000 evaluation points.
print('Beta-95%: {}\n'
.format(get_probability(start_value = -10,
end_value = 13,
eval_points = 1000,
kd = kd_data1)))

Confidence regions of 1sigma for a 2D plot

I have two variables that I have plotted using matplotlib scatter function.
I would like to show the 68% confidence region by highlighting it in the plot. I know to show it in a histogram, but I don't know how to do it for a 2D plot like this (x vs y). In my case, the x is Mass and y is Ngal Mstar+2.
An example image of what I am looking for looks like this:
Here they have showed the 68% confidence region using dark blue and 95% confidence region using light blue.
Can it be achieved using one of the scipy.stats modules?

To plot a region between two curves, you could use pyplot.fill_between().
As for your confidence region, I was not sure what you wanted to achieve, so I exemplified with simultaneous confidence bands, by modifying the code from:
https://en.wikipedia.org/wiki/Confidence_and_prediction_bands#cite_note-2
import numpy as np
import matplotlib.pyplot as plt
import scipy.special as sp
## Sample size.
n = 50
## Predictor values.
XV = np.random.uniform(low=-4, high=4, size=n)
XV.sort()
## Design matrix.
X = np.ones((n, 2))
X[:, 1] = XV
## True coefficients.
beta = np.array([0, 1.], dtype=np.float64)
## True response values.
EY = np.dot(X, beta)
## Observed response values.
Y = EY + np.random.normal(size=n) * np.sqrt(20)
## Get the coefficient estimates (least squares via the thin SVD).
## The singular values get their own name so they are not confused with the
## RMSE `s` computed below.
u, sing_vals, vt = np.linalg.svd(X, full_matrices=False)
v = np.transpose(vt)
bhat = np.dot(v, np.dot(np.transpose(u), Y) / sing_vals)
## The fitted values.
Yhat = np.dot(X, bhat)
## The MSE and RMSE, from the residuals about the fitted line. The n - p
## denominator matches residuals about Yhat; the true mean EY is unknown in
## practice and is only available here because the data are simulated.
MSE = ((Y - Yhat) ** 2).sum() / (n - X.shape[1])
s = np.sqrt(MSE)
## These multipliers are used in constructing the intervals.
XtX = np.dot(np.transpose(X), X)
V = [np.dot(X[i, :], np.linalg.solve(XtX, X[i, :])) for i in range(n)]
V = np.array(V)
## The F quantiles used in constructing the Scheffe bands (95% and 68%).
QF = sp.fdtri(X.shape[1], n - X.shape[1], 0.95)
QF_2 = sp.fdtri(X.shape[1], n - X.shape[1], 0.68)
## The lower and upper bounds of the Scheffe bands.
D = s * np.sqrt(X.shape[1] * QF * V)
LB, UB = Yhat - D, Yhat + D
D_2 = s * np.sqrt(X.shape[1] * QF_2 * V)
LB_2, UB_2 = Yhat - D_2, Yhat + D_2
## Make the plot.
## plt.hold() no longer exists in current matplotlib; successive plot calls
## already draw onto the same axes, so it is simply dropped.
plt.clf()
plt.plot(XV, Y, 'o', ms=3, color='grey')
plt.plot(XV, EY, '-', color='black', zorder=4)
## 68% band (inner), drawn first.
plt.fill_between(XV, LB_2, UB_2, where=UB_2 >= LB_2, facecolor='blue', alpha=0.3, zorder=0)
plt.plot(XV, LB_2, '-', color='blue', zorder=1)
plt.plot(XV, UB_2, '-', color='blue', zorder=1)
## 95% band (outer).
plt.fill_between(XV, LB, UB, where=UB >= LB, facecolor='blue', alpha=0.3, zorder=2)
plt.plot(XV, LB, '-', color='blue', zorder=3)
plt.plot(XV, UB, '-', color='blue', zorder=3)
## Fitted regression line.
plt.plot(XV, Yhat, '-', color='red', zorder=4)
plt.ylim([-8, 8])
plt.xlim([-4, 4])
plt.xlabel("X")
plt.ylabel("Y")
plt.show()
The output looks like this:

First of all, thank you @snake_charmer for your answer, but I have found a simpler way of solving the issue using curve_fit from scipy.optimize.
I fit my data sample using curve_fit which gives me my best fit parameters. What it also gives me is the estimated covariance of the parameters. The diagonals of the same provide the variance of the parameter estimate. To compute one standard deviation errors on the parameters we can use np.sqrt(np.diag(pcov)) where pcov is the covariance matrix.
def fitfunc(M, p1, p2):
    """Straight-line model ``N = p1 + p2 * M`` used as the curve_fit target.

    ``M`` may be a scalar or an array; ``p1`` is the intercept and ``p2`` the
    slope. Returns the model value(s) with the same shape as ``M``.
    """
    return p1 + M * p2
The above is the fit function I use for the data.
Now to fit the data using curve_fit
# Fit the straight-line model; p0 is the starting guess for (p1, p2) and
# maxfev is passed through to the optimiser.
# NOTE(review): the fit uses `logn` as the response, while the residuals and
# plots below use `logy` -- confirm that both refer to the same data.
popt_1,pcov_1 = curve_fit(fitfunc,logx,logn,p0=(10.0,1.0),maxfev=2000)
p1_1 = popt_1[0]
p1_2 = popt_1[1]
sigma1 = [np.sqrt(pcov_1[0,0]),np.sqrt(pcov_1[1,1])] # 1-sigma errors of p1 and p2: square roots of the covariance diagonal
residuals1 = (logy) - fitfunc((logx),p1_1,p1_2)
xi_sq_1 = sum(residuals1**2) # sum of squared residuals (unweighted chi-square of the fit)
# Best-fit line evaluated at the data's x values.
curve_y_1 = fitfunc((logx),p1_1,p1_2)
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.scatter(logx,logy,c='r',label='$0.0<z<0.5$')
ax1.plot(logx,curve_y_1,'y')
# Limit lines: both parameters shifted up (then down) by their own 1-sigma error.
# NOTE(review): this ignores the off-diagonal term pcov_1[0,1]; a band that
# accounts for the parameter covariance may differ -- confirm this is intended.
ax1.plot(logx,fitfunc(logx,p1_1+sigma1[0],p1_2+sigma1[1]),'m',label='68% conf limits')
ax1.plot(logx,fitfunc(logx,p1_1-sigma1[0],p1_2-sigma1[1]),'m')
So just by using the square root of the diagonal elements of the covariance matrix, I can obtain the 1 sigma confidence lines.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Using StatsModels to plot quantile regression for 2nd order polynomial - python

Related

Gaussian Fit function anomaly

Quantile regression for 2nd order polynomial using StatsModels

GP regression using Poisson likelihood

Calculating PDF given a histogram

Confidence regions of 1sigma for a 2D plot

Categories

Resources