Constrained regression in Python

Constrained regression in Python - python

I have this simple regression model:
y = a + b * x + c * z + error
with a constraint on parameters:
c = b - 1
There are similar questions posted on SO (like Constrained Linear Regression in Python). However, those deal with bound constraints of the form lb <= parameter <= ub, not with an equality relation between two parameters.
What are the available options to handle this specific constrained linear regression problem?

This is how it can be done using GLM:
import statsmodels
import statsmodels.api as sm
import numpy as np
# Fit ordinary least squares through the GLM interface so that
# `fit_constrained` is available. The Gaussian family uses the identity link
# by default, so no separate link object has to be created.
# np.column_stack takes ONE sequence of arrays, hence the inner tuple.
OLS_from_GLM = sm.GLM(
    y,
    sm.add_constant(np.column_stack((x, z))),
    family=sm.families.Gaussian(),
)

# Restrictions on the parameters are given as (R, q), where R is the
# constraint matrix and q holds the constraint values. The parameter order is
# [const, x, z] = [a, b, c], and c = b - 1 is equivalent to b - c = 1,
# so R = [0, 1, -1] and q = 1.
res_OLS_from_GLM = OLS_from_GLM.fit_constrained(([0, 1.0, -1.0], 1))
print(res_OLS_from_GLM.summary())

There are a few constrained optimization packages in Python such as CVX, CASADI, GEKKO, Pyomo, and others that can solve the problem. I develop Gekko for linear, nonlinear, and mixed integer optimization problems with differential or algebraic constraints.
import numpy as np
from gekko import GEKKO
# Data (random, so the fitted values differ from run to run)
x = np.random.rand(10)
y = np.random.rand(10)
z = np.random.rand(10)

# Gekko for constrained regression
m = GEKKO(remote=False)
m.options.IMODE = 2  # regression over the data set

# Regression coefficients; STATUS=1 lets the solver adjust them
a, b, c = m.Array(m.FV, 3)
a.STATUS = 1
b.STATUS = 1
c.STATUS = 1

# Wrap the data as model parameters. The measured response must be wrapped
# while `y` still refers to the data array, so the predicted response gets
# its own name (`yp`) instead of rebinding `y`.
x = m.Param(x)
z = m.Param(z)
ym = m.Param(y)
yp = m.Var()

m.Equation(yp == a + b * x + c * z)  # regression model
m.Equation(c == b - 1)               # equality constraint between coefficients
m.Minimize((ym - yp) ** 2)           # sum of squared errors
m.options.SOLVER = 1
m.solve(disp=True)
print(a.value[0], b.value[0], c.value[0])
This gives the solution below; your values may differ because the data are random.
-0.021514129645 0.45830726553 -0.54169273447
The constraint c = b - 1 is satisfied with -0.54169273447 = 0.45830726553 - 1. Here is a comparison to other linear regression packages in Python, with and without constraints:
import numpy as np
from scipy.stats import linregress
import statsmodels.api as sm
import matplotlib.pyplot as plt
from gekko import GEKKO
# Data
x = np.array([4,5,2,3,-1,1,6,7])
y = np.array([0.3,0.8,-0.05,0.1,-0.8,-0.5,0.5,0.65])
# calculate R^2
def rsq(y1, y2):
    """Return the coefficient of determination R^2.

    Args:
        y1: Observed values (array-like).
        y2: Predicted values (array-like, broadcastable against ``y1``).

    Returns:
        ``1 - SS_resid / SS_total``, where ``SS_total`` is the sum of squared
        deviations of ``y1`` from its mean.
    """
    # Coerce so that plain lists/tuples work, not only NumPy arrays.
    y1 = np.asarray(y1, dtype=float)
    y2 = np.asarray(y2, dtype=float)
    ss_resid = np.sum((y1 - y2) ** 2)
    # n * population variance == sum of squared deviations from the mean
    ss_total = len(y1) * np.var(y1)
    return 1 - ss_resid / ss_total
# Method 1: scipy linregress
slope, intercept, r, p_value, std_err = linregress(x, y)
a = [slope, intercept]
print('R^2 linregress = ' + str(r**2))

# Method 2: numpy polyfit (1=linear)
a = np.polyfit(x, y, 1)
print(a)
yfit = np.polyval(a, x)
print('R^2 polyfit = ' + str(rsq(y, yfit)))

# Method 3: numpy linalg solution
# y = X a
# X^T y = X^T X a
X = np.vstack((x, np.ones(len(x)))).T
# matrix operations (normal equations)
XX = np.dot(X.T, X)
XTy = np.dot(X.T, y)
a = np.linalg.solve(XX, XTy)
# same solution with lstsq
a = np.linalg.lstsq(X, y, rcond=None)[0]
yfit = a[0] * x + a[1]
print(a)
print('R^2 matrix = ' + str(rsq(y, yfit)))

# Method 4: statsmodels ordinary least squares
# prepend=False puts the constant column last, so params = [slope, intercept]
X = sm.add_constant(x, prepend=False)
model = sm.OLS(y, X).fit()
yfit = model.predict(X)
a = model.params
print(model.summary())

# Method 5: Gekko for constrained regression
m = GEKKO(remote=False)
m.options.IMODE = 2
c = m.Array(m.FV, 2)
c[0].STATUS = 1
c[1].STATUS = 1
c[1].lower = -0.5  # lower bound on the intercept
xd = m.Param(x)
yd = m.Param(y)
yp = m.Var()
m.Equation(yp == c[0] * xd + c[1])
m.Minimize((yd - yp) ** 2)
m.solve(disp=False)
# Read both coefficients from the same index, matching the first Gekko example.
c = [c[0].value[0], c[1].value[0]]
print(c)

# plot data and regressed line
plt.plot(x, y, 'ko', label='data')
xp = np.linspace(-2, 8, 100)
# The '+' format flag always prints the intercept's sign, so a positive
# intercept no longer runs straight into the 'x'.
eqn = f'LstSQ: y={a[0]:.2f}x{a[1]:+.2f}'
plt.plot(xp, a[0] * xp + a[1], 'r-', label=eqn)
eqn = f'Constraint: y={c[0]:.2f}x{c[1]:+.2f}'
plt.plot(xp, c[0] * xp + c[1], 'b--', label=eqn)
plt.grid()
plt.legend()
plt.show()

Related

Piecewise Linear Regression failure using curve-fit and lmfit

I am trying to use curve fitting to find coefficients for an equation using multiple datasets. The equation itself is piecewise, it is defined as :
In this equation, we don't know the break point Po. The variable
I have tried using scipy's curve_fit and lmfit. curve_fit successfully fitted the data for some datasets but failed badly on others. Here is the lmfit code (inspired by this answer) and the curve_fit code (inspired by this answer):
import pandas as pd
import matplotlib
from scipy.signal import savgol_filter
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.transforms as mtrans
from sklearn import linear_model
import csv
from scipy import stats
from sklearn import preprocessing
from scipy.special import erf,erfc
from lmfit import minimize, Parameters, Model
from sklearn.linear_model import LinearRegression
power_level_for_prediction = [45,50,60,69,71,88]
group_by_column = "mem_pow"
critical_device_power_name = "core_pow"
files = pd.read_csv("file_path")
def residual(params, x, y=None):
    """Evaluate a smoothly blended two-segment linear model.

    The model is ``a*x + b`` on the low side of ``x0`` and ``c*x + d`` on the
    high side, joined by error-function weights of width ``dx`` (the mean
    spacing of ``x``).

    Args:
        params: Mapping with entries 'a', 'b', 'x0', 'c' and 'd'.
        x: Array of abscissa values (at least two points).
        y: Optional observed values.

    Returns:
        The model values if ``y`` is None, otherwise ``model - y``.
    """
    slope_lo = params['a']
    offset_lo = params['b']
    break_point = params['x0']
    slope_hi = params['c']
    offset_hi = params['d']

    dx = (max(x) - min(x)) / (len(x) - 1)
    z = (x - break_point) / dx
    # Blend weights must sum to one: (erf + 1)/2 and erfc/2 = (1 - erf)/2.
    # The previous (erfc + 1)/2 made the weights sum to 1.5, which distorted
    # the model everywhere.
    xhi = (erf(z) + 1) / 2.0
    xlo = erfc(z) / 2.0

    p = xlo * (slope_lo * x + offset_lo) + xhi * (slope_hi * x + offset_hi)
    if y is None:
        return p
    return p - y
def linear_lmfit(x, y):
    """Fit the blended two-segment model with lmfit and return its values at x.

    Starting values are a=0.1, b=0.2, c=0.3, d=0.4 (bounded to [-5, 5]) and
    x0=120. The fit minimises ``residual``, and the fitted curve is obtained
    by evaluating ``residual`` without observations.
    """
    start_specs = (
        ('a', {'value': 0.1}),
        ('b', {'value': 0.2}),
        ('c', {'value': 0.3}),
        ('d', {'value': 0.4, 'min': -5, 'max': 5}),
        ('x0', {'value': 120}),
    )
    start = Parameters()
    for par_name, options in start_specs:
        start.add(par_name, **options)

    result = minimize(residual, start, args=(x, y))
    return residual(result.params, x)
def piecewise_linear(x, x0, y0, a, c):
    """Continuous two-segment line through the break point ``(x0, y0)``.

    The slope is ``a`` for ``x < x0`` and ``c`` otherwise. The intercepts
    (``b`` and ``d`` in the equation above) are not free parameters; they
    follow from requiring both segments to pass through ``(x0, y0)``.

    Args:
        x: Abscissa values (array-like).
        x0: Break-point location.
        y0: Model value at the break point.
        a: Slope below the break point.
        c: Slope at and above the break point.

    Returns:
        Float array of model values with the shape of ``x``.
    """
    # np.piecewise returns an array of x's dtype, so integer input would
    # silently truncate the result; force float first.
    x = np.asarray(x, dtype=float)
    return np.piecewise(
        x,
        [x < x0],
        [
            lambda v: a * v + y0 - a * x0,
            lambda v: c * v + y0 - c * x0,  # used where the condition is False
        ],
    )
def linear(files):
    """Fit and plot elapsed time against core power for selected groups.

    ``files`` is grouped by ``group_by_column``. Each group whose key is in
    ``power_level_for_prediction`` gets its own subplot in a 2x3 grid showing
    the measured ``elapsed_time`` and the lmfit prediction.

    Args:
        files: DataFrame with the ``group_by_column``,
            ``critical_device_power_name`` and 'elapsed_time' columns.
    """
    files_grouped = files.groupby(group_by_column)
    rows, columns = (2, 3)
    fig, ax = plt.subplots(rows, columns, figsize=(20, 10))
    k = 0  # index of the next free subplot, filled row by row
    for name, group in files_grouped:
        if name not in power_level_for_prediction:
            continue
        x = group[critical_device_power_name].to_numpy().astype(float)
        y = group['elapsed_time'].to_numpy().astype(float)
        # divmod gives (row, column) directly and avoids the `math` module,
        # which this script never imports.
        i, j = divmod(k, columns)
        # The curve_fit result is only used by the commented-out alternative
        # prediction below; the plotted prediction comes from lmfit.
        p, e = curve_fit(piecewise_linear, x, y)
        # pred = piecewise_linear(x, *p)
        pred = linear_lmfit(x, y)
        ax[i][j].plot(x, y, label="Actual Elapsed Time")
        ax[i][j].plot(x, pred, label="Predicted Elapsed Time")
        ax[i][j].grid()
        ax[i][j].set_title(f"Prediction Result for {name}W {group_by_column}")
        ax[i][j].set_ylabel(r"$T_c$ (sec)")
        ax[i][j].set_xlabel(f"{critical_device_power_name}")
        ax[i][j].legend(title=f'{group_by_column}')
        k = k + 1
    # NOTE(review): `experiment_name` is not defined in the visible snippet;
    # it must exist at module level before this function is called.
    fig.suptitle(f"{experiment_name}")
    fig.tight_layout()
    plt.show()
Result using LMFIT:
I have no clue why lmfit produces this kind of result. Could it be because of the initial values?
and here is the result for the curve_fit:
As seen in the graph, the fit is reasonably good for some mem_pow values but quite bad for others, and I am unable to understand why. In my opinion, curve fitting fails for those mem_pow levels because the second piece of the piecewise function is quite flat, and the fit cannot capture that part.
Here is the csv file :
https://gist.github.com/kulnaman/8952e9c14ec5e8dcf2bbbd40f2dccdaa

NumPyro: sampling active sites as Bernoulli RVs

I want to modify the following NumPyro model:
import jax.numpy as jnp
from jax import random, vmap
import numpy as np
import numpyro
import numpyro.distributions as dist
from numpyro.infer import MCMC, NUTS
numpyro.set_host_device_count(6)
def model(y=None, X=None):
    """Linear regression with Gamma-distributed coefficients.

    One 'theta' is sampled per column of ``X`` inside the 'state' plate; the
    observations 'y' are Normal with mean ``X . theta`` and unit scale.
    """
    num_columns = X.shape[1]
    coefficient_prior = dist.Gamma(concentration=1, rate=1/5000)
    with numpyro.plate('state', num_columns):
        theta = numpyro.sample('theta', coefficient_prior)
    mean = jnp.dot(X, theta)
    likelihood = dist.Normal(loc=mean, scale=1)
    numpyro.sample('y', likelihood, obs=y)
theta = np.zeros(5)  # True parameters (only the first two are non-zero)
theta[0] = 2
theta[1] = 3
X = np.random.randn(20, theta.size)**2  # Design matrix
# Matrix product of the design matrix with the true parameters, plus noise.
# (The '@' operators had been garbled into '#', which left y = X.)
y = X @ theta + np.random.randn(X.shape[0])  # data

rng_key = random.PRNGKey(74674)
rng_key, rng_key_ = random.split(rng_key)
mcmc = MCMC(NUTS(model), num_warmup=500, num_samples=1000, num_chains=6)
mcmc.run(rng_key_, X=X, y=y)
mcmc.print_summary()
I want to include Bernoulli RVs z that choose which theta is active. Then I would like to make inference for these z. Basically, I am trying to do variable selection. The idea is illustrated in the following model (which fails):
def failed_model(y=None, X=None):
    """Variable-selection variant of ``model`` (the attempt that fails).

    In addition to the Gamma-distributed 'theta', a Bernoulli(0.1) indicator
    'z' is sampled per column of ``X``; only columns with ``z == 1``
    contribute to the mean of 'y'.
    """
    n_predictors = X.shape[1]
    with numpyro.plate('state', n_predictors):
        theta = numpyro.sample('theta', dist.Gamma(concentration=1, rate=1/5000))
        # Closing parenthesis restored; the line was a SyntaxError as posted.
        # NOTE(review): 'z' is a discrete latent site, so inference needs a
        # sampler/strategy that supports discrete variables — confirm against
        # the NumPyro docs.
        z = numpyro.sample('z', dist.Bernoulli(0.1))
    mu = jnp.dot(X, theta * z)
    numpyro.sample('y', dist.Normal(loc=mu, scale=1), obs=y)
I tried to understand the second example from the [docs][1], but it does not show masking for a random array, but rather for a fixed array.
[1]: https://num.pyro.ai/en/stable/distributions.html

How do you test the significance of regression estimated parameters (fitting data)?

I made a regression model that tries to fit my data (x: year, y: number of cars). And now I feel frustrated. How to assess if the estimated parameters (p = 0.0001695867, q = 0.349592505) are significant? How to perform some statistical tests (estimate p-values for both p and q, t-statistics) to test the significance of p and q. And maybe an F-test of overall significance in regression analysis. For some reason, I'm not interested in finding confidence intervals for p and q. But p-values or t-statistics or whatever are of more interest for me to calculate. So that
Ho : p statistically insignificant H1 : p statistically significant. Same for q.
And an F-test:
Ho: p & q = 0 at the same time. H1: either p or q doesn't equal 0
import pandas as pd
x = pd.read_excel('fitting_data.xlsx', sheet_name="bevshyb cars (2)", index_col=None, dtype={'Name': str, 'Value': float})
import numpy as np
#regression function
def fit(t, p, q):
    """Evaluate the regression curve at time(s) ``t``.

    Computes ``22500000 * (p^3 + 2 p^2 q + p q^2) * E / (p + q E)^2`` with
    ``E = exp(-p t - q t)``.

    Args:
        t: Time value(s); scalar or array-like.
        p: First model coefficient.
        q: Second model coefficient.

    Returns:
        Model value(s) with the shape of ``t``.
    """
    # Accept plain lists as well as arrays.
    t = np.asarray(t, dtype=float)
    # The exponential appears three times in the formula; evaluate it once.
    decay = np.exp(-p*t - q*t)
    numerator = (p*p*p + 2*p*p*q + p*q*q) * decay
    denominator = (p + q*decay) * (p + q*decay)
    return 22500000 * (numerator / denominator)
import scipy.optimize

# Starting guesses for (p, q), handed to curve_fit as p0
g = [0.000001, 0.000001]

# Observed times and car counts from the spreadsheet
t = x['t'].values
carsfact = x['BEVSHYB'].values

# c holds the fitted (p, q); cov is their estimated covariance matrix
c, cov = scipy.optimize.curve_fit(fit, t, carsfact, p0=g)
for estimate in c:
    print(round(estimate, 10))
Estimated parameters: p & q respectively == 0.0001695867, 0.349592505
import sklearn.metrics

# Model predictions at the observed times using the fitted (p, q).
# `y` was never defined in the snippet as posted.
y = fit(t, *c)
print('R^2: ', sklearn.metrics.r2_score(x['BEVSHYB'], y))
print('explained_variance_score: ', sklearn.metrics.explained_variance_score(x['BEVSHYB'], y))
Assessing goodness-of-fit in the regression model:
R^2: 0.9143477744061798
explained_variance_score: 0.9168457427666166
Will appreciate any help)))

Please, consult the answer to the question posted in this link: it shows one way of assessing the significance of the optimized parameters:
https://stats.stackexchange.com/questions/362520/how-to-know-if-a-parameter-is-statistically-significant-in-a-curve-fit-estimat
Here's the sample code featured over there; note the usage of scipy.stats:
from scipy.optimize import curve_fit
import numpy as np
import scipy.odr
import scipy.stats
x = np.array([5.357, 5.797, 5.936, 6.161, 6.697, 6.731, 6.775, 8.442, 9.861])
y = np.array([0.376, 0.874, 1.049, 1.327, 2.054, 2.077, 2.138, 4.744, 7.104])


def f(x, b0, b1):
    """Straight-line model ``b0 + b1 * x``."""
    return b0 + (b1 * x)


def f_wrapper_for_odr(beta, x):
    """Adapt ``f`` to the ``(beta, x)`` argument order that scipy.odr expects."""
    return f(x, *beta)


parameters, cov = curve_fit(f, x, y)

# Run ODR in ordinary-least-squares mode (fit_type=2) with zero iterations:
# the curve_fit estimates are kept as they are and ODR is used only to obtain
# the parameter statistics at that point.
model = scipy.odr.Model(f_wrapper_for_odr)
data = scipy.odr.Data(x, y)
myodr = scipy.odr.ODR(data, model, beta0=parameters, maxit=0)
myodr.set_job(fit_type=2)
parameterStatistics = myodr.run()

df_e = len(x) - len(parameters)  # degrees of freedom, error
cov_beta = parameterStatistics.cov_beta  # parameter covariance matrix from ODR
sd_beta = parameterStatistics.sd_beta  # parameter standard errors from ODR

# 95% confidence intervals: estimate +/- t(0.975, df_e) * standard error
t_df = scipy.stats.t.ppf(0.975, df_e)
ci = [
    [estimate - t_df * sd, estimate + t_df * sd]
    for estimate, sd in zip(parameters, sd_beta)
]

tstat_beta = parameters / sd_beta  # coeff t-statistics
# Two-sided p-values. The survival function avoids the cancellation in
# 1 - cdf when the p-value is very small.
pstat_beta = 2.0 * scipy.stats.t.sf(np.abs(tstat_beta), df_e)  # coef. p-values

for estimate, (ci_low, ci_high), tstat, pstat in zip(
        parameters, ci, tstat_beta, pstat_beta):
    print('parameter:', estimate)
    print(' conf interval:', ci_low, ci_high)
    print(' tstat:', tstat)
    print(' pstat:', pstat)
    print()
```

associativity of matrix multiplication in numpy

I am playing with a simple numpy example and am having a hard time understanding why the associative property of matrix multiplication
ABC = (AB)C = A(BC)
does not exactly hold. I assume the problem is with numeric stability. But how to address it? What is the issue exactly?
Here is my example with linear regression. I use sklearn solution as it gives more divergence between associative groupings:
import numpy as np
np.random.seed(42)
num_samples = 100
M = 1000
sigma = 0.5
X = np.random.binomial(2, 0.4, (num_samples, M))
beta = np.zeros(M)
beta[5] = 1.0
y = X.dot(beta) + sigma*np.random.randn(num_samples)

# standardise y
y = y - np.mean(y)
y = y/np.std(y)

# center X and drop (near-)constant columns
# NOTE(review): the columns are centered but never divided by xstd, so X is
# not actually standardised here.
Xc = X - X.mean(axis=0)
xstd = X.std(axis=0)
mask = xstd > 1e-12
Xc = Xc[:, mask]

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(Xc, y)
beta_hat_sklearn = lr.coef_

# Left-to-right grouping: ((b^T Xc^T) Xc) b
# (the '@' operators had been garbled into '#' in the posted snippet)
beta_hat_sklearn.T @ Xc.T @ Xc @ beta_hat_sklearn / num_samples
# equivalent < Python3.5
beta_hat_sklearn.T.dot(Xc.T).dot(Xc).dot(beta_hat_sklearn) / num_samples
# 1.0000000000000009

# Explicit grouping: b^T (Xc^T Xc) b
beta_hat_sklearn.T @ (Xc.T @ Xc) @ beta_hat_sklearn / num_samples
# equivalent < Python3.5
beta_hat_sklearn.T.dot(Xc.T.dot(Xc)).dot(beta_hat_sklearn) / num_samples
# 0.89517439485479278
Update
It might be MacOSX specific bug.

Understanding scipy's least square function with IRLS

I'm having a bit of trouble understanding how this function works.
a, b = scipy.linalg.lstsq(X, w*signal)[0]
I know that signal is the array representing the signal and currently w is just [1,1,1,1,1...]
How should I manipulate X or w to imitate weighted least squares or iteratively reweighted least squares?

If you multiply both X and y by sqrt(weight), ordinary least squares on the scaled data gives the weighted least squares solution.
You can get the formula by following link:
http://en.wikipedia.org/wiki/Linear_least_squares_%28mathematics%29#Weighted_linear_least_squares
here is an example:
Prepare data:
import numpy as np
# Reproducible synthetic data: N samples of 3 features with known coefficients
np.random.seed(0)
N = 20
X = np.random.rand(N, 3)
w = np.array([1.0, 2.0, 3.0])  # true coefficients
noise = np.random.rand(N) * 0.1  # small uniform noise
y = X.dot(w) + noise
OLS:
from scipy import linalg

# Ordinary least squares; lstsq returns the solution as its first item
w1 = linalg.lstsq(X, y)[0]
print(w1)  # print() call: the Python 2 print statement is a SyntaxError in Python 3
output:
[ 0.98561405 2.0275357 3.05930664]
WLS:
# Weighted least squares: scale each row of X and each entry of y by
# sqrt(weight), then solve the ordinary least squares problem.
weights = np.linspace(1, 2, N)
Xw = X * np.sqrt(weights)[:, None]
yw = y * np.sqrt(weights)
print(linalg.lstsq(Xw, yw)[0])  # print() call for Python 3
output:
[ 0.98799029 2.02599521 3.0623824 ]
Check result by statsmodels:
import statsmodels.api as sm

# Cross-check: statsmodels WLS with the same weights
mod_wls = sm.WLS(y, X, weights=weights)
res = mod_wls.fit()
print(res.params)  # print() call for Python 3
output:
[ 0.98799029 2.02599521 3.0623824 ]

Create a diagonal matrix W from the elementwise square-roots of w. Then I think you just want:
# np.dot needs two arguments: scale the signal by W with a matrix product,
# exactly as X is scaled.
scipy.linalg.lstsq(np.dot(W, X), np.dot(W, signal))
Following http://en.wikipedia.org/wiki/Linear_least_squares_(mathematics)#Weighted_linear_least_squares

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Constrained regression in Python - python

Related

Piecewise Linear Regression failure using curve-fit and lmfit

NumPyro: sampling active sites as Bernoulli RVs

How do you test the significance of regression estimated parameters (fitting data)?

associativity of matrix multiplication in numpy

Understanding scipy's least square function with IRLS

Categories

Resources