I want to modify the following NumPyro model:
import jax.numpy as jnp
from jax import random, vmap
import numpy as np
import numpyro
import numpyro.distributions as dist
from numpyro.infer import MCMC, NUTS
def model(y=None, X=None):
    """Linear regression with Gamma-distributed coefficients.

    One coefficient is drawn per column of ``X``; ``y`` is observed under a
    unit-scale Normal likelihood centred on the linear predictor.

    Args:
        y: Observed responses, or ``None`` to leave the site unobserved.
        X: Design matrix; ``X.shape[1]`` gives the number of predictors.
    """
    n_predictors = X.shape[1]
    # One independent coefficient per predictor column.
    with numpyro.plate('state', n_predictors):
        theta = numpyro.sample('theta', dist.Gamma(concentration=1, rate=1/5000))
    # NOTE(review): linear predictor written as the matrix product X @ theta — confirm.
    mu = jnp.matmul(X, theta)
    numpyro.sample('y', dist.Normal(loc=mu, scale=1), obs=y)
theta = np.zeros(5)  # True parameters
theta[0] = 2
theta[1] = 3
X = np.random.randn(20, theta.size)**2  # Design matrix
# Responses are the linear predictor plus unit Gaussian noise, one per row of X.
y = X @ theta + np.random.randn(X.shape[0])  # data
rng_key = random.PRNGKey(74674)
rng_key, rng_key_ = random.split(rng_key)
mcmc = MCMC(NUTS(model), num_warmup=500, num_samples=1000, num_chains=6)
# Run the sampler with the split-off key; data are passed through to model().
mcmc.run(rng_key_, X=X, y=y)
I want to include Bernoulli RVs z that choose which theta is active. Then I would like to make inference for these z. Basically, I am trying to do variable selection. The idea is illustrated in the following model (which fails):
def failed_model(y=None, X=None):
    """Variable-selection attempt: each coefficient is gated by a Bernoulli draw.

    Same structure as a Gamma-coefficient linear regression, except that every
    ``theta`` is multiplied by a 0/1 indicator ``z`` before entering the
    linear predictor.

    Args:
        y: Observed responses, or ``None`` to leave the site unobserved.
        X: Design matrix; ``X.shape[1]`` gives the number of predictors.
    """
    n_predictors = X.shape[1]
    with numpyro.plate('state', n_predictors):
        theta = numpyro.sample('theta', dist.Gamma(concentration=1, rate=1/5000))
        # NOTE(review): z is assumed to live inside the plate (one indicator
        # per predictor) — confirm against the intended model.
        z = numpyro.sample('z', dist.Bernoulli(0.1))
    # Inactive predictors (z == 0) contribute nothing to the mean.
    mu = jnp.matmul(X, theta * z)
    numpyro.sample('y', dist.Normal(loc=mu, scale=1), obs=y)
I tried to understand the second example from the [docs][1], but it does not show masking for a random array, but rather for a fixed array.
I want to use the Gaussian Process approximation for a simple 1D test function to illustrate a few things. I want to iterate over a few different values for the correlation matrix (since this is 1D, it is just a single value) and show what effect different values have on the approximation. My understanding is that "theta" is the parameter for this. Therefore I want to set the theta value manually and do not want it to be optimized or changed. I thought the constant kernel and the clone_with_theta function might get me what I want, but I could not get it to work. Here is what I have so far:
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as ConstantKernel
def f(x):
    """The function to predict."""
    # Oscillating numerator divided by a denominator that is always >= 1.
    numerator = (1/10 + x) * np.sin(5*x - 1)
    denominator = 1 + x**2 * (np.sin(x - (1/2))**2)
    return x/2 + numerator/denominator
# ----------------------------------------------------------------------
# Data Points
X = np.atleast_2d(np.delete(np.linspace(-1,1, 7),4)).T
y = f(X).ravel()
# Instantiate a Gaussian Process model
kernel = ConstantKernel(constant_value=1, constant_value_bounds='fixed')
theta = np.array([0.5,0.5])
# NOTE(review): the kernel's only hyperparameter is declared 'fixed', so it
# presumably exposes no free theta entries for clone_with_theta — confirm.
kernel = kernel.clone_with_theta(theta)
gp = GaussianProcessRegressor(kernel=kernel, optimizer=None)
# Fit to data using Maximum Likelihood Estimation of the parameters
gp.fit(X, y)
# Make the prediction on the meshed x-axis (ask for MSE as well)
# NOTE(review): the prediction mesh `x` is not defined in the visible code.
y_pred, sigma = gp.predict(x, return_std=True)
# Plot
# ...
I have now programmed a simple implementation myself, which allows setting the correlation parameter (here 'b') manually:
import numpy as np
from numpy.linalg import inv
def f(x):
    """The function to predict."""
    # Linear trend plus an oscillation that is damped away from the origin.
    wave = (1/10 + x) * np.sin(5*x - 1)
    damping = 1 + x**2 * (np.sin(x - (1/2))**2)
    return x/2 + wave/damping
def kriging_approx(x,xt,yt,b,mu,R_inv):
    """Evaluate the kriging predictor at a single location.

    Computes ``mu + r^T R_inv (yt - mu)`` with
    ``r[i] = exp(-b * (xt[i] - x)**2)``.

    Args:
        x: Scalar location at which to predict.
        xt: Training locations (array-like; flattened).
        yt: Training values as a 1-D array, column vector or ``np.matrix``.
        b: Correlation parameter of the Gaussian correlation function.
        mu: Process mean.
        R_inv: Inverse of the training correlation matrix, shape (N, N).

    Returns:
        The predicted value as a scalar.
    """
    xt = np.asarray(xt, dtype=float).ravel()
    # Flatten yt so that a 1-D input cannot broadcast against a column vector
    # into an (N, N) array and yield a wrong result.
    residual = np.asarray(yt, dtype=float).ravel() - mu
    # Correlation between the query point and every training point.
    r = np.exp(-b * (xt - x)**2)
    return mu + r @ np.asarray(R_inv) @ residual
def calc_R (x,b):
    """Build the Gaussian correlation matrix of the sample points and its inverse.

    Args:
        x: Sample locations (array-like; flattened to 1-D).
        b: Correlation parameter; ``R[i, j] = exp(-b * (x[i] - x[j])**2)``.

    Returns:
        Tuple ``(R, R_inv)`` of (N, N) arrays.
    """
    x = np.asarray(x, dtype=float).ravel()
    # Pairwise differences via broadcasting instead of a double Python loop.
    diff = x[:, None] - x[None, :]
    R = np.exp(-b * diff**2)
    R_inv = inv(R)
    return R, R_inv
def calc_mu_sig (yt, R_inv):
    """Estimate the process mean and variance from the training values.

    Args:
        yt: Training values as a 1-D array, column vector or ``np.matrix``.
        R_inv: Inverse of the training correlation matrix, shape (N, N).

    Returns:
        Tuple ``(mu, sig2)`` of scalars, where
        ``mu = (1^T R_inv yt) / (1^T R_inv 1)`` and
        ``sig2 = (yt - mu)^T R_inv (yt - mu) / N``.
    """
    # Work on plain 1-D arrays so the result does not depend on np.matrix.
    yt = np.asarray(yt, dtype=float).ravel()
    R_inv = np.asarray(R_inv)
    N = yt.size
    ones = np.ones(N)
    mu = (ones @ R_inv @ yt) / (ones @ R_inv @ ones)
    residual = yt - mu
    sig2 = (residual @ R_inv @ residual) / N
    return mu, sig2
# ----------------------------------------------------------------------
# Correlation parameter, set manually.
# NOTE(review): 1.0 is a placeholder; `b` was used below without being defined.
b = 1.0
# Data Points
xt = np.linspace(-1,1, 7)
yt = np.matrix((f(xt))).T
# Calc R
R, R_inv = calc_R(xt, b)
# Calc mu and sigma
mu_dach, sig_dach2 = calc_mu_sig(yt, R_inv)
# Point to get approximation for
x = 1
y_approx = kriging_approx(x, xt, yt, b, mu_dach, R_inv)
I'm attempting to create a simple linear model with Python using no libraries (other than numpy). Here's what I have
import numpy as np
import pandas
alpha = 0.1
def h(x, w):
    """Linear hypothesis: the dot product of the weights ``w`` and features ``x``."""
    return np.dot(w, x)
def cost(X, W, Y):
    """Return half the sum of squared prediction errors.

    Args:
        X: Feature rows, one sample per row.
        W: Weight vector passed to ``h``.
        Y: Target value for each row of ``X``.
    """
    totalCost = 0
    # Iterate over the data actually supplied instead of a fixed 47 rows.
    for features, target in zip(X, Y):
        diff = h(features, W) - target
        totalCost += diff * diff
    return totalCost / 2
housing_data = np.loadtxt('Housing.csv', delimiter=',')
x1 = housing_data[:,0]
x2 = housing_data[:,1]
y = housing_data[:,2]
avgX1 = np.mean(x1)
stdX1 = np.std(x1)
normX1 = (x1 - avgX1) / stdX1
print('avgX1', avgX1)
print('stdX1', stdX1)
avgX2 = np.mean(x2)
stdX2 = np.std(x2)
normX2 = (x2 - avgX2) / stdX2
print('avgX2', avgX2)
print('stdX2', stdX2)
normalizedX = np.ones((47, 3))
normalizedX[:,1] = normX1
normalizedX[:,2] = normX2
np.savetxt('normalizedX.csv', normalizedX)
weights = np.ones((3,))
for boom in range(100):
currentCost = cost(normalizedX, weights, y)
if boom % 1 == 0:
print(boom, 'iteration', weights[0], weights[1], weights[2])
print('Cost', currentCost)
for i in range(47):
errorDiff = h(normalizedX[i], weights) - y[i]
weights[0] = weights[0] - alpha * (errorDiff) * normalizedX[i][0]
weights[1] = weights[1] - alpha * (errorDiff) * normalizedX[i][1]
weights[2] = weights[2] - alpha * (errorDiff) * normalizedX[i][2]
predictedX = [1, (2100 - avgX1) / stdX1, (3 - avgX2) / stdX2]
firstPrediction = np.array(predictedX)
print('firstPrediction', firstPrediction)
firstPrediction = h(firstPrediction, weights)
First, it converges VERY quickly. After only 14 iterations. Second, it gives me a different result than a linear regression with sklearn. For reference, my sklearn code is:
import numpy
import matplotlib.pyplot as plot
import pandas
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
dataset = pandas.read_csv('Housing.csv', header=None)
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 2].values
linearRegressor = LinearRegression()
xnorm = sklearn.preprocessing.scale(x)
# Keep the fitted scaler so the query point can be standardized the same way.
scaleCoef = sklearn.preprocessing.StandardScaler().fit(x)
mean = scaleCoef.mean_
std = numpy.sqrt(scaleCoef.var_)
# Fit on the standardized features; `stuff` is read for coef_/intercept_ below.
stuff = linearRegressor.fit(xnorm, y)
predictedX = [[(2100 - mean[0]) / std[0], (3 - mean[1]) / std[1]]]
yPrediction = linearRegressor.predict(predictedX)
print('predictedX', predictedX)
print('predict', yPrediction)
print(stuff.coef_, stuff.intercept_)
My custom model predicts 337,000 for the value of y and sklearn predicts 355,000. My data is 47 rows that look like
Complete data available at
I assume either (a) my regression with gradient descent is somehow wrong or (b) I'm not using sklearn properly.
Any other reasons why the 2 wouldn't predict the same output for a given input?
I think you are missing the 1/m term (where m is the size of y) in the gradient descent. After including the 1/m term, I seem to get a predicted value similar to your sklearn code.
see below
weights = np.ones((3,))
m = y.size
for boom in range(100):
    currentCost = cost(normalizedX, weights, y)
    print(boom, 'iteration', weights[0], weights[1], weights[2])
    print('Cost', currentCost)
    # Visit every sample (m of them) rather than a hard-coded 47.
    for i in range(m):
        errorDiff = h(normalizedX[i], weights) - y[i]
        # Per-sample step scaled by 1/m; all weights use the same errorDiff,
        # so the three scalar updates collapse into one vector update.
        weights = weights - alpha * (1/m) * (errorDiff) * normalizedX[i]
this gives the firstprediction to be 355242.
This agrees well with the linear regression model even though it does not do gradient descent.
I also tried SGDRegressor (which uses stochastic gradient descent) in sklearn, and it too seems to produce a value close to both the LinearRegression model and your model. See the code below:
import numpy
import matplotlib.pyplot as plot
import pandas
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
dataset = pandas.read_csv('Housing.csv', header=None)
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 2].values
sgdRegressor = SGDRegressor(penalty='none', learning_rate='constant', eta0=0.1, max_iter=1000, tol = 1E-6)
xnorm = sklearn.preprocessing.scale(x)
scaleCoef = sklearn.preprocessing.StandardScaler().fit(x)
mean = scaleCoef.mean_
std = numpy.sqrt(scaleCoef.var_)
yPrediction = []
predictedX = [[(2100 - mean[0]) / std[0], (3 - mean[1]) / std[1]]]
print('predictedX', predictedX)
# Refit several times and average the predictions.
for trials in range(10):
    stuff = sgdRegressor.fit(xnorm, y)
    # NOTE(review): collecting one prediction per trial is assumed from the
    # empty list above and the mean printed below — confirm.
    yPrediction.extend(sgdRegressor.predict(predictedX))
# This snippet imports the package as `numpy`, not under the `np` alias.
print('predict', numpy.mean(yPrediction))
results in
predict 355533.10119985335
I've written some beginner code to calculate the co-efficients of a simple linear model using the normal equation.
# Modules
import numpy as np
# Loading data set
X, y = np.loadtxt('ex1data3.txt', delimiter=',', unpack=True)
data = np.genfromtxt('ex1data3.txt', delimiter=',')
def normalEquation(X, y):
    """Fit a one-feature linear model with the normal equation.

    Args:
        X: 1-D array of m feature values.
        y: 1-D array of m target values.

    Returns:
        Array ``[intercept, slope]`` equal to ``inv(X^T X) X^T y`` for the
        bias-augmented design matrix.
    """
    # Take the sample count from the argument itself instead of a
    # module-level array, so the function works on whatever it is given.
    m = int(np.size(X))
    # I create a bias_vector to add to my newly created X vector
    bias_vector = np.ones((m, 1))
    # Reshape the (m,) feature vector to (m, 1) so it can sit next to the
    # bias column.
    X = np.reshape(X, (m, 1))
    # I combine these two vectors together to get a (m, 2) matrix
    X = np.append(bias_vector, X, axis=1)
    # Normal Equation:
    # theta = inv(X^T * X) * X^T * y
    X_transpose = np.transpose(X)
    # Calculating theta
    theta = np.linalg.inv(X_transpose.dot(X))
    theta = theta.dot(X_transpose)
    theta = theta.dot(y)
    return theta
p = normalEquation(X, y)
Using the small data set found here:
I get the co-efficients: [-0.34390603; 0.2124426 ] using the above code instead of: [24.9660; 3.3058]. Could anyone help clarify where I am going wrong?
You can implement normal equation like below:
import numpy as np
# Synthetic data: y = 4 + 3x plus unit Gaussian noise.
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)
X_b = np.c_[np.ones((100, 1)), X] # add x0 = 1 to each instance
# Normal equation: theta = inv(X^T X) X^T y
theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
X_new = np.array([[0], [2]])
X_new_b = np.c_[np.ones((2, 1)), X_new] # add x0 = 1 to each instance
# Predictions at the two new points.
y_predict = X_new_b.dot(theta_best)
This assumes X is an m by n+1 dimensional matrix where x_0 always = 1 and y is a m-dimensional vector.
import numpy as np
# theta = pinv(X^T X) X^T y, built up one product at a time.
step1 = np.dot(X.T, X)
step2 = np.linalg.pinv(step1)
step3 = np.dot(step2, X.T)
theta = np.dot(step3, y) # if y is m x 1. If 1xm, then use y.T
Your implementation is correct. You've only swapped X and y (look closely how they define x and y), that's why you get a different result.
The call normalEquation(y, X) gives [ 24.96601443 3.30576144] as it should.
Here is the normal equation in one line:
theta = np.dot(np.linalg.inv(np.dot(X.T,X)), np.dot(X.T,Y))
I have this simple regression model:
y = a + b * x + c * z + error
with a constraint on parameters:
c = b - 1
There are similar questions posted on SO (like Constrained Linear Regression in Python). However, the constraints there are bound constraints of the form lb <= parameter <= ub.
What are the available options to handle this specific constrained linear regression problem?
This is how it can be done using GLM:
import statsmodels
import statsmodels.api as sm
import numpy as np
# Set the link function to identity
# np.column_stack takes ONE sequence of arrays, hence the inner tuple.
OLS_from_GLM = sm.GLM(y, sm.add_constant(np.column_stack((x, z))))
# Setting the restrictions on parameters in the form of (R, q), where R
# and q are constraints' matrix and constraints' values, respectively. As
# for the restriction in the aforementioned regression model, i.e.,
# c = b - 1 or b - c = 1, R = [0, 1, -1] and q = 1.
res_OLS_from_GLM = OLS_from_GLM.fit_constrained(([0, 1.0, -1.0], 1))
There are a few constrained optimization packages in Python such as CVX, CASADI, GEKKO, Pyomo, and others that can solve the problem. I develop Gekko for linear, nonlinear, and mixed integer optimization problems with differential or algebraic constraints.
import numpy as np
from gekko import GEKKO
# Data
x = np.random.rand(10)
y = np.random.rand(10)
z = np.random.rand(10)
# Gekko for constrained regression
m = GEKKO(remote=False); m.options.IMODE=2
a,b,c = m.Array(m.FV,3)
x=m.Param(x); z=m.Param(z)
# Wrap the measured data BEFORE rebinding `y` to the model variable;
# in the opposite order the parameter is built from the Var, not the data.
ym=m.Param(y); y = m.Var()
# NOTE(review): the model equation, the c = b - 1 constraint, the objective
# and the solve call are not visible in this snippet — confirm they follow.
This gives the solution that may be different when you run it because it uses random values for the data.
-0.021514129645 0.45830726553 -0.54169273447
The constraint c = b - 1 is satisfied with -0.54169273447 = 0.45830726553 - 1. Here is a comparison to other linear regression packages in Python, with and without constraints:
import numpy as np
from scipy.stats import linregress
import statsmodels.api as sm
import matplotlib.pyplot as plt
from gekko import GEKKO
# Data
x = np.array([4,5,2,3,-1,1,6,7])
y = np.array([0.3,0.8,-0.05,0.1,-0.8,-0.5,0.5,0.65])
# calculate R^2
def rsq(y1,y2):
    """Return the coefficient of determination R^2.

    Args:
        y1: Measured values (array-like).
        y2: Fitted/predicted values, same length as ``y1``.

    Returns:
        ``1 - SS_resid / SS_total``.
    """
    # Convert up front so plain lists and tuples are accepted too.
    y1 = np.asarray(y1, dtype=float)
    y2 = np.asarray(y2, dtype=float)
    ss_resid = np.sum((y1 - y2)**2)
    # n * population variance is the total sum of squares about the mean.
    ss_total = len(y1) * np.var(y1)
    return 1 - ss_resid/ss_total
# Method 1: scipy linregress
slope,intercept,r,p_value,std_err = linregress(x,y)
a = [slope,intercept]
print('R^2 linregress = '+str(r**2))
# Method 2: numpy polyfit (1=linear)
a = np.polyfit(x,y,1); print(a)
yfit = np.polyval(a,x)
print('R^2 polyfit = '+str(rsq(y,yfit)))
# Method 3: numpy linalg solution
# y = X a
# X^T y = X^T X a
X = np.vstack((x,np.ones(len(x)))).T
# matrix operations
XX = np.dot(X.T,X)
XTy = np.dot(X.T,y)
# Solve the normal equations (X^T X) a = X^T y
a = np.linalg.solve(XX,XTy)
# same solution with lstsq
a = np.linalg.lstsq(X,y,rcond=None)[0]
yfit = a[0]*x+a[1]; print(a)
print('R^2 matrix = '+str(rsq(y,yfit)))
# Method 4: statsmodels ordinary least squares
X = sm.add_constant(x,prepend=False)
model = sm.OLS(y,X).fit()
yfit = model.predict(X)
a = model.params
# Method 5: Gekko for constrained regression
m = GEKKO(remote=False); m.options.IMODE=2
# Two adjustable coefficients (STATUS=1 on each).
c = m.Array(m.FV,2); c[0].STATUS=1; c[1].STATUS=1
xd = m.Param(x); yd = m.Param(y); yp = m.Var()
# NOTE(review): no model equation, objective or solve call is visible between
# the declarations above and the read-out below — presumably lost; confirm.
# NOTE(review): the two coefficients are read at different indices
# (value[0] vs value[1]) — verify this is intended.
c = [c[0].value[0],c[1].value[1]]
# plot data and regressed line
xp = np.linspace(-2,8,100)
slope = str(np.round(a[0],2))
intercept = str(np.round(a[1],2))
eqn = 'LstSQ: y='+slope+'x'+intercept
slope = str(np.round(c[0],2))
intercept = str(np.round(c[1],2))
eqn = 'Constraint: y='+slope+'x'+intercept