I'm trying to optimize the marginal likelihood to estimate parameters for a Gaussian process regression.
So I defined the marginal log likelihood this way:
def marglike(par,X,Y):
    l,sigma_n = par
    n = len(X)
    dist_X = (X.T - X)**2
    k = np.exp(-(1/(2*(l**2)))*dist_X)
    inverse = np.linalg.inv(k + (sigma_n**2)*np.eye(len(k)))
    ml = (1/2)*np.dot(np.dot(Y.T,inverse),Y) + (1/2)*np.log(np.linalg.det(k + (sigma_n**2)*np.eye(len(k)))) + (n/2)*np.log(2*np.pi)
    return ml
where the parameters to be optimized are "l" and "sigma_n".
With some initial values and data, the function gives a value back:
X = np.linspace(1,10,20)
F = np.sin(X)
start = np.array([1,0.05]) #initial parameters values
marglike(start,X,F)
Out[75]: array([[1872.6511786]])
But when I try to optimize the parameters with "minimize", I get this:
re = minimize(marglike,start,args=(X,F),method="BFGS",options = {'disp':True})
Optimization terminated successfully.
Current function value: 22.863446
Iterations: 8
Function evaluations: 60
Gradient evaluations: 15
re.x
Out[89]: array([1. , 0.70845989])
I don't know why, but the parameter "l" doesn't seem to be optimized; it just stays at the starting value that I fixed.
Any suggestions?
You need to reshape X to 2D first for X.T - X to work. Also, you need to add one more parameter called variance (var in the code below) to the optimization. Let me know if the code below solves your problem.
import numpy as np
from scipy.optimize import minimize

def marglike(par,X,Y):
    # print(par)
    l,var,sigma_n = par
    n = len(X)
    dist_X = (X - X.T)**2
    # print(dist_X)
    k = var*np.exp(-(1/(2*(l**2)))*dist_X)
    inverse = np.linalg.inv(k + (sigma_n**2)*np.eye(len(k)))
    ml = (1/2)*np.dot(np.dot(Y.T,inverse),Y) + (1/2)*np.log(np.linalg.det(k + (sigma_n**2)*np.eye(len(k)))) + (n/2)*np.log(2*np.pi)
    return ml

X = np.linspace(1,10,20).reshape(-1,1) # Reshaping
F = np.sin(X)
start = np.array([1.1,1.6,0.05]) # initial parameter values
print(marglike(start,X,F))

re = minimize(marglike,start,args=(X,F),method="L-BFGS-B",options = {'disp':True})
re.x
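As a sanity check (my addition, not part of the original answer), the result can be compared against scikit-learn's GaussianProcessRegressor, which maximizes the same log marginal likelihood for an RBF kernel with a signal variance and a white-noise term:

# Hypothetical cross-check with scikit-learn (assumes scikit-learn is installed).
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel

X = np.linspace(1, 10, 20).reshape(-1, 1)
F = np.sin(X).ravel()

# start from roughly the same initial values as above: var=1.6, l=1.1, sigma_n=0.05
# (WhiteKernel takes a noise variance, hence 0.05**2)
kernel = ConstantKernel(1.6) * RBF(length_scale=1.1) + WhiteKernel(noise_level=0.05**2)
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, F)

print(gpr.kernel_)                          # fitted variance, length scale and noise level
print(gpr.log_marginal_likelihood_value_)   # should be close to -marglike(re.x, X, F)

Since marglike returns the negative log marginal likelihood, the sign flips in the comparison.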
Related
I have the following code where I have implemented gradient descent for a function using PyTorch. How do I add noise to the code so that it identifies both local minima?
import torch

startVal = -5.0
alpha = 0.001
space = " "
progressionCheck = True
x = torch.tensor(startVal, requires_grad=True)

def function(a):
    f = a**4 - a**3 - a**2 + a - 1
    return f

for i in range(1000):
    function(x).backward()
    newVal = x - alpha * (x.grad)
    progressionCheck = function(newVal) < function(startVal)
    x = newVal.detach().clone().requires_grad_()
    print(x)

print("The minimum value occurs at" + space + str(float(x)))
print("The minimum value is" + space + str(function(float(x))))
I assume you intend to disturb the gradients with some noise. To do so, you could specify a distribution, e.g. as follows,
low, high = -0.1, 0.1
dist = torch.distributions.uniform.Uniform(low, high)
and then sample from it to update the gradients, i.e. adjust
newVal = x - alpha * (x.grad)
to
newVal = x - alpha * (x.grad) * dist.sample([1]).item()
Alternatively, sample the noise in advance
noise = dist.sample([1000])
and then index it
newVal = x - alpha * (x.grad) * noise[i]
However, I doubt this fulfils the purpose, and I don't see how you could avoid multiple runs coupled with varying start values (or, less elegantly, very large noise or step size) to find multiple local minima.
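For completeness, here is a minimal sketch of that multiple-runs idea (my own addition, under the assumption that plain restarts from a handful of start values are acceptable):

# Minimal random-restart sketch (not from the answer above): run plain gradient
# descent from several start values and collect the distinct minima found.
import torch

def f(a):
    return a**4 - a**3 - a**2 + a - 1

alpha = 0.001
found = set()
for start in [-5.0, -2.0, -0.5, 0.5, 2.0, 5.0]:   # assumed spread of start values
    x = torch.tensor(start, requires_grad=True)
    for _ in range(5000):
        f(x).backward()
        with torch.no_grad():
            x -= alpha * x.grad   # manual gradient-descent step
        x.grad.zero_()
    found.add(round(float(x), 2))

print(found)   # expect roughly {-0.64, 1.0}, the two local minima of f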
I am trying to minimize the average root mean squared error of the following form in a class file using Gekko:
objective = np.sqrt((np.sum((ym-np.array(y))**2))/N/
(np.sum((ym-np.mean(ym))**2))/N)
Here is the code:
# Code
from math import ceil
import numpy as np
import pandas as pd
import os
from gekko import GEKKO
import sys
from demandlib.tools import add_weekdays2df
import matplotlib.pyplot as plt

class HeatBuilding_Personalized:

    def __init__(self, df_index, **kwargs):
        self.datapath = kwargs.get(
            'datapath', os.path.join(os.path.dirname(__file__), 'bdew_data'))
        self.df = pd.DataFrame(index=df_index)
        self.df = add_weekdays2df(self.df, holiday_is_sunday=True,
                                  holidays=kwargs.get('holidays'))
        self.df['hour'] = self.df.index.hour + 1  # hour of the day
        self.temperature = kwargs.get('temperature')
        self.annual_heat_demand = kwargs.get('annual_heat_demand')
        self.shlp_type = kwargs.get('shlp_type').upper()
        self.wind_class = kwargs.get('wind_class')
        self.building_class = kwargs.get('building_class', 0)
        self.ww_incl = kwargs.get('ww_incl', True)
        self.name = kwargs.get('name', self.shlp_type)
        self.data_points = kwargs.get('data_points')
        self.st_p = kwargs.get('st_p')
        self.end = kwargs.get('end')

    def get_bdew_profile(self):
        """ Calculation of the normalized hourly heat demand
        """
        self.df['temperature'] = self.temperature.values
        self.df['temperature_geo'] = self.weighted_temperature(how='geometric_series')

        sf = self.get_sf_values()
        f = self.get_weekday_parameters()

        # measurements
        self.df['data_points'] = self.data_points.values
        self.df = self.df[self.st_p:self.end]
        self.df = self.df.dropna()
        self.annual_heat_demand = self.df['data_points'].sum()
        self.temperature = pd.DataFrame(self.df['temperature'])
        print(self.df)

        ym = pd.DataFrame(self.df['data_points'])
        print("amount of nan", str(ym.isnull().sum()))
        ymeas_mean = np.mean(ym)
        print(ym)
        print('ymeas_mean:', ymeas_mean)

        x1 = np.array(self.df['temperature_geo'])
        x2 = np.array(self.get_weekday_parameters())
        x3 = np.int(self.annual_heat_demand)
        x4 = np.array(self.get_sf_values())
        ym = np.array(ym)

        # GEKKO model
        m = GEKKO(remote=False)
        a = m.FV(3.7, lb=1, ub=4)
        a.STATUS = 1
        b = m.FV(-35.1, lb=-40, ub=-30)
        b.STATUS = 1
        c = m.FV(7.1, lb=5, ub=9)
        c.STATUS = 1
        d = m.FV(0.9, lb=0.1, ub=1.5)
        d.STATUS = 1

        # variables
        T_g = m.Param(value=x1)
        f = m.Param(value=x2)
        annual_demand = m.Param(value=x3)
        sf = m.Param(value=x4)
        ymeas = m.Param(value=ym)
        N = len(ym)
        print('index n:', N)
        yest = m.CV(value=0)
        yest.FSTATUS = 1
        # y = m.Var()  # I am defining my state variable
        # y = m.Var()
        # z.FSTATUS = 1

        # regression equation
        k = m.Intermediate((a / (1 + (b / (T_g - 40)) ** c) + d))
        s = m.Intermediate(np.sum(k*f))
        kw = m.Intermediate(1.0 / (s / 24))
        m.Equation(yest == (k * kw * f * sf) * annual_demand)

        # objectives
        # m.Minimize(((yest-ymeas)/ymeas)**2)
        m.Obj(m.sqrt((np.sum((ymeas-yest)**2))/N/(np.sum((ymeas-np.mean(ymeas))**2))/N))
        # print('Obj init value = ' + str(object_af.value))

        # regression mode
        m.options.IMODE = 2
        m.options.SOLVER = 1  # considering APOPT solver for 1 and IPOPT for 3

        # optimize
        m.options.MAX_ITER = 20
        m.options.OTOL = 1.0e-10
        m.options.RTOL = 1.0e-10
        m.solve(disp=True)

        # print parameters
        # print('Obj after value = ' + str(vd.value))
        print('Optimized, a = ' + str(a.value[0]))
        print('Optimized, b = ' + str(b.value[0]))
        print('Optimized, c = ' + str(c.value[0]))
        print('Optimized, d = ' + str(d.value[0]))
        # print('Optimized, h = ' + str(h.value))
        # sys.exit()
        print("optimization is ok")

        sf = self.get_sf_values()
        f = self.get_weekday_parameters()
        h = (a.value[0] / (1 + (b.value[0] / (self.df['temperature_geo'] - 40)) ** c.value[0]) + d.value[0])
        # 1.0 instead of the annual heat demand because the annual heat demand is already
        # multiplied in get_bdew_profile; divide by 24 to get a daily value
        kw = 1.0 / (sum(h * f) / 24)
        y = (kw * h * f * sf) * self.annual_heat_demand
        objective = np.sqrt((np.sum((ym-np.array(y))**2))/N/(np.sum((ym-np.mean(ym))**2))/N)
        print('objective calculated without Gekko:', objective)
        return y
It returns this output:
Outputs:

Solver        : APOPT (v1.0)
Solution time : 27.2771999999968 sec
Objective     : 40884011.5968099
Successful solution
--------------------------------------------------
Optimized, a = 3.8708321781
Optimized, b = -31.844822393
Optimized, c = 7.8648564579
Optimized, d = 1.0244814518
The objective value is high. Without Gekko the objective is calculated as 0.01904060781034217. Why is it different?
It is hard to diagnose the problem because the data needed to run and verify it is missing. Here are a couple of things to change:
Set yest as a Variable instead of a CV. A CV with FSTATUS=1 automatically adds squared-error terms to the objective, as described in the Dynamic Optimization course and in the Estimator Tuning Lesson. Because you are defining a custom objective, there is no need to declare a CV.
#yest = m.CV(value=0)
#yest.FSTATUS=1
yest = m.Var(value=0)
For a direct comparison, try declaring Intermediate variables to inspect the parts of the objective. Also, use m.sum() instead of np.sum() for the Gekko version of summation. The den piece is a number so it can be pre-calculated before the objective function is defined with ym.
den = np.sum((ym-np.mean(ym))**2)
m.Obj(m.sqrt((np.sum((ymeas-yest)**2))/(den*N*N)))
Please post complete minimal, verifiable code for more specific help.
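Since the original class and data are not available, here is a minimal, self-contained sketch of the suggested pattern using made-up data and a simplified model y = a*x + b (my own illustration, not the original problem). Because the square root is a monotone transform, the sum of squared errors can be minimized inside Gekko and the square root applied afterwards for reporting:

# Sketch only: synthetic data, a simplified model, and the Var-instead-of-CV pattern.
import numpy as np
from gekko import GEKKO

x = np.linspace(0, 10, 50)
ym = 2.0*x + 1.0 + np.random.normal(0, 0.5, 50)   # made-up measurements
N = len(ym)
den = np.sum((ym - np.mean(ym))**2)               # plain number, precomputed

m = GEKKO(remote=False)
a = m.FV(1.0); a.STATUS = 1
b = m.FV(0.0); b.STATUS = 1
xp = m.Param(value=x)
ymeas = m.Param(value=ym)
yest = m.Var(value=0)                             # Var, not CV with FSTATUS=1
m.Equation(yest == a*xp + b)
m.Obj(((ymeas - yest)**2) / (den*N*N))            # summed over the data rows in IMODE=2
m.options.IMODE = 2
m.solve(disp=False)

print(a.value[0], b.value[0])
print(np.sqrt(m.options.OBJFCNVAL))               # square root applied after the solve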
We corrected the code as suggested in the previous comment, but we get problems when we try to take a sum of a Gekko parameter, and we don't understand why. s is the variable where we calculate the sum and try to use it in the next equation, but it doesn't work. Even with numpy.sum() it doesn't sum it up. We get this error: TypeError: x must be a python list of GEKKO parameters, variables, or expressions
Any idea what we need to change about the factors k or f so the sum can be computed?
x1= np.array(self.df['temperature_geo'])
x2= np.array(self.get_weekday_parameters())
x3= np.int(self.annual_heat_demand)
x4= np.array(self.get_sf_values())
ym= np.array(ym)
# GEKKO model
m = GEKKO(remote=False)
# variables
T_g= m.Param(value=x1)
f=m.Param(value=x2)
annual_demand=m.Param(value=x3)
sf=m.Param(value=x4)
ymeas = m.Param(value=ym)
yest = m.Var(value=0)
# regression equation
k = m.Intermediate((a / (1 + (b / (T_g - 40)) ** c) + d))
s=m.Intermediate(m.sum(k*f))
kw=m.Intermediate( 1.0 / (s / 24))
m.Equation(yest == (k* kw * f * sf) * annual_demand)
den = (m.sum((ymeas-np.mean(ymeas))**2))
ben = m.sum((ymeas-yest)**2)
m.Obj(m.sqrt((ben)/N/(den)/N))
# print('Obj init value = ' + str(object_af.value))
# regression mode
m.options.IMODE = 2
m.options.SOLVER = 1 # considering APOPT solver for 1 and IPOPT for 3
# optimize
m.options.MAX_ITER = 20
m.options.OTOL = 1.0e-10
m.options.RTOL = 1.0e-10
m.solve(disp=True)
I have a stochastic differential equation (SDE) that I am trying to solve using Milstein's method, but I am getting results that disagree with experiment.
The SDE is

q'' + (Gamma0 - Omega0*eta*q**2) q' + Omega0**2 q = sqrt(2*Gamma0*k_b*T_0/m) dW/dt

which I have broken up into 2 first-order equations:

eq1: dp = [-(Gamma0 - Omega0*eta*q**2) p - Omega0**2 q] dt + sqrt(2*Gamma0*k_b*T_0/m) dW

eq2: dq = p dt

Then I have used the Ito form dX = a(X, t) dt + b(X, t) dW, so that for eq1:

a_p = -(Gamma0 - Omega0*eta*q**2) p - Omega0**2 q,    b_p = sqrt(2*Gamma0*k_b*T_0/m)

and for eq2:

a_q = p,    b_q = 0
My Python code used to attempt to solve this is as follows:

import numpy as np
import scipy.constants

# set constants from real data
Gamma0 = 4000            # defines environmental damping
Omega0 = 75e3*2*np.pi    # defines the angular frequency of the motion
eta = 0                  # set eta 0 => no effect from non-linear p*q**2 term
T_0 = 300                # temperature of environment
k_b = scipy.constants.Boltzmann
m = 3.1e-19              # mass of oscillator

# set a and b functions for these 2 equations
def a_p(t, p, q):
    return -(Gamma0 - Omega0*eta*q**2)*p

def b_p(t, p, q):
    return np.sqrt(2*Gamma0*k_b*T_0/m)

def a_q(t, p, q):
    return p

# generate time data
dt = 10e-11
tArray = np.arange(0, 200e-6, dt)

# initialise q and p arrays and set initial conditions to 0, 0
q0 = 0
p0 = 0
q = np.zeros_like(tArray)
p = np.zeros_like(tArray)
q[0] = q0
p[0] = p0

# generate normally distributed random numbers:
# i.i.d. normal random variables with expected value 0 and variance dt
dwArray = np.random.normal(0, np.sqrt(dt), len(tArray))

# iterate, implementing Milstein's method (technically Euler-Maruyama since b' = 0)
for n, t in enumerate(tArray[:-1]):
    dw = dwArray[n]
    p[n+1] = p[n] + a_p(t, p[n], q[n])*dt + b_p(t, p[n], q[n])*dw + 0
    q[n+1] = q[n] + a_q(t, p[n], q[n])*dt + 0
Where in this case p is velocity and q is position.
I then get the following plots of q and p:
I expected the resulting plot of position to look something like the following, which I get from experimental data (from which the constants used in the model are determined):
Have I implemented Milstein's method correctly?
If I have, what else might be wrong with my process of solving the SDE that could be causing this disagreement with the experiment?
You missed a term in the drift coefficient; note that on the right-hand side of dp there are two dt terms. Thus
def a_p(t, p, q):
    return -(Gamma0 - Omega0*eta*q**2)*p - Omega0**2*q
which is actually the part that makes the oscillator into an oscillator. With that corrected, the solution looks like this:
And no, you did not implement the Milstein method, as there are no derivatives of b_p, which are what distinguish Milstein from Euler-Maruyama; the missing term is +0.5*b'(X)*b(X)*(dW**2-dt).
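As an illustration (my addition, not the answer's code), that correction would enter a generic scalar update like this:

# Generic scalar Milstein step for dX = a(X) dt + b(X) dW (illustration only);
# db_dX is the derivative of the diffusion coefficient b with respect to X.
def milstein_step(X, dt, dW, a, b, db_dX):
    return X + a(X)*dt + b(X)*dW + 0.5*b(X)*db_dX(X)*(dW**2 - dt)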
There is also a derivative-free version of Milstein's method, a two-stage Runge-Kutta-like method, documented on Wikipedia or in the original on arxiv.org (PDF).
The step there is (vector based, duplicate into X=[p,q], K1=[k1_p,k1_q] etc. to be close to your conventions)
S = random_choice_of ([-1,1])
K1 = a(X )*dt + b(X )*(dW - S*sqrt(dt))
Xh = X + K1
K2 = a(Xh)*dt + b(Xh)*(dW + S*sqrt(dt))
X = X + 0.5 * (K1+K2)
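A sketch of that step adapted to this system (my adaptation, not the answer's code; it reuses the constants, tArray, dwArray and the p, q arrays from the question, together with the corrected drift above):

# Derivative-free Milstein (two-stage) step for X = [p, q]; assumes Gamma0, Omega0,
# eta, k_b, T_0, m, dt, tArray, dwArray, p, q, p0, q0 are defined as in the question.
import numpy as np

def a(p, q):
    return np.array([-(Gamma0 - Omega0*eta*q**2)*p - Omega0**2*q,  # drift of p
                     p])                                           # drift of q

def b(p, q):
    return np.array([np.sqrt(2*Gamma0*k_b*T_0/m),  # diffusion of p
                     0.0])                          # q carries no noise

X = np.array([p0, q0], dtype=float)
for n in range(len(tArray) - 1):
    dW = dwArray[n]
    S = np.random.choice([-1.0, 1.0])
    K1 = a(*X)*dt + b(*X)*(dW - S*np.sqrt(dt))
    Xh = X + K1
    K2 = a(*Xh)*dt + b(*Xh)*(dW + S*np.sqrt(dt))
    X = X + 0.5*(K1 + K2)
    p[n+1], q[n+1] = X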
I am trying to implement the Population Monte Carlo algorithm as described in this paper (see page 78 Fig.3) for a simple model (see function model()) with one parameter using Python. Unfortunately, the algorithm doesn't work and I can't figure out what's wrong. See my implementation below. The actual function is called abc(). All other functions can be seen as helper-functions and seem to work fine.
To check whether the algorithm works, I first generate observed data with the only parameter of the model set to param = 8. Therefore, the posterior resulting from the ABC algorithm should be centered around 8. This is not the case, and I'm wondering why.
I would appreciate any help or comments.
# imports
from math import exp
from math import log
from math import sqrt
import numpy as np
import random
from scipy.stats import norm

# globals
N = 300           # sample size
N_PARTICLE = 300  # number of particles
ITERS = 5         # number of decreasing thresholds
M = 10            # number of words to remember
MEAN = 7          # prior mean of parameter
SD = 2            # prior sd of parameter

def model(param):
    recall_prob_all = 1/(1 + np.exp(M - param))
    recall_prob_one_item = np.exp(np.log(recall_prob_all) / float(M))
    return sum([1 if random.random() < recall_prob_one_item else 0 for item in range(M)])

## example
print "Output of model function: \n" + str(model(10)) + "\n"

# generate data from model
def generate(param):
    out = np.empty(N)
    for i in range(N):
        out[i] = model(param)
    return out

## example
print "Output of generate function: \n" + str(generate(10)) + "\n"

# distance function (sum of squared error)
def distance(obsData, simData):
    out = 0.0
    for i in range(len(obsData)):
        out += (obsData[i] - simData[i]) * (obsData[i] - simData[i])
    return out

## example
print "Output of distance function: \n" + str(distance([1,2,3],[4,5,6])) + "\n"

# sample new particles based on weights
def sample(particles, weights):
    return np.random.choice(particles, 1, p=weights)

## example
print "Output of sample function: \n" + str(sample([1,2,3],[0.1,0.1,0.8])) + "\n"

# perturbance function
def perturb(variance):
    return np.random.normal(0, sqrt(variance), 1)[0]

## example
print "Output of perturb function: \n" + str(perturb(1)) + "\n"

# compute new weight
def computeWeight(prevWeights, prevParticles, prevVariance, currentParticle):
    denom = 0.0
    proposal = norm(currentParticle, sqrt(prevVariance))
    prior = norm(MEAN, SD)
    for i in range(len(prevParticles)):
        denom += prevWeights[i] * proposal.pdf(prevParticles[i])
    return prior.pdf(currentParticle)/denom

## example
prevWeights = [0.2,0.3,0.5]
prevParticles = [1,2,3]
prevVariance = 1
currentParticle = 2.5
print "Output of computeWeight function: \n" + str(computeWeight(prevWeights,prevParticles,prevVariance,currentParticle)) + "\n"

# normalize weights
def normalize(weights):
    return weights/np.sum(weights)

## example
print "Output of normalize function: \n" + str(normalize([3.,5.,9.])) + "\n"

# sampling from prior distribution
def rprior():
    return np.random.normal(MEAN, SD, 1)[0]

## example
print "Output of rprior function: \n" + str(rprior()) + "\n"

# ABC using Population Monte Carlo sampling
def abc(obsData, eps):
    draw = 0
    Distance = 1e9
    variance = np.empty(ITERS)
    simData = np.empty(N)
    particles = np.empty([ITERS, N_PARTICLE])
    weights = np.empty([ITERS, N_PARTICLE])

    for t in range(ITERS):
        if t == 0:
            for i in range(N_PARTICLE):
                while(Distance > eps[t]):
                    draw = rprior()
                    simData = generate(draw)
                    Distance = distance(obsData, simData)
                Distance = 1e9
                particles[t][i] = draw
                weights[t][i] = 1./N_PARTICLE
            variance[t] = 2 * np.var(particles[t])
            continue

        for i in range(N_PARTICLE):
            while(Distance > eps[t]):
                draw = sample(particles[t-1], weights[t-1])
                draw += perturb(variance[t-1])
                simData = generate(draw)
                Distance = distance(obsData, simData)
            Distance = 1e9
            particles[t][i] = draw
            weights[t][i] = computeWeight(weights[t-1], particles[t-1], variance[t-1], particles[t][i])

        weights[t] = normalize(weights[t])
        variance[t] = 2 * np.var(particles[t])

    return particles[ITERS-1]

true_param = 9
obsData = generate(true_param)
eps = [15000, 10000, 8000, 6000, 3000]

posterior = abc(obsData, eps)
#print posterior
I stumbled upon this question as I was looking for pythonic implementations of PMC algorithms, since, quite coincidentally, I'm currently in the process of applying the techniques in this exact paper to my own research.
Can you post the results you're getting? My guess is that 1) you're using a poor choice of distance function (and/or similarity thresholds), or 2) you're not using enough particles. I may be wrong here (I'm not very well-versed in sample statistics), but your distance function implicitly suggests to me that the ordering of your random draws matters. I'd have to think about this more to determine whether it actually has any effect on the convergence properties (it may not), but why don't you simply use the mean or median as your sample statistic?
I ran your code with 1000 particles and a true parameter value of 8, using the absolute difference between sample means as my distance function, for three iterations with epsilons of [0.5, 0.3, 0.1]; the peak of my estimated posterior distribution seems to approach 8 on each iteration, just as it should, alongside a reduction in the population variance. Note that there is still a noticeable rightward bias, but this is because of the asymmetry of your model (parameter values of 8 or less can never result in more than 8 observed successes, while all parameter values greater than 8 can, leading to a rightward skew in the distribution).
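Concretely, the changes look roughly like this (my reconstruction of what is described above, not a verbatim copy; it reuses generate() and abc() from your code):

# Mean-based distance plus more particles and tighter thresholds, as described above.
def distance(obsData, simData):
    return abs(np.mean(obsData) - np.mean(simData))

N_PARTICLE = 1000
ITERS = 3
true_param = 8
obsData = generate(true_param)
eps = [0.5, 0.3, 0.1]
posterior = abc(obsData, eps)

Redefining the module-level distance() is enough, since abc() looks it up at call time.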
Here's the plot of my results:
I'm running the minimization below:
from scipy.optimize import minimize
import numpy as np
import math
import matplotlib.pyplot as plt
### objective function ###
def Rlzd_Vol1(w1, S):
    L = len(S) - 1
    m = len(S[0])
    # Compute log returns, size (L, m)
    LR = np.array([np.diff(np.log(S[:,j])) for j in xrange(m)]).T
    # Compute weighted returns
    w = np.array([w1, 1.0 - w1])
    R = np.array([np.sum(w*LR[i,:]) for i in xrange(L)]) # size L
    # Compute Realized Vol.
    vol = np.std(R) * math.sqrt(260)
    return vol
# stock prices
S = np.exp(np.random.normal(size=(50,2)))
### optimization ###
obj_fun = lambda w1: Rlzd_Vol1(w1, S)
w1_0 = 0.1
res = minimize(obj_fun, w1_0)
print res
### Plot objective function ###
fig_obj = plt.figure()
ax_obj = fig_obj.add_subplot(111)
n = 100
w1 = np.linspace(0.0, 1.0, n)
y_obj = np.zeros(n)
for i in xrange(n):
    y_obj[i] = obj_fun(w1[i])
ax_obj.plot(w1, y_obj)
plt.show()
The objective function shows an obvious minimum (it's quadratic):
But the minimization output tells me the minimum is at 0.1, the initial point:
I cannot figure out what's going wrong. Any thoughts?
w1 is passed in as a (single-entry) vector, not as a scalar, by the minimize routine. Try what happens if you define w1 = np.array([0.2]) and then calculate w = np.array([w1, 1.0 - w1]): you'll see that you get a 2x1 matrix instead of a 2-entry vector.
To make your objective function able to handle w1 being an array, you can simply put an explicit conversion to float, w1 = float(w1), as the first line of Rlzd_Vol1. Doing so, I obtain the correct minimum.
Note that you might want to use scipy.optimize.minimize_scalar instead, especially if you can bracket where your minimum will be.
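A short sketch of both suggestions (it reuses Rlzd_Vol1 and S from the question):

# Option 1: unwrap the single-element array that minimize passes in.
def obj_fun_fixed(w1):
    return Rlzd_Vol1(float(w1), S)

res = minimize(obj_fun_fixed, 0.1)
print(res.x)

# Option 2: treat it as a 1-D problem and use a bounded scalar minimizer.
from scipy.optimize import minimize_scalar
res2 = minimize_scalar(lambda w1: Rlzd_Vol1(w1, S), bounds=(0.0, 1.0), method='bounded')
print(res2.x)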