convergence issue encountered while implementing policy iteration from scratch

convergence issue encountered while implementing policy iteration from scratch - python

I am trying to implement policy iteration from scratch. I have a 2D grid world environment named GridWorld that returns the successor state and reward from a given action, and it also has a function that returns the transition probability. Below is my code for policy iteration:
import matplotlib
matplotlib.use('Agg')
import random
import numpy as np
import matplotlib.pyplot as plt
import gridworld
from tqdm import tqdm
class PolicyIteration:
def __init__(self, env, gamma):
self.env = env
self.num_states = self.env.num_states
self.num_actions = self.env.num_actions
self.max_num_steps = self.env.max_num_steps
self.gamma = gamma #discount factor
self.values = np.zeros(self.num_states) #Initialize `values` as zeros
self.policy = np.random.randint(0, self.num_actions, self.num_states)
def one_policy_evaluation(self):
"""
Runs one iteration of policy evaluation and updates the value function.
:return: the maximum change in value function
"""
delta = 0
for s in range(self.num_states):
v = self.values[s]
a = self.policy[s]
(s_new, r, _) = self.env.step(a)
p = self.env.p(s_new, s, a)
""" update V(s)"""
self.values[s] = np.sum(p * (r + self.gamma * self.values[s_new]))
delta = max(delta, abs(v - self.values[s]))
return delta
def run_policy_evaluation(self, tol = 1e-3):
"""
Runs policy evaluation until convergence.
:param tol: the tolerance level for convergence
:return: the number of iterations of policy evaluation until convergence
"""
delta = self.one_policy_evaluation()
delta_history = [delta]
while delta > tol:
delta = self.one_policy_evaluation()
delta_history.append(delta)
return len(delta_history)
def run_policy_improvement(self):
update_policy_count = 0
for s in range(self.num_states):
temp = self.policy[s]
v_list = np.zeros(self.num_actions)
for a in range(self.num_actions):
(s_new, r, _) = self.env.step(a)
p = self.env.p(s_new, s, a)
v_list[a] = np.sum(p * (r + self.gamma * self.values[s_new]))
self.policy[s] = np.argmax(v_list)
if temp != self.policy[s]:
update_policy_count += 1
return update_policy_count
def train(self, tol=1e-3, max_iters=100, plot=True):
eval_count = self.run_policy_evaluation(tol)
eval_count_history = [eval_count]
policy_change = self.run_policy_improvement()
policy_change_history = [policy_change]
epoch = 0
val_history= []
for i in tqdm(range(max_iters)):
epoch += 1
new_eval_count = self.run_policy_evaluation(tol)
new_policy_change = self.run_policy_improvement()
eval_count_history.append(new_eval_count)
policy_change_history.append(new_policy_change)
val_history.append(np.mean(self.values))
if new_policy_change == 0:
break
print(f'# epoch: {len(policy_change_history)}')
print(f'eval count = {eval_count_history}')
print(f'policy change = {policy_change_history}')
if plot is True:
plt.figure(dpi=200)
plt.plot(val_history)
plt.tight_layout()
plt.savefig('policy_iteration.png')
plt.show()
def main():
env = gridworld.GridWorld(hard_version=False)
agent = PolicyIteration(env, gamma=0.95)
agent.train()
if __name__ == '__main__':
main()
However, based on the figure generated, the sequence of values is oscillating up and down and never converges. I followed the algorithm in the Sutton book step by step, and can't find any issues with my code yet:
Any help is greatly appreciated!

Related

Inconsistency in ODE solution using solve_ivp when adding events

I am using scipy.solve_ivp to solve an ODE in a loop, such that in every iteration I change some term in the ODE (which is like the force applied).
In terms of my problem - I just cant seem to understand why when I track events of zero crossing the solution sometimes stalls and essentially zeros out (as the figure below shows).
The code is a bit long, but only a small portion of it is actually relevant to my problem
(1) initialize some constants
MASS = 2.6e-4
WING_LENGTH = 0.07 # meters
GYRATION_RADIUS = 0.6 * WING_LENGTH
MoI = MASS * GYRATION_RADIUS ** 2 # Moment of Inertia
AIR_DENSITY = 1.2
WING_AREA = 0.5 * WING_LENGTH * (0.5 * WING_LENGTH) * np.pi # 1/2 ellipse with minor radios ~ 1/2 major = length/2
# drag coefficients from whitney & wood (JFM 2010):
C_D_MAX = 3.4
C_D_0 = 0.4
C_L_MAX = 1.8
ZERO_CROSSING = 1
RADIAN45 = np.pi / 4
RADIAN135 = 3 * RADIAN45
(2) Define a wrapper which is a RobotSimulation class - it's an easy interface that allows to easily change the values of the ODE. The important function is solve_dynamics (the others are not part of the problem I believe)
def phi_dot_zero_crossing_event(t, y):
"""
this event is given to solve_ivp to track if phi_dot == 0
:param t: unused time variable
:param y: a vector of [phi,phi_dot]
:return:
"""
return y[1]
class RobotSimulation:
def __init__(self, motor_torque=lambda x: 0, alpha=RADIAN45, phi0=0.0, phi_dot0=0.01, start_t=0, end_t=0.05, delta_t=0.001):
self.solution = 0
self.motor_torque = motor_torque
self.alpha = alpha # angle of attack
self.phi0 = phi0
self.phi_dot0 = phi_dot0
self.start_t = start_t
self.end_t = end_t
self.delta_t = delta_t
def set_time(self, new_start, new_end) -> None:
"""
we update the start time to be the end of the prev time and the end time
as the last end time + the window size we wish
"""
self.start_t = new_start
self.end_t = new_end
def set_init_cond(self, new_phi0, new_phi_dot0) -> None:
"""
we update the initial conditions to be the last value of previous iterations
"""
self.phi0 = new_phi0
self.phi_dot0 = new_phi_dot0
def set_motor_torque(self, new_motor_torque) -> None:
"""
sets the moment to the new value
"""
self.motor_torque = new_motor_torque
def flip_alpha(self) -> None:
"""
(Irrelevant for my question) changes the angle of attack's sign
:return:
"""
self.alpha = RADIAN135 if self.alpha == RADIAN45 else RADIAN45
def c_drag(self) -> float:
"""
calculates the drag coefficient based on the angle of attack
"""
return (C_D_MAX + C_D_0) / 2 - (C_D_MAX - C_D_0) / 2 * np.cos(2 * self.alpha)
def drag_torque(self, phi_dot):
"""
the drag moment
"""
return 0.5 * AIR_DENSITY * WING_AREA * self.c_drag() * GYRATION_RADIUS * phi_dot * np.abs(phi_dot)
def phi_derivatives(self, t, y):
"""
A function that defines the ODE that is to be solved: I * phi_ddot = tau_z - tau_drag.
We think of y as a vector y = [phi,phi_dot]. the ode solves dy/dt = f(y,t)
:return:
"""
phi, phi_dot = y[0], y[1]
dy_dt = [phi_dot, (self.motor_torque(t) - self.drag_torque(phi_dot)) / MoI]
return dy_dt
def solve_dynamics(self, phiarr, phidotarr, phiddotarr, angarr, torquearr):
"""
solves the ODE
:return:
"""
phi_0, phi_dot_0 = self.phi0, self.phi_dot0
start_t, end_t, delta_t = self.start_t, self.end_t, self.delta_t
phi_dot_zero_crossing_event.terminal = True
ang = []
times_between_zero_cross = []
sol_between_zero_cross = []
while start_t < end_t:
torquearr.append(self.motor_torque(start_t))
sol = solve_ivp(self.phi_derivatives, t_span=(start_t, end_t), y0=[phi_0,phi_dot_0],events=phi_dot_zero_crossing_event)
self.solution = sol.y
ang.append(self.alpha * np.ones(len(sol.t))) # set alpha for every t based on solution's size
times_between_zero_cross.append(sol.t)
sol_between_zero_cross.append(sol.y)
if sol.status == ZERO_CROSSING:
start_t = sol.t[-1] + delta_t
phi_0, phi_dot_0 = sol.y[0][-1], sol.y[1][-1] # last step is now initial value
self.flip_alpha()
else: # no zero crossing
break
time = np.concatenate(times_between_zero_cross)
phi, phi_dot = np.concatenate(sol_between_zero_cross, axis=1)
ang = np.concatenate(ang)
_, phi_ddot = self.phi_derivatives(time, [phi, phi_dot])
phiarr.append(phi)
phidotarr.append(phi_dot)
phiddotarr.append(phi_ddot)
angarr.append(ang)
(3) Finally the main function calls solve_dynamics sequentially for different values of actions and updates the time window and initial conditions:
phi_arr, phi_dot_arr, phi_ddot_arr, angarr, torquearr = [], [], [], [], []
phi0, phidot0 = 0, 0.01
start_t, end_t, delta_t = 0, 0.05, 0.001
sim = RobotSimulation()
for action in [1, -1, 1, -1, 1, -1]:
sim.set_motor_torque(lambda x: action)
sim.solve_dynamics(phi_arr, phi_dot_arr, phi_ddot_arr, angarr, torquearr)
phi0, phidot0 = sim.solution[0][-1], sim.solution[1][-1]
start_t = end_t
end_t += 0.05
sim.set_init_cond(phi0, phidot0)
sim.set_time(start_t, end_t)
phi_arr = np.concatenate(phi_arr)
phi_dot_arr = np.concatenate(phi_dot_arr)
phi_ddot_arr = np.concatenate(phi_ddot_arr)
angle_arr = np.concatenate(angarr)
Intuitively, as the actions I apply are just repeating +1 and -1, I cannot understand why the blue points in the figure (see below) act so different

Eventually, the problem was with how solve_ivp finds zero crossing points. It seems that the solution kept on finding the same crossings from + to - crossing every time (Thanks Lutz), so I've added a restriction using the .direction variable:
before entering the while loop: phi_dot_zero_crossing_event.direction = -np.sign(phi_dot_0). This makes sure we start by finding a crossing in the direction of the initial velocity.
when identifying an event in if sol.status == 1: I changed the sign of the direction with phi_dot_zero_crossing_event.direction *= -1

Model doesn't seem to learn

I used this gym library to try and get this model to learn, but I don't think it learns from experience. Something is wrong, but I can't figure it out.
I have played with the DISCOUNT, LEARNING_RATE, DISCRETE_OS_SIZE and still nothing, do i have to create a neural network for this example? Or can I just use the formula to derive the q values?
import gym
import numpy as np
LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 25000
env = gym.make("MountainCar-v0")
DISCRETE_OS_SIZE = [20, 20]
discrete_os_win_size = (env.observation_space.high - env.observation_space.low)/DISCRETE_OS_SIZE
q_tables = np.random.uniform(low = -2, high = 0, size = (DISCRETE_OS_SIZE + [env.action_space.n]))
def get_discrete_state(state):
discrete_state = (state - env.observation_space.low)/DISCRETE_OS_SIZE
return tuple(discrete_state.astype(np.int)) # we use this tuple to look up the 3 Q values for the available actions in the q-table
# Exploration settings
epsilon = 1 # not a constant, qoing to be decayed
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES//2
epsilon_decay_value = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)
SHOW_EVERY = 1000
done = False
for episode in range(EPISODES):
discrete_state = get_discrete_state(env.reset())
done = False
if episode % SHOW_EVERY == 0:
render = True
print(episode)
else:
render = False
while not done:
if np.random.random() > epsilon:
# Get action from Q table
action = np.argmax(q_tables[discrete_state])
else:
# Get random action
action = np.random.randint(0, env.action_space.n)
new_state, reward, done, _ = env.step(action)#state = position and velocit
new_discrete_state = get_discrete_state(new_state)
if render:
env.render()
if not done:
max_future_q = np.max(q_tables[new_discrete_state]) # Maximum possible Q value in next step (for new state)
current_q = q_tables[discrete_state + (action,)]# Current Q value (for current state and performed action)
new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)# And here's our equation for a new Q value for current state and action
q_tables[discrete_state + (action,)] = new_q# Update Q table with new Q value
# Simulation ended (for any reson) - if goal position is achived - update Q value with reward directly
elif new_state[0] >= env.goal_position:
q_tables[discrete_state + (action,)] = 0
print(episode)
discrete_state = new_discrete_state
# Decaying is being done every episode if episode number is within decaying range
if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
epsilon -= epsilon_decay_value
env.close()

I have found what the problem was.
In this line discrete_state = (state - env.observation_space.low)/DISCRETE_OS_SIZE I am dividing the (current state - env.observation_space.low) by DISCRETE_OS_SIZE. I came to the conclusion that I am better of dividing by discrete_os_win_size, which solved my problem.

Understanding Gradient Policy Deriving

I'm trying to recreate very simple example of Policy Gradient, from it's origin resource Andrej Karpathy blog. In that articale, you will find example with CartPole and Policy Gradient with list of weight and Softmax activation. Here is my recreated and very simple example of CartPole policy gradient, which works perfect.
import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
import copy
NUM_EPISODES = 4000
LEARNING_RATE = 0.000025
GAMMA = 0.99
# noinspection PyMethodMayBeStatic
class Agent:
def __init__(self):
self.poly = PolynomialFeatures(1)
self.w = np.random.rand(5, 2)
def policy(self, state):
z = state.dot(self.w)
exp = np.exp(z)
return exp/np.sum(exp)
def __softmax_grad(self, softmax):
s = softmax.reshape(-1,1)
return np.diagflat(s) - np.dot(s, s.T)
def grad(self, probs, action, state):
dsoftmax = self.__softmax_grad(probs)[action,:]
dlog = dsoftmax / probs[0,action]
grad = state.T.dot(dlog[None,:])
return grad
def update_with(self, grads, rewards):
for i in range(len(grads)):
# Loop through everything that happend in the episode
# and update towards the log policy gradient times **FUTURE** reward
total_grad_effect = 0
for t, r in enumerate(rewards[i:]):
total_grad_effect += r * (GAMMA ** r)
self.w += LEARNING_RATE * grads[i] * total_grad_effect
print("Grads update: " + str(np.sum(grads[i])))
def main(argv):
env = gym.make('CartPole-v0')
np.random.seed(1)
agent = Agent()
complete_scores = []
for e in range(NUM_EPISODES):
state = env.reset()[None, :]
state = agent.poly.fit_transform(state)
rewards = []
grads = []
score = 0
while True:
probs = agent.policy(state)
action_space = env.action_space.n
action = np.random.choice(action_space, p=probs[0])
next_state, reward, done,_ = env.step(action)
next_state = next_state[None,:]
next_state = agent.poly.fit_transform(next_state.reshape(1, 4))
grad = agent.grad(probs, action, state)
grads.append(grad)
rewards.append(reward)
score += reward
state = next_state
if done:
break
agent.update_with(grads, rewards)
complete_scores.append(score)
env.close()
plt.plot(np.arange(NUM_EPISODES),
complete_scores)
plt.savefig('image1.png')
if __name__ == '__main__':
main(None)
.
.
Question
I'm trying to do, almost the same example but with Sigmoid activation (just for simplicity). That is all I need to do. Switch activation in the model from softmax to the sigmoid. Which should work for sure (based on explanation below). But my Policy Gradient model don't learn anything and keeps random. Any suggestion?
import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
NUM_EPISODES = 4000
LEARNING_RATE = 0.000025
GAMMA = 0.99
# noinspection PyMethodMayBeStatic
class Agent:
def __init__(self):
self.poly = PolynomialFeatures(1)
self.w = np.random.rand(5, 1) - 0.5
# Our policy that maps state to action parameterized by w
# noinspection PyShadowingNames
def policy(self, state):
z = np.sum(state.dot(self.w))
return self.sigmoid(z)
def sigmoid(self, x):
s = 1 / (1 + np.exp(-x))
return s
def sigmoid_grad(self, sig_x):
return sig_x * (1 - sig_x)
def grad(self, probs, action, state):
dsoftmax = self.sigmoid_grad(probs)
dlog = dsoftmax / probs
grad = state.T.dot(dlog)
grad = grad.reshape(5, 1)
return grad
def update_with(self, grads, rewards):
if len(grads) < 50:
return
for i in range(len(grads)):
# Loop through everything that happened in the episode
# and update towards the log policy gradient times **FUTURE** reward
total_grad_effect = 0
for t, r in enumerate(rewards[i:]):
total_grad_effect += r * (GAMMA ** r)
self.w += LEARNING_RATE * grads[i] * total_grad_effect
def main(argv):
env = gym.make('CartPole-v0')
np.random.seed(1)
agent = Agent()
complete_scores = []
for e in range(NUM_EPISODES):
state = env.reset()[None, :]
state = agent.poly.fit_transform(state)
rewards = []
grads = []
score = 0
while True:
probs = agent.policy(state)
action_space = env.action_space.n
action = np.random.choice(action_space, p=[1 - probs, probs])
next_state, reward, done, _ = env.step(action)
next_state = next_state[None, :]
next_state = agent.poly.fit_transform(next_state.reshape(1, 4))
grad = agent.grad(probs, action, state)
grads.append(grad)
rewards.append(reward)
score += reward
state = next_state
if done:
break
agent.update_with(grads, rewards)
complete_scores.append(score)
env.close()
plt.plot(np.arange(NUM_EPISODES),
complete_scores)
plt.savefig('image1.png')
if __name__ == '__main__':
main(None)
Plotting all the learning keeps random. Nothing helps with tuning hyper parameters. Below the sample image.
References:
Deep Reinforcement Learning: Pong from Pixels
An introduction to Policy Gradients with Cartpole and Doom
Deriving Policy Gradients and Implementing REINFORCE
Machine Learning Trick of the Day (5): Log Derivative Trick 12
UPDATE
Seems like answer below could doing some work from the graphic. But it's Not log probability, and Not even gradient of the policy. And changes whole purpose of the RL Gradient Policy. Please check references above. Following the image we next statement.
I need to take a Gradient of the Log function of my Policy (which is simply weights and sigmoid activation). Please let me know in case of any questions.

The problem is with grad method.
def grad(self, probs, action, state):
dsoftmax = self.sigmoid_grad(probs)
dlog = dsoftmax / probs
grad = state.T.dot(dlog)
grad = grad.reshape(5, 1)
return grad
In the original code Softmax was used along with CrossEntropy loss function. When you switch activation to Sigmoid, the proper loss function becomes Binary CrossEntropy. Now, the purpose of grad method is to calculate gradient of the loss function wrt. weights. Sparing the details, proper gradient is given by (probs - action) * state in the terminology of your program. The last thing is to add minus sign - we want to maximize the negative of the loss function.
Proper grad method thus:
def grad(self, probs, action, state):
grad = state.T.dot(probs - action)
return -grad
Another change that you might want to add is to increase learning rate.
LEARNING_RATE = 0.0001 and NUM_EPISODES = 5000 will produce the following plot:
The convergence will be much faster if weights are initialized using Gaussian distribution with zero mean and small variance:
def __init__(self):
self.poly = PolynomialFeatures(1)
self.w = np.random.randn(5, 1) * 0.01
UPDATE
Added complete code to reproduce the results:
import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
NUM_EPISODES = 5000
LEARNING_RATE = 0.0001
GAMMA = 0.99
# noinspection PyMethodMayBeStatic
class Agent:
def __init__(self):
self.poly = PolynomialFeatures(1)
self.w = np.random.randn(5, 1) * 0.01
# Our policy that maps state to action parameterized by w
# noinspection PyShadowingNames
def policy(self, state):
z = np.sum(state.dot(self.w))
return self.sigmoid(z)
def sigmoid(self, x):
s = 1 / (1 + np.exp(-x))
return s
def sigmoid_grad(self, sig_x):
return sig_x * (1 - sig_x)
def grad(self, probs, action, state):
grad = state.T.dot(probs - action)
return -grad
def update_with(self, grads, rewards):
if len(grads) < 50:
return
for i in range(len(grads)):
# Loop through everything that happened in the episode
# and update towards the log policy gradient times **FUTURE** reward
total_grad_effect = 0
for t, r in enumerate(rewards[i:]):
total_grad_effect += r * (GAMMA ** r)
self.w += LEARNING_RATE * grads[i] * total_grad_effect
def main(argv):
env = gym.make('CartPole-v0')
np.random.seed(1)
agent = Agent()
complete_scores = []
for e in range(NUM_EPISODES):
state = env.reset()[None, :]
state = agent.poly.fit_transform(state)
rewards = []
grads = []
score = 0
while True:
probs = agent.policy(state)
action_space = env.action_space.n
action = np.random.choice(action_space, p=[1 - probs, probs])
next_state, reward, done, _ = env.step(action)
next_state = next_state[None, :]
next_state = agent.poly.fit_transform(next_state.reshape(1, 4))
grad = agent.grad(probs, action, state)
grads.append(grad)
rewards.append(reward)
score += reward
state = next_state
if done:
break
agent.update_with(grads, rewards)
complete_scores.append(score)
env.close()
plt.plot(np.arange(NUM_EPISODES),
complete_scores)
plt.savefig('image1.png')
if __name__ == '__main__':
main(None)

Build prediction model from tensors in tensorflow without inputing data

I am trying to interface CasADi and Tensorflow. CasADi is a toolbox that uses symbolic variables and does automatic differentiation. It is often used for dynamic/static optimization problems.
I found an example where GPflow is used (https://web.casadi.org/blog/tensorflow/). In this case, the GP model is firstly trained with data as follows
data = np.random.normal(loc=0.5,scale=1,size=(N,nd))
value = np.random.random((N,1))
model = gpflow.models.GPR(data, value, gpflow.kernels.Constant(nd) + gpflow.kernels.Linear(nd) + gpflow.kernels.White(nd) + gpflow.kernels.RBF(nd))
gpflow.train.ScipyOptimizer().minimize(model)
Then the prediction model is build without passing the real values but a tensor
X = tf.placeholder(shape=(1,nd),dtype=np.float64)
[mean,_] = model._build_predict(X)
Such that CasADi can substitute real values by using a callback function that calls tensorflow.
I want to use the tf.keras.Sequential() model instead of a GPflow model since I want to implement a recurrent neural network. But for the sequential model the method _build_predict(X) does not exist. I tried to use just predict but I get the following error
InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype double and shape [35039,1,8]
[[{{node Placeholder}}]]
Do you know what is the equivalent in this case?
Here the complete code using GPflow
from casadi import *
T = 10. # Time horizon
N = 20 # number of control intervals
# Declare model variables
x1 = MX.sym('x1')
x2 = MX.sym('x2')
x = vertcat(x1, x2)
u = MX.sym('u')
# Model equations
xdot = vertcat((1-x2**2)*x1 - x2 + u, x1)
# Formulate discrete time dynamics
if False:
# CVODES from the SUNDIALS suite
dae = {'x':x, 'p':u, 'ode':xdot}
opts = {'tf':T/N}
F = integrator('F', 'cvodes', dae, opts)
else:
# Fixed step Runge-Kutta 4 integrator
M = 4 # RK4 steps per interval
DT = T/N/M
f = Function('f', [x, u], [xdot])
X0 = MX.sym('X0', 2)
U = MX.sym('U')
X = X0
Q = 0
for j in range(M):
k1 = f(X, U)
k2 = f(X + DT/2 * k1, U)
k3 = f(X + DT/2 * k2, U)
k4 = f(X + DT * k3, U)
X=X+DT/6*(k1 +2*k2 +2*k3 +k4)
F = Function('F', [X0, U], [X],['x0','p'],['xf'])
# Start with an empty NLP
w=[]
w0 = []
lbw = []
ubw = []
g=[]
lbg = []
ubg = []
# "Lift" initial conditions
Xk = MX.sym('X0', 2)
w += [Xk]
lbw += [0, 1]
ubw += [0, 1]
w0 += [0, 1]
# Formulate the NLP
for k in range(N):
# New NLP variable for the control
Uk = MX.sym('U_' + str(k))
w += [Uk]
lbw += [-1]
ubw += [1]
w0 += [0]
# Integrate till the end of the interval
Fk = F(x0=Xk, p=Uk)
Xk_end = Fk['xf']
# New NLP variable for state at end of interval
Xk = MX.sym('X_' + str(k+1), 2)
w += [Xk]
lbw += [-0.25, -inf]
ubw += [ inf, inf]
w0 += [0, 0]
# Add equality constraint
g += [Xk_end-Xk]
lbg += [0, 0]
ubg += [0, 0]
nd = N+1
import gpflow
import time
from tensorflow_casadi import TensorFlowEvaluator
class GPR(TensorFlowEvaluator):
def __init__(self, model, session, opts={}):
X = tf.placeholder(shape=(1,nd),dtype=np.float64)
[mean,_] = model._build_predict(X)
mean = tf.reshape(mean,(1,1))
TensorFlowEvaluator.__init__(self,[X],[mean],session,opts)
self.counter = 0
self.time = 0
def eval(self,arg):
self.counter += 1
t0 = time.time()
ret = TensorFlowEvaluator.eval(self,arg)
self.time += time.time()-t0
return [ret]
# Create
np.random.seed(0)
data = np.random.normal(loc=0.5,scale=1,size=(N,nd))
value = np.random.random((N,1))
model = gpflow.models.GPR(data, value, gpflow.kernels.Constant(nd) + gpflow.kernels.Linear(nd) + gpflow.kernels.White(nd) + gpflow.kernels.RBF(nd))
gpflow.train.ScipyOptimizer().minimize(model)
import tensorflow as tf
with tf.Session() as session:
model.initialize()
GPR = GPR(model, session)
w = vertcat(*w)
# Create an NLP solver
prob = {'f': GPR(w[0::3]), 'x': w , 'g': vertcat(*g)}
options = {"ipopt": {"hessian_approximation": "limited-memory"}}
solver = nlpsol('solver', 'ipopt', prob,options);
# Solve the NLP
sol = solver(x0=w0, lbx=lbw, ubx=ubw, lbg=lbg, ubg=ubg)
print("Ncalls",GPR.counter)
print("Total time [s]",GPR.time)
w_opt = sol['x'].full().flatten()
# Plot the solution
x1_opt = w_opt[0::3]
x2_opt = w_opt[1::3]
u_opt = w_opt[2::3]
tgrid = [T/N*k for k in range(N+1)]
import matplotlib.pyplot as plt
plt.figure(1)
plt.clf()
plt.plot(tgrid, x1_opt, '--')
plt.plot(tgrid, x2_opt, '-')
plt.step(tgrid, vertcat(DM.nan(1), u_opt), '-.')
plt.xlabel('t')
plt.legend(['x1','x2','u'])
plt.grid()
plt.show()
and the class TensorFlowEvaluator
import casadi
import tensorflow as tf
class TensorFlowEvaluator(casadi.Callback):
def __init__(self,t_in,t_out,session, opts={}):
"""
t_in: list of inputs (tensorflow placeholders)
t_out: list of outputs (tensors dependeant on those placeholders)
session: a tensorflow session
"""
casadi.Callback.__init__(self)
assert isinstance(t_in,list)
self.t_in = t_in
assert isinstance(t_out,list)
self.t_out = t_out
self.construct("TensorFlowEvaluator", opts)
self.session = session
self.refs = []
def get_n_in(self): return len(self.t_in)
def get_n_out(self): return len(self.t_out)
def get_sparsity_in(self,i):
return casadi.Sparsity.dense(*self.t_in[i].get_shape().as_list())
def get_sparsity_out(self,i):
return casadi.Sparsity.dense(*self.t_out[i].get_shape().as_list())
def eval(self,arg):
# Associate each tensorflow input with the numerical argument passed by CasADi
d = dict((v,arg[i].toarray()) for i,v in enumerate(self.t_in))
# Evaluate the tensorflow expressions
ret = self.session.run(self.t_out,feed_dict=d)
return ret
# Vanilla tensorflow offers just the reverse mode AD
def has_reverse(self,nadj): return nadj==1
def get_reverse(self,nadj,name,inames,onames,opts):
# Construct tensorflow placeholders for the reverse seeds
adj_seed = [tf.placeholder(shape=self.sparsity_out(i).shape,dtype=tf.float64) for i in range(self.n_out())]
# Construct the reverse tensorflow graph through 'gradients'
grad = tf.gradients(self.t_out, self.t_in,grad_ys=adj_seed)
# Create another TensorFlowEvaluator object
callback = TensorFlowEvaluator(self.t_in+adj_seed,grad,self.session)
# Make sure you keep a reference to it
self.refs.append(callback)
# Package it in the nominal_in+nominal_out+adj_seed form that CasADi expects
nominal_in = self.mx_in()
nominal_out = self.mx_out()
adj_seed = self.mx_out()
return casadi.Function(name,nominal_in+nominal_out+adj_seed,callback.call(nominal_in+adj_seed),inames,onames)
if __name__=="__main__":
from casadi import *
a = tf.placeholder(shape=(2,2),dtype=tf.float64)
b = tf.placeholder(shape=(2,1),dtype=tf.float64)
y = tf.matmul(tf.sin(a), b)
with tf.Session() as session:
f_tf = TensorFlowEvaluator([a,b], [y], session)
a = MX.sym("a",2,2)
b = MX.sym("a",2,1)
y = f_tf(a,b)
yref = mtimes(sin(a),b)
f = Function('f',[a,b],[y])
fref = Function('f',[a,b],[yref])
print(f(DM([[1,2],[3,4]]),DM([[1],[3]])))
print(fref(DM([[1,2],[3,4]]),DM([[1],[3]])))
f = Function('f',[a,b],[jacobian(y,a)])
fref = Function('f',[a,b],[jacobian(yref,a)])
print(f(DM([[1,2],[3,4]]),DM([[1],[3]])))
print(fref(DM([[1,2],[3,4]]),DM([[1],[3]])))
And here is my attempt:
# design network
model = tf.keras.Sequential()
LSTM = tf.keras.layers.LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2]))
model.add(LSTM) #, input_shape=(train_X.shape[1], train_X.shape[2]))
model.add(tf.keras.layers.Dense(1))
model.compile(loss='mae', optimizer='adam')
# fit network
history = model.fit(train_X, train_y, epochs=50, batch_size=72, validation_data=(test_X, test_y), verbose=0, shuffle=False)
with tf.Session() as session:
testXshape = test_X.shape
GPR = GPR(model, session,testXshape)
Thanks!

I've let the TensorFlowEvaluator the same and created the GPR class this way:
class ValFcn(TensorFlowEvaluator):
import tensorflow as tf
def __init__(self, NN, session, opts={}):
self.X = self.tf.placeholder(shape=(1,4), dtype=self.tf.float32)
self.output = NN(self.X)
TensorFlowEvaluator.__init__(self, [self.X], [self.output], session, opts)
def eval(self, arg):
ret = TensorFlowEvaluator.eval(self, arg)
return ret
I was working with float32 so I had to change it there and in the TensorFlowEvaluator.
I'm actually using this model as a cost function term for an OCP.
Hope it works!

Scipy minimize constraint in pandas df

I am trying to run the minimization process found in this publication. The equation is seen on page 6 of the document 16 of the pdf.
I have a dataframe that looks like the below
df = pd.DataFrame({'h_t':[7.06398,6.29948,5.04570,6.20774,4.80106],
'p_atm':[101057.772801,101324.416001,101857.702401,101724.380801,101991.024001],
'q_p':[5.768132,3.825600,2.772215,5.830429,2.619304],
'q_s':[2.684433,3.403679,2.384275,1.008078,2.387106],
'tdg_f':[117.678100,110.131579,108.376963,103.669725,113.594771],
'tdg_tw':[121.052635,119.710907,114.921463,112.156868,115.444900],
'temp_water':[11.92,19.43,16.87,7.45,11.83]})
I have a constraint that says the below function must be positive where b1 and b3 are the coefficients I am optimizing.
def q_ge(q_p,q_s,b1,b3):
return min(q_p,(b1*q_s+b3))
I wrote my constraint below, but I am not sure if it is right.
def constraint_q_ge(x):
b1,b2,b3=x
power_flow = df.apply(lambda x:q_ge(x['q_p'],x['q_s'],b1,b3), axis = 1)
const = power_flow<0
return -const.sum()
Is this correct? I run the function on all rows and check if any are less than 0 and sum this. The negative of that sum should be greater than or equal to 0. If there is even a single value less than 0 this constraint is not met.
EDIT:
Below is the full problem.
from scipy.constants import g as gravity
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy.optimize import minimize
import warnings
try:
from numpy import any as _any
except ImportError:
def _any(arg):
if arg is True:
return True
if arg is False:
return False
return any(arg)
def water_density(T=None, T0=None, units=None, a=None,
just_return_a=False, warn=True):
if units is None:
K = 1
m = 1
kg = 1
else:
K = units.Kelvin
m = units.meter
kg = units.kilogram
if T is None:
T = 298.15*K
m3 = m**3
if a is None:
a = (-3.983035*K, # C
301.797*K, # C
522528.9*K*K, # C**2
69.34881*K, # C
999.974950*kg/m3)
if just_return_a:
return a
if T0 is None:
T0 = 273.15*K
t = T - T0
if warn and (_any(t < 0*K) or _any(t > 40*K)):
warnings.warn("Temperature is outside range (0-40 degC)")
return a[4]*(1-((t + a[0])**2*(t + a[1]))/(a[2]*(t + a[3])))
def celsius_to_kelvin(t_celsius):
return t_celsius+273.15
def tailwater(h_t, temp_water, p_atm):
t_water_kelvin = celsius_to_kelvin(temp_water)
rho = water_density(t_water_kelvin)
g = gravity
return (1+(rho*g*h_t)/(2*p_atm))
def tailwater_tdg(q_s,q_p,x, h_t,temp_water,p_atm,tdg_f):
b1,b2,b3=x
A = ((q_s+b1*q_s+b3)/(q_s+q_p))
B = tailwater(h_t, temp_water, p_atm)
C = ((q_p-b1*q_s-b3)/(q_s+q_p))
return 100*A*B*b2+tdg_f*C
def q_ge(q_p,q_s,b1,b3):
return min(q_p,(b1*q_s+b3))
def rmse(y, y_hat):
return sqrt(mean_squared_error(y,y_hat))
def objective(x):
y_hat = df.apply(lambda r:tailwater_tdg(q_s=r['q_s'],q_p=r['q_p'],x=x, h_t=r['h_t'],temp_water=r['temp_water'],p_atm=r['p_atm'],tdg_f=r['tdg_f']), axis = 1)
y = df['tdg_tw']
return rmse(y, y_hat)
#constraints and bounds for optimization model. See reference for more information
def constraint_q_ge(x):
b1,b2,b3=x
power_flow = df.apply(lambda x:q_ge(x['q_p'],x['q_s'],b1,b3), axis = 1)
const = power_flow<0
return -const.sum()
constraints = [{'type':'ineq', 'fun':constraint_q_ge}]
bounds = [(-500,10000),(.00001,10000),(-500,10000)]
x0=[1,1,1]
sol = minimize(objective, x0, method = 'SLSQP',constraints = constraints, bounds = bounds,options={'disp':True, 'maxiter':100})

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

convergence issue encountered while implementing policy iteration from scratch - python

Related

Inconsistency in ODE solution using solve_ivp when adding events

Model doesn't seem to learn

Understanding Gradient Policy Deriving

Build prediction model from tensors in tensorflow without inputing data

Scipy minimize constraint in pandas df

Categories

Resources