I'm trying to implement the Soft Actor-Critic (SAC) algorithm for a discrete action space, and I'm having trouble with the loss function.
Here is the link to SAC for a continuous action space:
https://spinningup.openai.com/en/latest/algorithms/sac.html
I do not know what I'm doing wrong.
The problem is that the networks do not learn anything on the CartPole environment.
The full code is on GitHub: https://github.com/tk2232/sac_discrete/blob/master/sac_discrete.py
Here is my idea of how to calculate the loss for discrete actions.
Value Network
class ValueNet:
def __init__(self, sess, state_size, hidden_dim, name):
self.sess = sess
with tf.variable_scope(name):
self.states = tf.placeholder(dtype=tf.float32, shape=[None, state_size], name='value_states')
self.targets = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='value_targets')
x = Dense(units=hidden_dim, activation='relu')(self.states)
x = Dense(units=hidden_dim, activation='relu')(x)
self.values = Dense(units=1, activation=None)(x)
optimizer = tf.train.AdamOptimizer(0.001)
loss = 0.5 * tf.reduce_mean((self.values - tf.stop_gradient(self.targets)) ** 2)
self.train_op = optimizer.minimize(loss, var_list=_params(name))
def get_value(self, s):
return self.sess.run(self.values, feed_dict={self.states: s})
def update(self, s, targets):
self.sess.run(self.train_op, feed_dict={self.states: s, self.targets: targets})
In the Q network I gather the Q-values corresponding to the collected actions.
Example
q_out = [[0.5533, 0.4444], [0.2222, 0.6666]]
collected_actions = [0, 1]
gather = [[0.5533], [0.6666]]
gather function
def gather_tensor(params, idx):
idx = tf.stack([tf.range(tf.shape(idx)[0]), idx[:, 0]], axis=-1)
params = tf.gather_nd(params, idx)
return params
Q Network
class SoftQNetwork:
def __init__(self, sess, state_size, action_size, hidden_dim, name):
self.sess = sess
with tf.variable_scope(name):
self.states = tf.placeholder(dtype=tf.float32, shape=[None, state_size], name='q_states')
self.targets = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='q_targets')
self.actions = tf.placeholder(dtype=tf.int32, shape=[None, 1], name='q_actions')
x = Dense(units=hidden_dim, activation='relu')(self.states)
x = Dense(units=hidden_dim, activation='relu')(x)
x = Dense(units=action_size, activation=None)(x)
self.q = tf.reshape(gather_tensor(x, self.actions), shape=(-1, 1))
optimizer = tf.train.AdamOptimizer(0.001)
loss = 0.5 * tf.reduce_mean((self.q - tf.stop_gradient(self.targets)) ** 2)
self.train_op = optimizer.minimize(loss, var_list=_params(name))
def update(self, s, a, target):
self.sess.run(self.train_op, feed_dict={self.states: s, self.actions: a, self.targets: target})
def get_q(self, s, a):
return self.sess.run(self.q, feed_dict={self.states: s, self.actions: a})
Policy Net
class PolicyNet:
def __init__(self, sess, state_size, action_size, hidden_dim):
self.sess = sess
with tf.variable_scope('policy_net'):
self.states = tf.placeholder(dtype=tf.float32, shape=[None, state_size], name='policy_states')
self.targets = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='policy_targets')
self.actions = tf.placeholder(dtype=tf.int32, shape=[None, 1], name='policy_actions')
x = Dense(units=hidden_dim, activation='relu')(self.states)
x = Dense(units=hidden_dim, activation='relu')(x)
self.logits = Dense(units=action_size, activation=None)(x)
dist = Categorical(logits=self.logits)
optimizer = tf.train.AdamOptimizer(0.001)
# Get action
self.new_action = dist.sample()
self.new_log_prob = dist.log_prob(self.new_action)
# Calc loss
log_prob = dist.log_prob(tf.squeeze(self.actions))
loss = tf.reduce_mean(tf.squeeze(self.targets) - 0.2 * log_prob)
self.train_op = optimizer.minimize(loss, var_list=_params('policy_net'))
def get_action(self, s):
action = self.sess.run(self.new_action, feed_dict={self.states: s[np.newaxis, :]})
return action[0]
def get_next_action(self, s):
next_action, next_log_prob = self.sess.run([self.new_action, self.new_log_prob], feed_dict={self.states: s})
return next_action.reshape((-1, 1)), next_log_prob.reshape((-1, 1))
def update(self, s, a, target):
self.sess.run(self.train_op, feed_dict={self.states: s, self.actions: a, self.targets: target})
Update function
def soft_q_update(batch_size, frame_idx):
gamma = 0.99
alpha = 0.2
state, action, reward, next_state, done = replay_buffer.sample(batch_size)
action = action.reshape((-1, 1))
reward = reward.reshape((-1, 1))
done = done.reshape((-1, 1))
Q_target
v_ = value_net_target.get_value(next_state)
q_target = reward + (1 - done) * gamma * v_
V_target
next_action, next_log_prob = policy_net.get_next_action(state)
q1 = soft_q_net_1.get_q(state, next_action)
q2 = soft_q_net_2.get_q(state, next_action)
q = np.minimum(q1, q2)
v_target = q - alpha * next_log_prob
Policy_target
q1 = soft_q_net_1.get_q(state, action)
q2 = soft_q_net_2.get_q(state, action)
policy_target = np.minimum(q1, q2)
Since the algorithm is generic to both discrete and continuous policies, the key idea is that we need a discrete distribution that is reparametrizable. The extension then involves minimal code modification from the continuous SAC: just change the policy distribution class.
There is one such distribution: the Gumbel-Softmax distribution. PyTorch does not have this built in, so I simply extend it from a close cousin which has the right rsample() and add a correct log-prob calculation method. With the ability to calculate a reparametrized action and its log prob, SAC works beautifully for discrete actions with minimal extra code, as seen below.
def calc_log_prob_action(self, action_pd, reparam=False):
'''Calculate log_probs and actions with option to reparametrize from paper eq. 11'''
samples = action_pd.rsample() if reparam else action_pd.sample()
if self.body.is_discrete: # this is straightforward using GumbelSoftmax
actions = samples
log_probs = action_pd.log_prob(actions)
else:
mus = samples
actions = self.scale_action(torch.tanh(mus))
# paper Appendix C. Enforcing Action Bounds for continuous actions
log_probs = (action_pd.log_prob(mus) - torch.log(1 - actions.pow(2) + 1e-6).sum(1))
return log_probs, actions
# ... for discrete action, GumbelSoftmax distribution
class GumbelSoftmax(distributions.RelaxedOneHotCategorical):
'''
A differentiable Categorical distribution using reparametrization trick with Gumbel-Softmax
Explanation http://amid.fish/assets/gumbel.html
NOTE: use this in place of PyTorch's RelaxedOneHotCategorical distribution since its log_prob is not working right (returns positive values)
Papers:
[1] The Concrete Distribution: A Continuous Relaxation of Discrete Random Variables (Maddison et al, 2017)
[2] Categorical Reparametrization with Gumbel-Softmax (Jang et al, 2017)
'''
def sample(self, sample_shape=torch.Size()):
'''Gumbel-softmax sampling. Note rsample is inherited from RelaxedOneHotCategorical'''
u = torch.empty(self.logits.size(), device=self.logits.device, dtype=self.logits.dtype).uniform_(0, 1)
noisy_logits = self.logits - torch.log(-torch.log(u))
return torch.argmax(noisy_logits, dim=-1)
def log_prob(self, value):
'''value is one-hot or relaxed'''
if value.shape != self.logits.shape:
value = F.one_hot(value.long(), self.logits.shape[-1]).float()
assert value.shape == self.logits.shape
return - torch.sum(- value * F.log_softmax(self.logits, -1), -1)
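For clarity, here is a small usage sketch of the GumbelSoftmax distribution above (the shapes and names are made up for illustration; it assumes torch is imported and the class is in scope):
import torch

logits = torch.randn(8, 4)                        # e.g. policy-network output: 8 states, 4 actions
pd = GumbelSoftmax(temperature=torch.tensor(1.0), logits=logits)

hard_action = pd.sample()                         # integer action indices, used to step the env
soft_action = pd.rsample()                        # relaxed one-hot sample, differentiable
log_prob = pd.log_prob(soft_action)               # per-sample log-probability for the SAC losses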
And here are the LunarLander results. SAC learns to solve it really fast.
The full implementation code is in SLM Lab at https://github.com/kengz/SLM-Lab/blob/master/slm_lab/agent/algorithm/sac.py
The SAC benchmark results on Roboschool (continuous) and LunarLander (discrete) are shown here: https://github.com/kengz/SLM-Lab/pull/399
There is a paper about SAC with discrete action spaces. It shows that SAC for discrete action spaces doesn't need re-parametrization tricks like Gumbel-Softmax; instead, SAC needs a few modifications (the losses become exact expectations over the discrete policy). Please refer to the paper for more details; a rough sketch of the idea is given below the links.
Paper /
Author's implementation (without code for Atari) /
Reproduction (with code for Atari)
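Very roughly, instead of sampling a reparametrized action, the expectations in the SAC losses are taken exactly over the discrete action probabilities. Below is a minimal PyTorch sketch of that idea; the function and tensor names are illustrative only (not from the paper's code), and q1, q2, the target-network Q outputs, probs and log_probs are assumed to be [batch, num_actions] tensors:
import torch

def discrete_sac_losses(q1, q2, q1_targ_next, q2_targ_next,
                        probs, log_probs, next_probs, next_log_probs,
                        actions, rewards, dones, gamma=0.99, alpha=0.2):
    # Soft value of the next state: an exact expectation over the discrete policy,
    # so no reparametrized sample is needed.
    v_next = (next_probs * (torch.min(q1_targ_next, q2_targ_next)
                            - alpha * next_log_probs)).sum(dim=1)
    q_target = rewards + gamma * (1.0 - dones) * v_next

    # Critic losses on the Q-values of the actions actually taken.
    q1_taken = q1.gather(1, actions.long().unsqueeze(1)).squeeze(1)
    q2_taken = q2.gather(1, actions.long().unsqueeze(1)).squeeze(1)
    q1_loss = ((q1_taken - q_target.detach()) ** 2).mean()
    q2_loss = ((q2_taken - q_target.detach()) ** 2).mean()

    # Policy loss: again an exact expectation over actions rather than a sampled estimate.
    # (In practice, step only the policy parameters with this loss, or detach the Q-values.)
    policy_loss = (probs * (alpha * log_probs - torch.min(q1, q2))).sum(dim=1).mean()
    return q1_loss, q2_loss, policy_loss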
I hope it helps you.
This repo may also be helpful. Its description says it contains an implementation of SAC for a discrete action space in PyTorch. There is a file with the SAC algorithm for a continuous action space and a file with SAC adapted for a discrete action space.
PyTorch 1.8 has RelaxedOneHotCategorical, which supports re-parameterized sampling using the Gumbel-Softmax trick.
import torch
import torch.nn as nn
from torch.distributions import RelaxedOneHotCategorical
class Policy(nn.Module):
def __init__(self, input_dims, hidden_dims, actions):
super().__init__()
self.mlp = nn.Sequential(nn.Linear(input_dims, hidden_dims), nn.SELU(inplace=True),
nn.Linear(hidden_dims, hidden_dims), nn.SELU(inplace=True),
nn.Linear(hidden_dims, actions))
def forward(self, state):
logits = torch.log_softmax(self.mlp(state), dim=-1)
return RelaxedOneHotCategorical(logits=logits, temperature=torch.ones(1) * 1.0)
>>> policy = Policy(4, 16, 2)
>>> a_dist = policy(torch.randn(8, 4))
>>> a_dist.rsample()
tensor([[0.0353, 0.9647],
[0.1348, 0.8652],
[0.1110, 0.8890],
[0.4956, 0.5044],
[0.6941, 0.3059],
[0.6126, 0.3874],
[0.2932, 0.7068],
[0.0498, 0.9502]], grad_fn=<ExpBackward>)
Related
I am trying to get to grips with PyTorch and I wanted to try to reproduce this code:
https://github.com/andy-psai/MountainCar_ActorCritic/blob/master/RL%20Blog%20FINAL%20MEDIUM%20code%2002_12_19.ipynb
in PyTorch.
I am having a problem in that this error is being returned:
RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.
A similar question said to use zero_grad() again after the optimizer step, but this hasn't resolved the issue.
I've included the entire code below, so hopefully it should be reproducible.
Any advice would be much appreciated.
import gym
import os
import os.path as osp
import time
import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
env = gym.envs.make("MountainCarContinuous-v0")
# Value function
class Value(nn.Module):
def __init__(self, dim_states):
super(Value, self).__init__()
self.net = nn.Sequential(
nn.Linear(dim_states, 400),
nn.ReLU(),
nn.Linear(400,400),
nn.ReLU(),
nn.Linear(400, 1)
)
self.optimizer = optim.Adam(self.parameters(), lr = 1e-3)
self.criterion = nn.MSELoss()
def forward(self, state):
return self.net(torch.from_numpy(state).float())
def compute_return(self, output, target):
self.optimizer.zero_grad()
loss = self.criterion(output, target)
loss.backward()
self.optimizer.step()
self.optimizer.zero_grad()
# Policy network
class Policy(nn.Module):
def __init__(self, dim_states, env):
super(Policy, self).__init__()
self.hidden1 = nn.Linear(dim_states, 40)
self.hidden2 = nn.Linear(40, 40)
self.mu = nn.Linear(40, 1)
self.sigma = nn.Linear(40,1)
self.env = env
self.optimizer = optim.Adam(self.parameters(), lr = 2e-5)
def forward(self, state):
state = torch.from_numpy(state).float()
x = F.relu(self.hidden1(state))
x = F.relu(self.hidden2(x))
mu = self.mu(x)
sigma = F.softmax(self.sigma(x), dim=-1)
action_dist = Normal(mu, sigma)
action_var = action_dist.rsample()
action_var = torch.clip(action_var,
self.env.action_space.low[0],
self.env.action_space.high[0])
return action_var, action_dist
def compute_return(self, action, dist, td_error):
self.optimizer.zero_grad()
loss_actor = -dist.log_prob(action)*td_error
loss_actor.backward()
self.optimizer.step()
self.optimizer.zero_grad()
# Normalise the state space
import sklearn
import sklearn.preprocessing
state_space_samples = np.array(
[env.observation_space.sample() for x in range(10000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(state_space_samples)
# Normaliser
def scale_state(state):
scaled = scaler.transform([state])
return scaled
##################################
# Parameters
lr_actor = 0.00002
lr_critic = 0.001
actor = Policy(2, env)
critic = Value(2)
# Training loop params
gamma = 0.99
num_episodes = 300
episode_history = []
for episode in range(num_episodes):
# Receive initial state from E
state = env.reset()
reward_total = 0
steps = 0
done = False
while not done:
action, dist = actor(state)
# print(np.squeeze(action))
next_state, reward, done, _ = env.step(
np.array([action.item()]))
if episode % 50 == 0:
env.render()
steps += 1
reward_total += reward
# TD Target
target = reward + gamma * np.squeeze(critic(next_state), axis=0)
td_error = target - np.squeeze(critic(state), axis=0)
# Update actor
actor.compute_return(action, dist, td_error)
# Update critic
critic.compute_return(np.squeeze(critic(state), axis=0), target)
episode_history.append(reward_total)
print(f"Episode: {episode}, N Steps: {steps}, Cumulative reward {reward_total}")
if np.mean(episode_history[-100:]) > 90 and len(episode_history) > 101:
print("Solved")
print(f"Mean cumulative reward over 100 episodes {np.mean(episode_history[-100:])}")
The problem lies in this snippet. When you create the target variable, there is a forward pass through the critic which builds a computation graph, and critic(next_state) is a node of that graph, making target part of the graph (you can check this by printing target, which will show grad_fn=<AddBackward0>). Then, when you call critic.compute_return(critic_out, target), a new computation graph is generated, and passing target (which is part of the previous computation graph) causes the RuntimeError.
The solution is to call detach() on critic(next_state); this detaches target from the graph so it is no longer part of it (again, check by printing target).
target = reward + gamma * np.squeeze(critic(next_state).detach(), axis=0)
td_error = target - np.squeeze(critic(state), axis=0)
# Update actor
actor.compute_return(action, dist, td_error)
# Update critic
critic_out = np.squeeze(critic(state), axis=0)
print(critic_out)
critic.compute_return(critic_out, target)
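An equivalent approach is to compute the bootstrap target under torch.no_grad(), so it never becomes part of the autograd graph in the first place; a minimal sketch:
# Equivalent alternative: build the TD target outside the autograd graph.
with torch.no_grad():
    target = reward + gamma * critic(next_state).squeeze(0)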
I know there are many similar topics discussed on Stack Overflow, but I have done quite a lot of research both on Stack Overflow and on the Internet, and I couldn't find a solution.
I am trying to implement the classic Deep Q Learning Algorithm to solve the openAI gym's cartpole game:
OpenAI Gym Cartpole
Firstly, I created an agent that generates random weights. The results are shown in the graph below:
Amazingly, the agent managed to reach 200 steps (which is the max) in many episodes by simply generating 4 random uniform weights [w1, w2, w3, w4] from (-1.0 to 1.0) in each episode.
So, I decided to implement a simple DQN with only 4 weights and 2 biases and to make the agent learn this game over time. The weights are initialized randomly at the beginning, and back-propagation is used to update them as the agent takes steps.
I used the epsilon-greedy strategy to make the agent explore at the beginning and exploit the Q-values later on. However, the results are disappointing compared to the random agent:
I have tried tuning a lot of parameters and different architectures, and the result doesn't change much. So, my question is the following:
Question:
Is my implementation of DQN wrong, or can a simple DQN simply not beat CartPole? What's your experience? It does reduce the loss (error), but that doesn't guarantee a good solution.
Thanks in advance.
import tensorflow as tf
import gym
import numpy as np
import random as rand
import matplotlib.pyplot as plt
# Cartpole's Observation:
# 4 Inputs
# 2 Actions (LEFT | RIGHT)
input_size = 4
output_size = 2
# Deep Q Network Class
class DQN:
def __init__(self, var_names):
self.var_names = var_names
self._define_placeholders()
self._add_layers()
self._define_loss()
self._choose_optimizer()
self._initialize()
# Placeholders:
# Inputs: The place where we feed the Observations (States).
# Targets: Q_target = R + gamma*Q(s', a*).
def _define_placeholders(self):
self.inputs = tf.placeholder(tf.float32, shape=(None, input_size), name='inputs')
self.targets = tf.placeholder( tf.float32, shape=(None, output_size), name='targets')
# Layers:
# 4 Input Weights.
# 2 Biases.
# output = softmax(inputs*weights + biases).
# Weights and biases are initialized randomly.
def _add_layers(self):
w = tf.get_variable(name=self.var_names[0], shape=(input_size, output_size),
initializer=tf.initializers.random_uniform(minval=-1.0, maxval=1.0) )
b = tf.get_variable(name=self.var_names[1], shape=(output_size),
initializer=tf.initializers.random_uniform(minval=-1.0, maxval=1.0) )
self.outputs = tf.nn.softmax(tf.matmul(self.inputs, w) + b)
self.prediction = tf.argmax(self.outputs, 1)
# Loss = MSE.
def _define_loss(self):
self.mean_loss = tf.losses.mean_squared_error(labels=self.targets, predictions=self.outputs) / 2
# AdamOptimizer with starting learning rate: a = 0.005.
def _choose_optimizer(self):
self.optimizer = tf.train.AdamOptimizer(learning_rate=0.005).minimize(loss=self.mean_loss)
# Initializes the dqn's weights.
def _initialize(self):
initializer = tf.global_variables_initializer()
self.sess = tf.InteractiveSession()
self.sess.run(initializer)
# Gets the current DQN's weights.
def get_weights(self):
return [ self.sess.run( tf.trainable_variables(var) )[0] for var in self.var_names ]
# Updates the weights of DQN.
def update_weights(self, new_weights):
variables = [tf.trainable_variables(name)[0] for name in self.var_names]
update = [ tf.assign(var, weight) for (var, weight) in zip(variables, new_weights) ]
self.sess.run(update)
# Predicts the best possible action from a state s.
# a* = argmax( Q(s) )
# Returns from Q(s), a*
def predict(self, states):
Q, actions = self.sess.run( [self.outputs, self.prediction],
feed_dict={self.inputs: states} )
return Q, actions
# It partially fits the given observations and the targets into the network.
def partial_fit(self, states, targets):
_, loss = self.sess.run( [self.optimizer, self.mean_loss],
feed_dict={self.inputs: states, self.targets: targets} )
return loss
# Replay Memory Buffer
# It stores experiences as (s,a,r,s') --> (State, Action, Reward, Next_State).
# It generates random mini-batches of experiences from the memory.
# If the memory is full, it deletes the oldest experiences. An experience is one step.
class ReplayMemory:
def __init__(self, mem_size):
self.mem_size = mem_size
self.experiences = []
def add_experience(self, xp):
self.experiences.append(xp)
if len(self.experiences) > self.mem_size:
self.experiences.pop(0)
def random_batch(self, batch_size):
if len(self.experiences) < batch_size:
return self.experiences
else:
return rand.sample(self.experiences, batch_size)
# The agent's class.
# It contains 2 DQNs: Online DQN for Predictions and Target DQN for the targets.
class Agent:
def __init__(self, epsilon, epsilon_decay, min_epsilon, gamma, mem_size):
self.epsilon = epsilon
self.epsilon_decay = epsilon_decay
self.min_epsilon = min_epsilon
self.gamma = gamma
self.replay_mem = ReplayMemory(mem_size)
self.online_dqn = DQN( var_names=['online_w', 'online_b'] )
self.target_dqn = DQN( var_names=['target_w', 'target_b'] )
self.state = None
def set_epsilon(self, epsilon):
self.epsilon = epsilon
def reduce_epsilon(self):
if self.epsilon > self.min_epsilon:
self.epsilon -= self.epsilon_decay
def update_state(self, state):
self.state = state
def update_memory(self, state, action, reward, next_state):
experience = (state, action, reward, next_state)
self.replay_mem.add_experience(experience)
# It updates the target network after N steps.
def update_network(self):
self.target_dqn.update_weights( self.online_dqn.get_weights() )
# Randomly chooses an action from the environment.
def explore(self, env):
action = env.action_space.sample()
return action
# Predicts and chooses the best possible moves from the current state.
def exploit(self):
_, action = self.online_dqn.predict(self.state)
return action[0]
# Uses Epsilon-Greedy to decide whether to explore or exploit.
# Epsilon starts with 1 and is reduced over the time.
# After the agent makes a move, he returns: state, action, reward, next_state.
def take_action(self, env):
action = None
p = rand.uniform(0.0, 1.0)
if p < self.epsilon:
action = self.explore(env)
else:
action = self.exploit()
next_state, reward, done, _ = env.step(action)
if done:
next_state = None
else:
next_state = np.reshape( next_state, (1, input_size) )
return self.state, action, reward, next_state, done
# Trains the agent.
# A random mini-batch is generated from the memory.
# We feed each experience into the DQN.
# For each
# Q(s) = Qtarget(s)
# Q(s'), a* = Qtarget(s'), argmax Q(s')
# We set targets = Q(s')
# For each action (a), reward (r), next_state (s') in the batch:
# If s' is None, the game is over. So, we set target[i] = Reward
# If s' != None, then target[i][a] = r + gamma*Q(s', 'a')
# Then, the online DQN calculates the mean squared difference of r + gamma*Q(s', 'a') - Q(s, a)
# and uses Back-Propagation to update the weights.
def train(self):
mini_batch = self.replay_mem.random_batch(batch_size=256)
batch_size = len(mini_batch)
states = np.zeros( shape=(batch_size, input_size) )
next_states = np.zeros( shape=(batch_size, input_size) )
for i in range(batch_size):
states[i] = mini_batch[i][0]
next_states[i] = mini_batch[i][3]
Q, _ = self.target_dqn.predict(states)
next_Q, next_actions = self.target_dqn.predict(next_states)
targets = Q
for i in range(batch_size):
action = mini_batch[i][1]
reward = mini_batch[i][2]
next_state = mini_batch[i][3]
if next_state is None:
targets[i][action] = reward
else:
targets[i][action] = reward + self.gamma * next_Q[i][ next_actions[i] ]
loss = self.online_dqn.partial_fit(states, targets)
return loss
def play(agent, env, episodes, N, render=False, train=True):
ep = 0
episode_steps = []
steps = 0
total_steps = 0
loss = 0
# Sets the current state as the initial.
# Cartpole spawns the agent in a random state.
agent.update_state( np.reshape( env.reset(), (1, input_size) ) )
agent.update_network()
while ep < episodes:
if render:
env.render()
# The target DQN's weights are frozen.
# The agent Updates the Target DQN's Weights after 100 steps.
if train and total_steps % N == 0:
agent.update_network()
print('---Target network updated---')
# Takes action.
state, action, reward, next_state, done = agent.take_action(env)
# Updates the memory and the current state.
agent.update_memory(state, action, reward, next_state)
agent.update_state(next_state)
steps += 1
total_steps += 1
if train:
loss = agent.train()
if done:
agent.update_state( np.reshape( env.reset(), (1, input_size) ) )
episode_steps.append(steps)
ep += 1
if train:
agent.reduce_epsilon()
print('End of episode', ep, 'Training loss =', loss, 'Steps =', steps)
steps = 0
if render:
env.close()
return episode_steps
env = gym.make('CartPole-v0')
# Training the agent.
agent = Agent(epsilon=1, epsilon_decay = 0.01, min_epsilon = 0.05, gamma=0.9, mem_size=50000)
episodes = 1000
N = 100
episode_steps = play(agent, env, episodes, N)
# Plotting the results.
# After the training is done, the steps should be maximized (up to 200)
plt.plot(episode_steps)
plt.show()
# Testing the agent.
agent.set_epsilon(0)
episodes = 1
steps = play(agent, env, episodes, N, render=True, train=False)[0]
print('\nSteps =', steps)
The algorithm works quite well. When I decided to plot the data, I used the following metric:
Rewards / Episode
Most deep reinforcement learning frameworks (e.g. tf-agents) use the mean reward (e.g. the mean reward over the last 10 episodes), which is why their plots look so smooth. If you look at the above plot, the agent manages to get a high score most of the time.
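For reference, a smoothed curve like that can be produced with a simple moving average over the episode_steps list returned by play() (the window size here is arbitrary):
import numpy as np
import matplotlib.pyplot as plt

window = 10  # arbitrary smoothing window
smoothed = np.convolve(episode_steps, np.ones(window) / window, mode='valid')
plt.plot(smoothed)
plt.xlabel('Episode')
plt.ylabel('Mean steps over last {} episodes'.format(window))
plt.show()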
Also, I have decided to improve the speed of the algorithm by using NumPy operations rather than "for" loops; a vectorized sketch of the target computation follows the link below. You can check out my implementation here:
https://github.com/kochlisGit/Deep-Reinforcement-Learning/tree/master/Custom%20DQN
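Here is a rough sketch of what the vectorized target construction can look like in place of the per-sample loop inside train(). It assumes that actions, rewards and a boolean dones array have also been extracted from the mini-batch as NumPy arrays (the replay buffer above does not store dones explicitly, so that part is an assumption):
# Sketch of a vectorized replacement for the per-sample loop inside train().
targets = Q.copy()
rows = np.arange(batch_size)
bootstrapped = rewards + self.gamma * next_Q[rows, next_actions]
targets[rows, actions] = np.where(dones, rewards, bootstrapped)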
I'm trying to train a model, and it doesn't work because weights aren't updating when I call the following:
self.optimizer = Adam(self.PPO.parameters(), lr=0.1, eps=epsilon)
total_loss = Variable(policy_loss + 0.5*value_loss - entropy_loss.mean() * 0.01, requires_grad=True)
self.optimizer.zero_grad()
(total_loss * 10).backward()
self.optimizer.step()
When I print the weights, they're all the same (the loss isn't zero, and the learning rate is set to 0.1), and comparing them (even with clone() called on each param) always returns True. The total loss has a grad_fn attribute too... The optimizer is created in the constructor of my agent class.
My code is based on this repository:
https://github.com/andreiliphd/tennis-ppo/blob/master/agent.py
This is my agent constructor:
def __init__(self, PPO, learning_rate, epsilon, discount_rate, entropy_coefficient, ppo_clip, gradient_clip,
rollout_length, tau):
self.PPO = PPO
self.learning_rate = learning_rate
self.epsilon = epsilon
self.discount_rate = discount_rate
self.entropy_coefficient = entropy_coefficient
self.ppo_clip = 0.2
self.gradient_clip = 5
self.rollout_length = rollout_length
self.tau = tau
self.optimizer = Adam(self.PPO.actor.parameters(), lr=0.1, eps=epsilon)
self.device = torch.device('cpu')
This is my PPO class, which creates two networks, each with a forward function and some hidden layers:
class PPO(nn.Module):
def __init__(self, state_shape, action_num, mlp_layers, device=torch.device('cpu')):
super(PPO, self).__init__()
self.state_shape = state_shape
self.action_num = action_num
self.mlp_layers = mlp_layers
self.device = torch.device('cpu')
layer_dims = [np.prod(self.state_shape)] + self.mlp_layers
self.actor = PPO_Network(state_shape, action_num, layer_dims, True)
self.actor = self.actor.to(device)
self.critic = PPO_Network(state_shape, 1, layer_dims, False)
self.critic = self.critic.to(device)
self.to(device)
Any indications on why this is happening, and what I am overlooking are very welcome. :)
I can give more info or code if needed.
I fixed this :)
It was very silly: I had converted one of the values returned by my networks to NumPy and then converted it back to a tensor, which cut it off from the computation graph. It took me a while to realize because of some messy code.
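For anyone hitting the same symptom, here is a minimal illustration of why the NumPy round trip silently breaks training (the names are just for the example):
import torch

net_out = torch.randn(3, requires_grad=True) * 2    # stands in for a network output
print(net_out.grad_fn)                              # <MulBackward0 ...>: connected to the graph

# Converting to NumPy and back creates a brand-new leaf tensor with no history:
round_trip = torch.tensor(net_out.detach().numpy())
print(round_trip.grad_fn)                           # None: gradients can no longer reach the
                                                    # network, so the weights never update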
Short Description of my model
I am trying to write my own DQN algorithm in Python using TensorFlow, following the paper (Mnih et al., 2015). The train_DQN function defines the training procedure, and DQN_CartPole defines the function approximator (a simple 3-layer neural network). For the loss function, Huber loss or MSE is implemented, followed by gradient clipping (between -1 and 1). I have also implemented a soft-update method for the target network instead of a hard update that copies the weights from the main network.
Question
I am trying it on the CartPole environment (OpenAI Gym), but the reward does not improve as it does with other people's algorithms, such as keras-rl. Any help will be appreciated.
reward over timestep
If possible, could you have a look at the source code?
DQN model: https://github.com/Rowing0914/TF_RL/blob/master/agents/DQN_model.py
Training Script: https://github.com/Rowing0914/TF_RL/blob/master/agents/DQN_train.py
Reddit post: https://www.reddit.com/r/reinforcementlearning/comments/ba7o55/question_dqn_algorithm_does_not_work_well_on/?utm_source=share&utm_medium=web2x
class Parameters:
def __init__(self, mode=None):
assert mode != None
print("Loading Params for {} Environment".format(mode))
if mode == "Atari":
self.state_reshape = (1, 84, 84, 1)
self.num_frames = 1000000
self.memory_size = 10000
self.learning_start = 10000
self.sync_freq = 1000
self.batch_size = 32
self.gamma = 0.99
self.update_hard_or_soft = "soft"
self.soft_update_tau = 1e-2
self.epsilon_start = 1.0
self.epsilon_end = 0.01
self.decay_steps = 1000
self.prioritized_replay_alpha = 0.6
self.prioritized_replay_beta_start = 0.4
self.prioritized_replay_beta_end = 1.0
self.prioritized_replay_noise = 1e-6
elif mode == "CartPole":
self.state_reshape = (1, 4)
self.num_frames = 10000
self.memory_size = 20000
self.learning_start = 100
self.sync_freq = 100
self.batch_size = 32
self.gamma = 0.99
self.update_hard_or_soft = "soft"
self.soft_update_tau = 1e-2
self.epsilon_start = 1.0
self.epsilon_end = 0.01
self.decay_steps = 500
self.prioritized_replay_alpha = 0.6
self.prioritized_replay_beta_start = 0.4
self.prioritized_replay_beta_end = 1.0
self.prioritized_replay_noise = 1e-6
class _DQN:
"""
Boilerplate for DQN Agent
"""
def __init__(self):
"""
define the deep learning model here!
"""
pass
def predict(self, sess, state):
"""
predict q-values given a state
:param sess:
:param state:
:return:
"""
return sess.run(self.pred, feed_dict={self.state: state})
def update(self, sess, state, action, Y):
feed_dict = {self.state: state, self.action: action, self.Y: Y}
_, loss = sess.run([self.train_op, self.loss], feed_dict=feed_dict)
# print(action, Y, sess.run(self.idx_flattened, feed_dict=feed_dict))
return loss
class DQN_CartPole(_DQN):
"""
DQN Agent for CartPole game
"""
def __init__(self, scope, env, loss_fn ="MSE"):
self.scope = scope
self.num_action = env.action_space.n
with tf.variable_scope(scope):
self.state = tf.placeholder(shape=[None, 4], dtype=tf.float32, name="X")
self.Y = tf.placeholder(shape=[None], dtype=tf.float32, name="Y")
self.action = tf.placeholder(shape=[None], dtype=tf.int32, name="action")
fc1 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(self.state)
fc2 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(fc1)
fc3 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(fc2)
self.pred = tf.keras.layers.Dense(self.num_action, activation=tf.nn.relu)(fc3)
# indices of the executed actions
self.idx_flattened = tf.range(0, tf.shape(self.pred)[0]) * tf.shape(self.pred)[1] + self.action
# passing [-1] to tf.reshape means flatten the array
# using tf.gather, associate Q-values with the executed actions
self.action_probs = tf.gather(tf.reshape(self.pred, [-1]), self.idx_flattened)
if loss_fn == "huber_loss":
# use huber loss
self.losses = tf.subtract(self.Y, self.action_probs)
self.loss = huber_loss(self.losses)
elif loss_fn == "MSE":
# use MSE
self.losses = tf.squared_difference(self.Y, self.action_probs)
self.loss = tf.reduce_mean(self.losses)
else:
assert False
# you can choose whatever you want for the optimiser
# self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
self.optimizer = tf.train.AdamOptimizer()
# to apply Gradient Clipping, we have to directly operate on the optimiser
# check this: https://www.tensorflow.org/api_docs/python/tf/train/Optimizer#processing_gradients_before_applying_them
self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
self.clipped_grads_and_vars = [(ClipIfNotNone(grad, -1., 1.), var) for grad, var in self.grads_and_vars]
self.train_op = self.optimizer.apply_gradients(self.clipped_grads_and_vars)
def train_DQN(main_model, target_model, env, replay_buffer, policy, params):
"""
Train DQN agent which defined above
:param main_model:
:param target_model:
:param env:
:param params:
:return:
"""
# log purpose
losses, all_rewards, cnt_action = [], [], []
episode_reward, index_episode = 0, 0
with tf.Session() as sess:
# initialise all variables used in the model
sess.run(tf.global_variables_initializer())
state = env.reset()
start = time.time()
for frame_idx in range(1, params.num_frames + 1):
action = policy.select_action(sess, target_model, state.reshape(params.state_reshape))
cnt_action.append(action)
next_state, reward, done, _ = env.step(action)
replay_buffer.add(state, action, reward, next_state, done)
state = next_state
episode_reward += reward
if done:
index_episode += 1
state = env.reset()
all_rewards.append(episode_reward)
if frame_idx > params.learning_start and len(replay_buffer) > params.batch_size:
states, actions, rewards, next_states, dones = replay_buffer.sample(params.batch_size)
next_Q = target_model.predict(sess, next_states)
Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)
loss = main_model.update(sess, states, actions, Y)
# Logging and refreshing log purpose values
losses.append(np.mean(loss))
logging(frame_idx, params.num_frames, index_episode, time.time()-start, episode_reward, np.mean(loss), cnt_action)
episode_reward = 0
cnt_action = []
start = time.time()
if frame_idx > params.learning_start and frame_idx % params.sync_freq == 0:
# soft update means we partially add the original weights of target model instead of completely
# sharing the weights among main and target models
if params.update_hard_or_soft == "hard":
sync_main_target(sess, main_model, target_model)
elif params.update_hard_or_soft == "soft":
soft_target_model_update(sess, main_model, target_model, tau=params.soft_update_tau)
return all_rewards, losses
Modification
dones -> np.logical_not(dones)
np.argmax -> np.max
separating MSE from huber_loss
Briefly looking over, it seems that the dones variable is a binary vector where 1 denotes done, and 0 denotes not-done.
You then use dones here:
Y = rewards + params.gamma * np.argmax(next_Q, axis=1) * dones
So for all terminating transitions, you add the bootstrapped estimate of the future return (whose true value is zero at a terminal state). For all non-terminating transitions, you do not add the expected cumulative reward at all, even though you should.
I think you mean to do this the other way around, perhaps swap dones in the above line of code with np.logical_not(dones)?
Also, now that I look at it, there is another major problem with this line. np.argmax(next_Q, axis=1) returns the index of the maximum value along each row of next_Q, not the maximum value itself. You need np.max(next_Q, axis=1) to get the maximum expected return over the next state's actions.
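Putting both fixes together, the target line becomes (this matches the corrected line in your updated code above):
Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)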
EDIT: The loss function is also strangely defined. You are mixing Huber loss with mean squared error. If you want to use either huber_loss or MSE, you just compute it on the difference between the target and predicted values. You appear to be doing both, which is certainly not a commonly defined loss function. For example, to use Huber loss, your model loss should just be:
self.loss = tf.reduce_mean(huber_loss(abs(self.Y - self.action_probs)))
I have been using TensorFlow for a reasonable length of time now, and I believed I had a thorough understanding of how a TensorFlow graph works and executes within a session. However, I have written all of my TensorFlow models in a script-like fashion, as such:
import tensorflow as tf
import DataWorker
import Constants
x = tf.placeholder(tf.float32, [None, Constants.sequenceLength, DataWorker.numFeatures])
y = tf.placeholder(tf.float32, [None, 1])
xTensors = tf.unstack(x, axis=1) # [seqLength tensors of shape (batchSize, numFeatures)]
W = tf.Variable(tf.random_normal([Constants.numHidden, 1])) # Weighted matrix
b = tf.Variable(tf.random_normal([1])) # Bias
cell = tf.contrib.rnn.BasicLSTMCell(Constants.numHidden, forget_bias=Constants.forgetBias)
outputs, finalState = tf.nn.static_rnn(cell, xTensors, dtype=tf.float32)
# predictions = [tf.add(tf.matmul(output, W), b) for output in outputs] # List of predictions after each time step
prediction = tf.add(tf.matmul(outputs[-1], W), b) # Prediction after final time step
prediction = tf.tanh(prediction) # Activation
mse = tf.losses.mean_squared_error(predictions=prediction, labels=y) # Mean loss over entire batch
accuracy = tf.reduce_mean(1 - (tf.abs(y - prediction) / DataWorker.labelRange)) # Accuracy over entire batch
optimiser = tf.train.AdamOptimizer(Constants.learningRate).minimize(mse) # Backpropagation
with tf.Session() as session:
session.run(tf.global_variables_initializer())
# #############################################
# TRAINING
# #############################################
for epoch in range(Constants.numEpochs):
print("***** EPOCH:", epoch + 1, "*****\n")
IDPointer, TSPointer = 0, 0 # Pointers to current ID and timestamp
epochComplete = False
batchNum = 0
while not epochComplete:
batchNum += 1
batchX, batchY, IDPointer, TSPointer, epochComplete = DataWorker.generateBatch(IDPointer, TSPointer, isTraining=True)
dict = {x: batchX, y: batchY}
session.run(optimiser, dict)
if batchNum % 1000 == 0 or epochComplete:
batchLoss = session.run(mse, dict)
batchAccuracy = session.run(accuracy, dict)
print("Iteration:", batchNum)
print(batchLoss)
print(str("%.2f" % (batchAccuracy * 100) + "%\n"))
# #############################################
# TESTING
# #############################################
testX, testY, _, _, _ = DataWorker.generateBatch(0, 0, isTraining=False)
testAccuracy = session.run(accuracy, {x: testX, y: testY})
print("Testing Accuracy:", str("%.2f" % (testAccuracy * 100) + "%"))
But now, for practicality and readability, I want to implement my model as a class, but have encountered many problems with initializing my variables, etc.
This is the closest I have got to implementing the above example using my own LSTM class
Model.py
import tensorflow as tf
import Constants
import DataWorker # Remove this dependency
class LSTM():
"""docstring."""
def __init__(self,
inputDimensionList,
outputDimensionList,
numLayers=Constants.numLayers,
numHidden=Constants.numHidden,
learningRate=Constants.learningRate,
forgetBias=Constants.forgetBias
):
"""docstring."""
self.batchInputs = tf.placeholder(tf.float32, [None] + inputDimensionList)
self.batchLabels = tf.placeholder(tf.float32, [None] + outputDimensionList)
self.weightedMatrix = tf.Variable(tf.random_normal([numHidden] + outputDimensionList))
self.biasMatrix = tf.Variable(tf.random_normal(outputDimensionList))
self.cell = tf.contrib.rnn.BasicLSTMCell(numHidden, forget_bias=forgetBias)
self.numLayers = numLayers
self.numHidden = numHidden
self.learningRate = learningRate
self.forgetBias = forgetBias
self.batchDict = {}
self.batchInputTensors = None
self.batchOutputs = None # All needed as instance variables?
self.batchFinalStates = None
self.batchPredictions = None
self.batchLoss = None
self.batchAccuracy = None
self.initialised = False
self.session = tf.Session()
# Take in activation, loss and optimiser FUNCTIONS as args
def execute(self, command):
"""docstring."""
return self.session.run(command, self.batchDict)
def setBatchDict(self, inputs, labels):
"""docstring."""
self.batchDict = {self.batchInputs: inputs, self.batchLabels: labels}
self.batchInputTensors = tf.unstack(self.batchInputs, axis=1)
def processBatch(self):
"""docstring."""
self.batchOutputs, self.batchFinalState = tf.nn.static_rnn(self.cell, self.batchInputTensors, dtype=tf.float32)
pred = tf.tanh(tf.add(tf.matmul(self.batchOutputs[-1], self.weightedMatrix), self.biasMatrix))
mse = tf.losses.mean_squared_error(predictions=pred, labels=self.batchLabels)
optimiser = tf.train.AdamOptimizer(self.learningRate).minimize(mse)
if not self.initialised:
self.session.run(tf.global_variables_initializer())
self.initialised = True
with tf.variable_scope("model") as scope:
if self.initialised:
scope.reuse_variables()
self.execute(optimiser)
self.batchPredictions = self.execute(pred)
self.batchLoss = self.execute(tf.losses.mean_squared_error(predictions=self.batchPredictions, labels=self.batchLabels))
self.batchAccuracy = self.execute(tf.reduce_mean(1 - (tf.abs(self.batchLabels - self.batchPredictions) / DataWorker.labelRange)))
return self.batchPredictions, self.batchLabels, self.batchLoss, self.batchAccuracy
def kill(self):
"""docstring."""
self.session.close()
This class is quite messy, especially processBatch(), as I have just been trying to get it working before refining it.
I then run my model here:
Main.py
import DataWorker
import Constants
from Model import LSTM
inputDim = [Constants.sequenceLength, DataWorker.numFeatures]
outputDim = [1]
lstm = LSTM(inputDimensionList=inputDim, outputDimensionList=outputDim)
# #############################################
# TRAINING
# #############################################
for epoch in range(Constants.numEpochs):
print("***** EPOCH:", epoch + 1, "*****\n")
IDPointer, TSPointer = 0, 0 # Pointers to current ID and timestamp
epochComplete = False
batchNum = 0
while not epochComplete:
batchNum += 1
batchX, batchY, IDPointer, TSPointer, epochComplete = DataWorker.generateBatch(IDPointer, TSPointer, isTraining=True)
lstm.setBatchDict(batchX, batchY)
batchPredictions, batchLabels, batchLoss, batchAccuracy = lstm.runBatch()
if batchNum % 1000 == 0 or epochComplete:
print("Iteration:", batchNum)
print("Pred:", batchPredictions[-1], "\tLabel:", batchLabels[-1])
print("Loss:", batchLoss)
print("Accuracy:", str("%.2f" % (batchAccuracy * 100) + "%\n"))
# #############################################
# TESTING
# #############################################
testX, testY, _, _, _ = DataWorker.generateBatch(0, 0, isTraining=False)
lstm.setBatchDict(testX, testY)
_, _, _, testAccuracy = lstm.runBatch()
print("Testing Accuracy:", str("%.2f" % (testAccuracy * 100) + "%"))
lstm.kill()
A single pass through the graph executes fine once all the variables are initialized, but on the second iteration I get the error:
ValueError: Variable rnn/basic_lstm_cell/kernel/Adam/ already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:
optimiser = tf.train.AdamOptimizer(self.learningRate).minimize(mse)
I Googled this problem and learned that using scope.reuse_variables() should stop it from trying to initialize the AdamOptimizer a second time, but clearly this isn't working the way I have implemented it. How can I fix this issue?
As a side note, is my method of creating the TensorFlow session as an instance variable within my LSTM class acceptable, or should I create the session in Main and then pass it into the LSTM instance?
In general, I wrap anything that creates variables under the hood with tf.make_template when doing object-oriented model building.
However, you should avoid adding ops to the graph in a training loop, which looks like it's happening here. They will build up and cause problems, and will likely give you incorrect results. Instead, define the graph once (with inputs from tf.data, placeholders, or queues) and only loop over a session.run call. Even better, structure your code as an Estimator and this will be enforced. A rough sketch of that structure follows.
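Here is a minimal TF1-style sketch of that structure, with tf.make_template handling variable reuse (the shapes and layer sizes are placeholders, not the values from your code):
import tensorflow as tf

def lstm_model(x):
    # All variable-creating ops live inside this function.
    cell = tf.contrib.rnn.BasicLSTMCell(64)
    outputs, _ = tf.nn.static_rnn(cell, tf.unstack(x, axis=1), dtype=tf.float32)
    return tf.layers.dense(outputs[-1], 1)

model = tf.make_template('model', lstm_model)      # reuses the same variables on later calls

# Build the whole graph once, outside any training loop.
x = tf.placeholder(tf.float32, [None, 10, 8])      # [batch, seq_len, num_features]
y = tf.placeholder(tf.float32, [None, 1])
prediction = model(x)
mse = tf.losses.mean_squared_error(labels=y, predictions=prediction)
optimiser = tf.train.AdamOptimizer(1e-3).minimize(mse)

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    # Inside the loop: only session.run calls, never new graph ops.
    # for batchX, batchY in batches:
    #     session.run(optimiser, {x: batchX, y: batchY})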