I am trying to code my own DQN in Python, using pytorch. I am trying it on the CartPole environment.
Although the Q-loss converaged, the model performed poorly.
Replay buffer was also used in the model with a size of 2000 and the double networks were also used.I updated the target network when the q_net network was updated 100 times.(ps: q_net was used to decide which action to choose.)
I tried to use different network architectures and different combinations of hyper-parameters, but the model still performed poorly, which just kept swaying and never kept itself balanced.
Appreciate for your help, sincerely!!
Here is the figure of Q-loss result:
Q-loss figure
a closer look
Here is the code for action taking.
def take_action(self,state):
if self.learn_count<=10:
self.eposilon = 0.1
else:
self.eposilon = 0.9
decision = np.random.choice([0,1],p = [1-self.eposilon,self.eposilon])
if decision == 1:
#final_decision = self.q_net(state.cuda().detach()).argmax()
final_decision = self.q_net(torch.Tensor(state).to(self.device))
final_decision = torch.max(final_decision, 0)[1].data.numpy()
else:
final_decision = np.random.choice([i for i in range(self.action_space)])
return final_decision.item()
Here is the code for training the network.
def update_nn(self):
if self.learn_count%100 ==0:
self.synchronous_NN()
for i in range(self.num_epoches):
self.learn_count = self.learn_count+1
self.training_nn()
return None
def training_nn(self):
index = random.sample(range(self.replay_buffer.shape[0]),self.minibatch_size)
chosen_sample = self.replay_buffer[index,:]
last_state = copy.deepcopy(chosen_sample[np.isnan(chosen_sample[:,-1:]).squeeze(),:])
not_last_state = copy.deepcopy(chosen_sample[~np.isnan(chosen_sample[:,-1:]).squeeze(),:])
input_not_last_state = torch.FloatTensor(not_last_state[:,:4]).to(self.device)
action_index = torch.LongTensor(not_last_state[:,4].reshape(-1,1)).to(self.device)
action_value = self.q_net(input_not_last_state).gather(1,action_index)
max_action_value = not_last_state[:,5]+self.gamma*self.fixed_q_net(input_not_last_state).detach().max(1).values.numpy()
last_state = np.nan_to_num(last_state)
input_last_state = torch.FloatTensor(last_state[:,:4]).to(self.device)
last_action_index = torch.LongTensor(last_state[:,4].reshape(-1,1)).to(self.device)
last_action_value = self.q_net(input_last_state).gather(1,last_action_index)
last_max_action_value = last_state[:,5]
X = torch.cat([action_value,last_action_value])
y = torch.FloatTensor(np.hstack([max_action_value,last_max_action_value]).reshape(-1,1)).detach()
loss = self.loss(X, y)
self.optimizer.zero_grad() # reset the gradient to zero
loss.backward()
self.optimizer.step() # execute back propagation for one step
self.loss_curve.append(loss)
return None
here is the part of playing:
def start_to_play(self):
agent = Agent()
agent.initial_Q_network(2)
agent.initial_replay_buffer()
self.env = gym.make('CartPole-v0')
self.env = self.env.unwrapped
for i in range(self.episode):
if i%50 ==1:
agent.save_model(i)
step = 0
state = self.env.reset()
ep_r = 0
while(True):
action = agent.take_action(state)
observation, reward, done, info = self.env.step(action)
self.env.render()
#next_state = self.capture_state()
next_state = observation
x, x_dot, theta, theta_dot = observation
r1 = (self.env.x_threshold - abs(x)) / self.env.x_threshold - 0.8
r2 = (self.env.theta_threshold_radians - abs(theta)) / self.env.theta_threshold_radians - 0.5
reward = r1 + r2
ep_r = reward+ep_r
if done:
reward = reward-20
state1_np = np.array(state)
state2_np = np.array([np.nan,np.nan,np.nan,np.nan])
agent.add_replay_buffer(np.hstack([state1_np,np.array([action,reward]),state2_np]))
if agent.replay_buffer.shape[0]>300:
agent.update_nn()
print(i,step,round(ep_r, 2))
break
else:
state1_np = np.array(state)
state2_np = np.array(next_state)
agent.add_replay_buffer(np.hstack([state1_np,np.array([action,reward]),state2_np]))
if agent.replay_buffer.shape[0]>300:
agent.update_nn()
state = next_state
step = step+1
self.plot_curve(agent)
return None
Thanks for your time!!
Loss in reinforcement learning is not very important. In fact, you could see very good agents with good performances but bad results in term of neural network losses.
I suggest you to look into other implementations hyperparameters, since CartPole has been solved an infinite amount of times with multiple algorithms.
The first thing that might be wrong is the experience replay buffer, which is too small. But then again, look into other DQN implementation for CartPole and try to use the same hyperparameters that they are using
Related
I'm trying to implement a DQN. As a warm up I want to solve CartPole-v0 with a MLP consisting of two hidden layers along with input and output layers. The input is a 4 element array [cart position, cart velocity, pole angle, pole angular velocity] and output is an action value for each action (left or right). I am not exactly implementing a DQN from the "Playing Atari with DRL" paper (no frame stacking for inputs etc). I also made a few non standard choices like putting done and the target network prediction of action value in the experience replay, but those choices shouldn't affect learning.
In any case I'm having a lot of trouble getting the thing to work. No matter how long I train the agent it keeps predicting a higher value for one action over another, for example Q(s, Right)> Q(s, Left) for all states s. Below is my learning code, my network definition, and some results I get from training
class DQN:
def __init__(self, env, steps_per_episode=200):
self.env = env
self.agent_network = MlpPolicy(self.env)
self.target_network = MlpPolicy(self.env)
self.target_network.load_state_dict(self.agent_network.state_dict())
self.target_network.eval()
self.optimizer = torch.optim.RMSprop(
self.agent_network.parameters(), lr=0.005, momentum=0.95
)
self.replay_memory = ReplayMemory()
self.gamma = 0.99
self.steps_per_episode = steps_per_episode
self.random_policy_stop = 1000
self.start_learning_time = 1000
self.batch_size = 32
def learn(self, episodes):
time = 0
for episode in tqdm(range(episodes)):
state = self.env.reset()
for step in range(self.steps_per_episode):
if time < self.random_policy_stop:
action = self.env.action_space.sample()
else:
action = select_action(self.env, time, state, self.agent_network)
new_state, reward, done, _ = self.env.step(action)
target_value_pred = predict_target_value(
new_state, reward, done, self.target_network, self.gamma
)
experience = Experience(
state, action, reward, new_state, done, target_value_pred
)
self.replay_memory.append(experience)
if time > self.start_learning_time: # learning step
experience_batch = self.replay_memory.sample(self.batch_size)
target_preds = extract_value_predictions(experience_batch)
agent_preds = agent_batch_preds(
experience_batch, self.agent_network
)
loss = torch.square(agent_preds - target_preds).sum()
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
if time % 1_000 == 0: # how frequently to update target net
self.target_network.load_state_dict(self.agent_network.state_dict())
self.target_network.eval()
state = new_state
time += 1
if done:
break
def agent_batch_preds(experience_batch: list, agent_network: MlpPolicy):
"""
Calculate the agent action value estimates using the old states and the
actual actions that the agent took at that step.
"""
old_states = extract_old_states(experience_batch)
actions = extract_actions(experience_batch)
agent_preds = agent_network(old_states)
experienced_action_values = agent_preds.index_select(1, actions).diag()
return experienced_action_values
def extract_actions(experience_batch: list) -> list:
"""
Extract the list of actions from experience replay batch and torchify
"""
actions = [exp.action for exp in experience_batch]
actions = torch.tensor(actions)
return actions
class MlpPolicy(nn.Module):
"""
This class implements the MLP which will be used as the Q network. I only
intend to solve classic control problems with this.
"""
def __init__(self, env):
super(MlpPolicy, self).__init__()
self.env = env
self.input_dim = self.env.observation_space.shape[0]
self.output_dim = self.env.action_space.n
self.fc1 = nn.Linear(self.input_dim, 32)
self.fc2 = nn.Linear(32, 128)
self.fc3 = nn.Linear(128, 32)
self.fc4 = nn.Linear(32, self.output_dim)
def forward(self, x):
if type(x) != torch.Tensor:
x = torch.tensor(x).float()
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = self.fc4(x)
return x
Learning results:
Here I'm seeing one action always valued over the others (Q(right, s) > Q(left, s)). It's also clear that the network is predicting the same action values for every state.
Does anyone have an idea about what's going on? I've done a lot of debugging and careful reading of the original papers (also thought about "normalizing" the observation space even though the velocities can be infinite) and could be missing something obvious at this point. I can include more code for the helper functions if that would be useful.
There was nothing wrong with the network definition. It turns out the learning rate was too high and reducing it 0.00025 (as in the original Nature paper introducing the DQN) led to an agent which can solve CartPole-v0.
That said, the learning algorithm was incorrect. In particular I was using the wrong target action-value predictions. Note the algorithm laid out above does not use the most recent version of the target network to make predictions. This leads to poor results as training progresses because the agent is learning based on stale target data. The way to fix this is to just put (s, a, r, s', done) into the replay memory and then make target predictions using the most up to date version of the target network when sampling a mini batch. See the code below for an updated learning loop.
def learn(self, episodes):
time = 0
for episode in tqdm(range(episodes)):
state = self.env.reset()
for step in range(self.steps_per_episode):
if time < self.random_policy_stop:
action = self.env.action_space.sample()
else:
action = select_action(self.env, time, state, self.agent_network)
new_state, reward, done, _ = self.env.step(action)
experience = Experience(state, action, reward, new_state, done)
self.replay_memory.append(experience)
if time > self.start_learning_time: # learning step.
experience_batch = self.replay_memory.sample(self.batch_size)
target_preds = target_batch_preds(
experience_batch, self.target_network, self.gamma
)
agent_preds = agent_batch_preds(
experience_batch, self.agent_network
)
loss = torch.square(agent_preds - target_preds).sum()
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
if time % 1_000 == 0: # how frequently to update target net
self.target_network.load_state_dict(self.agent_network.state_dict())
self.target_network.eval()
state = new_state
time += 1
if done:
break
I know there are many similar topics discussed on StackOverflow, but I have done quite a lot research both in StackOverflow and on the Internet and I couldn't find a solution.
I am trying to implement the classic Deep Q Learning Algorithm to solve the openAI gym's cartpole game:
OpenAI Gym Cartpole
Firstly, I created an agent that generates random weights. The results are shown in the graph below:
Amazingly, the agent managed to reach 200 steps (which is the max) in many episodes by simply generating 4 random uniform weights [w1, w2, w3, w4] from (-1.0 to 1.0) in each episode.
So, i decided to implement a simple DQN with only 4 weights and 2 biases and to make the agent learn this game over the time. The weights will be initialized randomly in the beginning and Back-Propagation will be used to update them as the agent makes steps.
I used the Epsilon Greedy strategy to make the agent explore at the beginning and exploit the Q values later on. However, The results are disappointing compared to the random agent:
I have tried to tune a lot of parameters and different architectures and the result doesn't change as much. So, my question is the following:
Question:
Did i make a wrong implementation of DQN or a simple DQN cannot beat the cartpole? What's your experience? It does reduces the loss (Error), but it doesn't guarantee a good solution.
Thanks in advance.
import tensorflow as tf
import gym
import numpy as np
import random as rand
import matplotlib.pyplot as plt
# Cartpole's Observation:
# 4 Inputs
# 2 Actions (LEFT | RIGHT)
input_size = 4
output_size = 2
# Deep Q Network Class
class DQN:
def __init__(self, var_names):
self.var_names = var_names
self._define_placeholders()
self._add_layers()
self._define_loss()
self._choose_optimizer()
self._initialize()
# Placeholders:
# Inputs: The place where we feed the Observations (States).
# Targets: Q_target = R + gamma*Q(s', a*).
def _define_placeholders(self):
self.inputs = tf.placeholder(tf.float32, shape=(None, input_size), name='inputs')
self.targets = tf.placeholder( tf.float32, shape=(None, output_size), name='targets')
# Layers:
# 4 Input Weights.
# 2 Biases.
# output = softmax(inputs*weights + biases).
# Weights and biases are initialized randomly.
def _add_layers(self):
w = tf.get_variable(name=self.var_names[0], shape=(input_size, output_size),
initializer=tf.initializers.random_uniform(minval=-1.0, maxval=1.0) )
b = tf.get_variable(name=self.var_names[1], shape=(output_size),
initializer=tf.initializers.random_uniform(minval=-1.0, maxval=1.0) )
self.outputs = tf.nn.softmax(tf.matmul(self.inputs, w) + b)
self.prediction = tf.argmax(self.outputs, 1)
# Loss = MSE.
def _define_loss(self):
self.mean_loss = tf.losses.mean_squared_error(labels=self.targets, predictions=self.outputs) / 2
# AdamOptimizer with starting learning rate: a = 0.005.
def _choose_optimizer(self):
self.optimizer = tf.train.AdamOptimizer(learning_rate=0.005).minimize(loss=self.mean_loss)
# Initializes the dqn's weights.
def _initialize(self):
initializer = tf.global_variables_initializer()
self.sess = tf.InteractiveSession()
self.sess.run(initializer)
# Get's current's DQN weights.
def get_weights(self):
return [ self.sess.run( tf.trainable_variables(var) )[0] for var in self.var_names ]
# Updates the weights of DQN.
def update_weights(self, new_weights):
variables = [tf.trainable_variables(name)[0] for name in self.var_names]
update = [ tf.assign(var, weight) for (var, weight) in zip(variables, new_weights) ]
self.sess.run(update)
# Predicts the best possible action from a state s.
# a* = argmax( Q(s) )
# Returns from Q(s), a*
def predict(self, states):
Q, actions = self.sess.run( [self.outputs, self.prediction],
feed_dict={self.inputs: states} )
return Q, actions
# It partially fits the given observations and the targets into the network.
def partial_fit(self, states, targets):
_, loss = self.sess.run( [self.optimizer, self.mean_loss],
feed_dict={self.inputs: states, self.targets: targets} )
return loss
# Replay Memory Buffer
# It stores experiences as (s,a,r,s') --> (State, Action, Reward, Next_Action).
# It generates random mini-batches of experiences from the memory.
# If the memory is full, then it deletes the oldest experiences. Experience is an step.
class ReplayMemory:
def __init__(self, mem_size):
self.mem_size = mem_size
self.experiences = []
def add_experience(self, xp):
self.experiences.append(xp)
if len(self.experiences) > self.mem_size:
self.experiences.pop(0)
def random_batch(self, batch_size):
if len(self.experiences) < batch_size:
return self.experiences
else:
return rand.sample(self.experiences, batch_size)
# The agent's class.
# It contains 2 DQNs: Online DQN for Predictions and Target DQN for the targets.
class Agent:
def __init__(self, epsilon, epsilon_decay, min_epsilon, gamma, mem_size):
self.epsilon = epsilon
self.epsilon_decay = epsilon_decay
self.min_epsilon = min_epsilon
self.gamma = gamma
self.replay_mem = ReplayMemory(mem_size)
self.online_dqn = DQN( var_names=['online_w', 'online_b'] )
self.target_dqn = DQN( var_names=['target_w', 'target_b'] )
self.state = None
def set_epsilon(self, epsilon):
self.epsilon = epsilon
def reduce_epsilon(self):
if self.epsilon > self.min_epsilon:
self.epsilon -= self.epsilon_decay
def update_state(self, state):
self.state = state
def update_memory(self, state, action, reward, next_state):
experience = (state, action, reward, next_state)
self.replay_mem.add_experience(experience)
# It updates the target network after N steps.
def update_network(self):
self.target_dqn.update_weights( self.online_dqn.get_weights() )
# Randomly chooses an action from the enviroment.
def explore(self, env):
action = env.action_space.sample()
return action
# Predicts and chooses the best possible moves from the current state.
def exploit(self):
_, action = self.online_dqn.predict(self.state)
return action[0]
# Uses Epsilon-Greedy to decide whether to explore or exploit.
# Epsilon starts with 1 and is reduced over the time.
# After the agent makes a move, he returns: state, action, reward, next_state.
def take_action(self, env):
action = None
p = rand.uniform(0.0, 1.0)
if p < self.epsilon:
action = self.explore(env)
else:
action = self.exploit()
next_state, reward, done, _ = env.step(action)
if done:
next_state = None
else:
next_state = np.reshape( next_state, (1, input_size) )
return self.state, action, reward, next_state, done
# Trains the agent.
# A random mini-batch is generated from the memory.
# We feed each experience into the DQN.
# For each
# Q(s) = Qtarget(s)
# Q(s'), a* = Qtarget(s'), argmax Q(s')
# We set targets = Q(s')
# For each action (a), reward (r), next_state (s') in the batch:
# If s' is None the GameOver. So, we set target[i] = Reward
# If s' != None, then target[i][a] = r + gamma*Q(s', 'a')
# Then, the online DQN calculates the mean squared difference of r + gamma*Q(s', 'a') - Q(s, a)
# and uses Back-Propagation to update the weights.
def train(self):
mini_batch = self.replay_mem.random_batch(batch_size=256)
batch_size = len(mini_batch)
states = np.zeros( shape=(batch_size, input_size) )
next_states = np.zeros( shape=(batch_size, input_size) )
for i in range(batch_size):
states[i] = mini_batch[i][0]
next_states[i] = mini_batch[i][3]
Q, _ = self.target_dqn.predict(states)
next_Q, next_actions = self.target_dqn.predict(next_states)
targets = Q
for i in range(batch_size):
action = mini_batch[i][1]
reward = mini_batch[i][2]
next_state = mini_batch[i][3]
if next_state is None:
targets[i][action] = reward
else:
targets[i][action] = reward + self.gamma * next_Q[i][ next_actions[i] ]
loss = self.online_dqn.partial_fit(states, targets)
return loss
def play(agent, env, episodes, N, render=False, train=True):
ep = 0
episode_steps = []
steps = 0
total_steps = 0
loss = 0
# Sets the current state as the initial.
# Cartpole spawns the agent in a random state.
agent.update_state( np.reshape( env.reset(), (1, input_size) ) )
agent.update_network()
while ep < episodes:
if render:
env.render()
# The target DQN's weights are frozen.
# The agent Updates the Target DQN's Weights after 100 steps.
if train and total_steps % N == 0:
agent.update_network()
print('---Target network updated---')
# Takes action.
state, action, reward, next_state, done = agent.take_action(env)
# Updates the memory and the current state.
agent.update_memory(state, action, reward, next_state)
agent.update_state(next_state)
steps += 1
total_steps += 1
if train:
loss = agent.train()
if done:
agent.update_state( np.reshape( env.reset(), (1, input_size) ) )
episode_steps.append(steps)
ep += 1
if train:
agent.reduce_epsilon()
print('End of episode', ep, 'Training loss =', loss, 'Steps =', steps)
steps = 0
if render:
env.close()
return episode_steps
env = gym.make('CartPole-v0')
# Training the agent.
agent = Agent(epsilon=1, epsilon_decay = 0.01, min_epsilon = 0.05, gamma=0.9, mem_size=50000)
episodes = 1000
N = 100
episode_steps = play(agent, env, episodes, N)
# Plotting the results.
# After the training is done, the steps should be maximized (up to 200)
plt.plot(episode_steps)
plt.show()
# Testing the agent.
agent.set_epsilon(0)
episodes = 1
steps = play(agent, env, episodes, N, render=True, train=False)[0]
print('\nSteps =', steps)
The algorithm works quite well. When I decided to plot the data, I used as a metric:
Rewards / Episode
Most of Deep Reinforcement Learning Frameworks (e.g. tf-agents) use mean reward (e.g. mean reward per 10 episodes) and this is why the plots look so smooth. If You look at the above plot, The agent manages to get a high score most of the time.
Also, I have decided to improve the speed of the algorithm using numpy operations rather than "for" loops. You can check out my implementation here:
https://github.com/kochlisGit/Deep-Reinforcement-Learning/tree/master/Custom%20DQN
Short Description of my model
I am trying to write my own DQN algorithm in Python, using Tensorflow following the paper(Mnih et al., 2015). In train_DQN function, I have defined the training procedure, and DQN_CartPole is for defining the function approximation(simple 3-layered Neural Network). For loss function, Huber loss or MSE is implemented followed by the gradient clipping(between -1 and 1). Then, I have implemented soft-update method instead of hard-update of the target network by copying the weights in the main network.
Question
I am trying it on the CartPole environment(OpenAI gym), but the rewards does not improve as it does in other people's algorithms, such as keras-rl. Any help will be appreciated.
reward over timestep
If possible, could you have a look at the source code?
DQN model: https://github.com/Rowing0914/TF_RL/blob/master/agents/DQN_model.py
Training Script: https://github.com/Rowing0914/TF_RL/blob/master/agents/DQN_train.py
Reddit post: https://www.reddit.com/r/reinforcementlearning/comments/ba7o55/question_dqn_algorithm_does_not_work_well_on/?utm_source=share&utm_medium=web2x
class Parameters:
def __init__(self, mode=None):
assert mode != None
print("Loading Params for {} Environment".format(mode))
if mode == "Atari":
self.state_reshape = (1, 84, 84, 1)
self.num_frames = 1000000
self.memory_size = 10000
self.learning_start = 10000
self.sync_freq = 1000
self.batch_size = 32
self.gamma = 0.99
self.update_hard_or_soft = "soft"
self.soft_update_tau = 1e-2
self.epsilon_start = 1.0
self.epsilon_end = 0.01
self.decay_steps = 1000
self.prioritized_replay_alpha = 0.6
self.prioritized_replay_beta_start = 0.4
self.prioritized_replay_beta_end = 1.0
self.prioritized_replay_noise = 1e-6
elif mode == "CartPole":
self.state_reshape = (1, 4)
self.num_frames = 10000
self.memory_size = 20000
self.learning_start = 100
self.sync_freq = 100
self.batch_size = 32
self.gamma = 0.99
self.update_hard_or_soft = "soft"
self.soft_update_tau = 1e-2
self.epsilon_start = 1.0
self.epsilon_end = 0.01
self.decay_steps = 500
self.prioritized_replay_alpha = 0.6
self.prioritized_replay_beta_start = 0.4
self.prioritized_replay_beta_end = 1.0
self.prioritized_replay_noise = 1e-6
class _DQN:
"""
Boilerplate for DQN Agent
"""
def __init__(self):
"""
define the deep learning model here!
"""
pass
def predict(self, sess, state):
"""
predict q-values given a state
:param sess:
:param state:
:return:
"""
return sess.run(self.pred, feed_dict={self.state: state})
def update(self, sess, state, action, Y):
feed_dict = {self.state: state, self.action: action, self.Y: Y}
_, loss = sess.run([self.train_op, self.loss], feed_dict=feed_dict)
# print(action, Y, sess.run(self.idx_flattened, feed_dict=feed_dict))
return loss
class DQN_CartPole(_DQN):
"""
DQN Agent for CartPole game
"""
def __init__(self, scope, env, loss_fn ="MSE"):
self.scope = scope
self.num_action = env.action_space.n
with tf.variable_scope(scope):
self.state = tf.placeholder(shape=[None, 4], dtype=tf.float32, name="X")
self.Y = tf.placeholder(shape=[None], dtype=tf.float32, name="Y")
self.action = tf.placeholder(shape=[None], dtype=tf.int32, name="action")
fc1 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(self.state)
fc2 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(fc1)
fc3 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(fc2)
self.pred = tf.keras.layers.Dense(self.num_action, activation=tf.nn.relu)(fc3)
# indices of the executed actions
self.idx_flattened = tf.range(0, tf.shape(self.pred)[0]) * tf.shape(self.pred)[1] + self.action
# passing [-1] to tf.reshape means flatten the array
# using tf.gather, associate Q-values with the executed actions
self.action_probs = tf.gather(tf.reshape(self.pred, [-1]), self.idx_flattened)
if loss_fn == "huber_loss":
# use huber loss
self.losses = tf.subtract(self.Y, self.action_probs)
self.loss = huber_loss(self.losses)
elif loss_fn == "MSE":
# use MSE
self.losses = tf.squared_difference(self.Y, self.action_probs)
self.loss = tf.reduce_mean(self.losses)
else:
assert False
# you can choose whatever you want for the optimiser
# self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
self.optimizer = tf.train.AdamOptimizer()
# to apply Gradient Clipping, we have to directly operate on the optimiser
# check this: https://www.tensorflow.org/api_docs/python/tf/train/Optimizer#processing_gradients_before_applying_them
self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
self.clipped_grads_and_vars = [(ClipIfNotNone(grad, -1., 1.), var) for grad, var in self.grads_and_vars]
self.train_op = self.optimizer.apply_gradients(self.clipped_grads_and_vars)
def train_DQN(main_model, target_model, env, replay_buffer, policy, params):
"""
Train DQN agent which defined above
:param main_model:
:param target_model:
:param env:
:param params:
:return:
"""
# log purpose
losses, all_rewards, cnt_action = [], [], []
episode_reward, index_episode = 0, 0
with tf.Session() as sess:
# initialise all variables used in the model
sess.run(tf.global_variables_initializer())
state = env.reset()
start = time.time()
for frame_idx in range(1, params.num_frames + 1):
action = policy.select_action(sess, target_model, state.reshape(params.state_reshape))
cnt_action.append(action)
next_state, reward, done, _ = env.step(action)
replay_buffer.add(state, action, reward, next_state, done)
state = next_state
episode_reward += reward
if done:
index_episode += 1
state = env.reset()
all_rewards.append(episode_reward)
if frame_idx > params.learning_start and len(replay_buffer) > params.batch_size:
states, actions, rewards, next_states, dones = replay_buffer.sample(params.batch_size)
next_Q = target_model.predict(sess, next_states)
Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)
loss = main_model.update(sess, states, actions, Y)
# Logging and refreshing log purpose values
losses.append(np.mean(loss))
logging(frame_idx, params.num_frames, index_episode, time.time()-start, episode_reward, np.mean(loss), cnt_action)
episode_reward = 0
cnt_action = []
start = time.time()
if frame_idx > params.learning_start and frame_idx % params.sync_freq == 0:
# soft update means we partially add the original weights of target model instead of completely
# sharing the weights among main and target models
if params.update_hard_or_soft == "hard":
sync_main_target(sess, main_model, target_model)
elif params.update_hard_or_soft == "soft":
soft_target_model_update(sess, main_model, target_model, tau=params.soft_update_tau)
return all_rewards, losses
Modification
dones -> np.logical_not(dones)
np.argmax -> np.max
separating MSE from huber_loss
Briefly looking over, it seems that the dones variable is a binary vector where 1 denotes done, and 0 denotes not-done.
You then use dones here:
Y = rewards + params.gamma * np.argmax(next_Q, axis=1) * dones
So for all terminating transitions, you add the expected cumulative reward when following the policy for the rest of the episode (which is zero). For all non-terminating transitions, you do not add the expect cumulative reward.
I think you mean to do this the other way around, perhaps swap dones in the above line of code with np.logical_not(dones)?
Also, now that I look at it, it seems there is another major problem with this line. np.argmax(next_Q, axis=1) returns the index of the maximum value in next_Q vector, not the actual maximum value. You need np.maximum(next_Q, axis=1) (IIRC) to get the maximum expected reward of the next state's actions.
EDIT: The loss function is also very strangely defined. You are mixing Huber Loss with Mean-Squared-Error. If you want to use either huber_loss or MSE, you just compute them on the difference between the expected and predicted values. You appear to be doing both, which is certainly not a commonly defined loss function. For example, your model loss to use Huber Loss should just be:
self.loss = tf.reduce_mean(huber_loss(abs(self.Y - self.action_probs)))
I tried to code a neural network to solve OpenAI's CartPole environment with Tensorflow and Keras. The network uses prioritized experience replay and a seperate target network which is updated every tenth episode. Here's the code:
import numpy as np
import gym
import matplotlib.pyplot as plt
from collections import deque
from tensorflow import keras
class agent:
def __init__(self,inputs,outputs,gamma):
self.gamma = gamma
self.epsilon = 1.0
self.epsilon_decay = 0.999
self.epsilon_min = 0.01
self.inputs = inputs
self.outputs = outputs
self.network = self._build_net()
self.target_network = self._build_net()
self.replay_memory = deque(maxlen=100000)
self.target_network.set_weights(self.network.get_weights())
def _build_net(self):
model = keras.models.Sequential()
model.add(keras.layers.Dense(10,activation='relu',input_dim=self.inputs))
model.add(keras.layers.Dense(10,activation='relu'))
model.add(keras.layers.Dense(10,activation='relu'))
model.add(keras.layers.Dense(self.outputs,activation='sigmoid'))
model.compile(optimizer='adam',loss='categorical_crossentropy')
return model
def act(self,state,testing=False):
if not testing:
if self.epsilon > np.random.rand():
return np.random.randint(self.outputs)
else:
return np.argmax(self.network.predict(np.array([state])))
else:
return np.argmax(self.network.predict(np.array([state])))
def remember(self,state,action,next_state,reward,win):
self.replay_memory.append((state,action,next_state,reward,win))
def get_batch(self):
batch = []
losses = []
targets = []
if len(self.replay_memory) >= 32:
for state,action,next_state,reward,done in self.replay_memory:
target_f = np.zeros([1,self.outputs])
if done != False:
target = (reward + self.gamma * np.amax(self.target_network.predict(np.array([next_state]))[0]))
target_f[0][action] = target
targets.append(target_f)
loss = np.mean(self.network.predict(np.array([state]))-target_f)**2
losses.append(loss)
indexes = np.argsort(losses)[:32]
for indx in indexes:
batch.append((self.replay_memory[indx][0], targets[indx]))
return batch
def replay(self,batch):
for state,target in batch:
self.network.fit(np.array([state]), target, epochs=1, verbose=0)
self.epsilon = max(self.epsilon_min, self.epsilon*self.epsilon_decay)
def update_target_network(self):
self.target_network.set_weights(self.network.get_weights())
def save(self):
self.network.save_weights('./DQN.model')
def load(self):
self.network.load_weights('./DQN.model')
env = gym.make('CartPole-v0')
episodes = 1000
agent = agent(env.observation_space.shape[0],env.action_space.n,0.99)
rewards = []
state = env.reset()
for _ in range(500):
action = agent.act(state,1.0)
next_state, reward, win, _ = env.step(action)
agent.remember(state,action,next_state,reward,win)
state = next_state
if win:
state = env.reset()
for e in range(episodes):
print('episode:',e+1)
batch = agent.get_batch()
state = env.reset()
for t in range(400):
action = agent.act(state)
next_state, reward, win, _ = env.step(action)
agent.remember(state,action,next_state,reward,win)
state = next_state
if win:
break
agent.replay(batch)
print('score:',t)
print('epsilon:',agent.epsilon)
print('')
if e%10 == 0:
agent.update_target_network()
agent.save()
rewards.append(t)
plt.plot(list(range(e+1)),rewards)
plt.savefig('./reward.png')
The problem is that the agent gets worse as epsilon decreases. And when epsilon is a its lowest value, the agent only gets through 7-9 steps before the Pole falls, as seen in the Image below. Can someone tell me why my agent isn't learning anything and how to fix it?
Your network output should be using linear as activation as opposed to sigmoid, since you are bootstrapping summation of all future rewards. The loss should be changed to mse as well, according to the derivation for the td error from the original paper.
Also, if the experience ended in a terminal state (done==True), the td target for the action taken should set equal to the reward (you are currently setting it as 0).
Try making the reward = -reward if done. It helps
I’ve been learning RL this summer and this week I’ve tried to make a PPO implementation on Pytorch with the help of some repositories from github with similiar algorithms.
The code runs OpenAI’s Lunar Lander but I have several errors that I have not been able to fix, the biggest one being that the algorithm quickly converges to doing the same action regardless of the state. The other major problem I’ve found is that even though I’m using backwards() only once, I get an error asking me to set retain_graph to True.
Because of that, I see no improvement of the rewards obtained over 1000 steps, I don’t know if the algorithm needs more steps to be able to see an improvement.
I’m really sorry if this kind of problems have no place in this forums, I just didn’t know where to post this.
Also I’m sorry for the messy code, it’s my first time doing this kind of algorithms, and I’m fairly new with pytorch and machine learning in general.
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.distributions import Categorical
import gym
class actorCritic(nn.Module):
def __init__(self):
super(actorCritic, self).__init__()
self.fc = nn.Sequential(
nn.Linear(8, 16),
nn.Linear(16, 32),
nn.Linear(32, 64),
nn.ReLU(inplace=True)
)
self.pi = nn.Linear(64, 4)
self.value = nn.Linear(64, 1)
def forward(self, x):
x = self.fc(x)
pi_1 = self.pi(x)
pi_out = F.softmax(pi_1, dim=-1)
value_out = self.value(x)
return pi_out, value_out
def GAE(rewards, values, masks):
gamma = 0.99
lamb = 0.95
advan_t = 0
sizes = rewards.size()
advantages = torch.zeros(1, sizes[1])
for t in reversed(range(sizes[1])):
delta = rewards[0][t] + gamma*values[0][t+1]*masks[0][t] - values[0][t]
advan_t = delta + gamma*lamb*advan_t*mask[0][t]
advantages[0][t] = advan_t
real_values = values[:,:sizes[1]] + advantages
return advantages, real_values
def plot_rewards(rewards):
plt.figure(2)
plt.clf()
plt.plot(rewards)
plt.pause(0.001)
plt.savefig('TruePPO 500 steps.png')
def interact(times, states):
rewards = torch.zeros(1, times)
actions = torch.zeros(1, times)
mask = torch.ones(1, times)
for steps in range(times):
action_probs, _ = network(states[steps])
m = Categorical(action_probs)
action = int(m.sample())
obs, reward, done, _ = env.step(action)
if done:
obs = env.reset()
mask[0][steps] = 0
states[steps+1] = torch.from_numpy(obs).float()
rewards[0][steps] = reward
actions[0][steps] = action
return states, rewards, actions, mask
#Parameters
total_steps = 1000
batch_size = 10
env = gym.make('LunarLander-v2')
network = actorCritic()
old_network = actorCritic()
optimizer = torch.optim.Adam(network.parameters(), lr = 0.001)
states = torch.zeros(batch_size+1, 8)
steps = 0
obs_ = env.reset()
obs = torch.from_numpy(obs_).float()
states[0] = obs
reward_means = []
nn_paramD = network.state_dict()
old_network.load_state_dict(nn_paramD)
while steps < total_steps:
print (steps)
states, rewards, actions, mask = interact(batch_size, states)
#calculate values, GAE, normalize advantages, randomize, calculate loss, backprop,
_, values = network(states)
values = values.view(-1, batch_size+1)
advantages, v_targ = GAE(rewards, values, mask)
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
optimizer.zero_grad()
for n in range(rewards.size()[1]):
probabilities, _ = network(states[n])
print (probabilities)
m = Categorical(probabilities)
action_prob = m.probs[int(actions[0][n])]
entropia = m.entropy()
old_probabilities, _ = old_network(states[n])
m_old = Categorical(old_probabilities)
old_action_prob = m.probs[actions[0][n].int()]
old_action_prob.detach()
ratio = action_prob / old_action_prob
surr1 = ratio*advantages[0][n]
surr2 = torch.clamp(ratio, min = (1.-0.2), max = (1.+0.2))
policy_loss = -torch.min(surr1, surr2)
value_loss = 0.5*(values[0][n]-v_targ[0][n])**2
entropy_loss = -entropia
total_loss = policy_loss + value_loss + 0.01*entropy_loss
total_loss.backward(retain_graph = True)
optimizer.step()
reward_means.append(rewards.numpy().mean())
old_network.load_state_dict(nn_paramD)
nn_paramD = network.state_dict()
steps += 1
plot_rewards(reward_means)