Problem getting DQN to learn CartPole-v1 (PyTorch)

So I had my DQN training fine; it solves the environment after ~65_000 iterations. However, I started working on something else and now it's completely broken and won't get even close to the same level anymore.
Following advice from previous work, I tuned the hyperparameters but still didn't see the same results.
import gym
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
from models import DQN
from memory import Memory
from utils import wrap_input, epsilon_greedy
def main() -> int:
env = gym.make("CartPole-v1")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Online and offline model for learning
model = DQN(env.observation_space, env.action_space, 24).to(device)
target = DQN(env.observation_space, env.action_space, 24).to(device)
target.eval()
# Optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=.001)
loss_fn = F.smooth_l1_loss
memory = Memory(10_000)
obs, info = env.reset()
for it in range(65_000):
# Do this for the batch norm
model.eval()
# Maybe explore
if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
state = wrap_input(obs, device).unsqueeze(0)
action = model(state).argmax().item()
else:
action = env.action_space.sample()
# Act in environment and store the memory
next_state, reward, done, truncated, info = env.step(action)
if truncated or done:
next_state = np.zeros(env.observation_space.shape)
memory.store([obs, action, reward, int(done), next_state])
done = done or truncated
if done:
obs, info = env.reset()
# Train
if len(memory) > 32:
model.train()
states, actions, rewards, dones, next_states = memory.sample(32)
# Wrap the sampled values and move them to the device
states = wrap_input(states, device)
actions = wrap_input(actions, device, torch.int64, reshape=True)
next_states = wrap_input(next_states, device)
rewards = wrap_input(rewards, device, reshape=True)
dones = wrap_input(dones, device, reshape=True)
# Get current q-values
qs = model(states)
qs = torch.gather(qs, dim=1, index=actions)
# Compute target q-values
with torch.no_grad():
next_qs, _ = target(next_states).max(dim=1)
next_qs = next_qs.reshape(-1, 1)
target_qs = rewards + .9 * (1 - dones) * next_qs.reshape(-1, 1)
# Compute loss
loss = loss_fn(qs, target_qs)
optimizer.zero_grad()
loss.backward()
# Clip gradients
nn.utils.clip_grad_norm_(model.parameters(), 1)
# Backprop
optimizer.step()
# soft update
with torch.no_grad():
for target_param, local_param in zip(target.parameters(), model.parameters()):
target_param.data.copy_(1e-2 * local_param.data + (1 - 1e-2) * target_param.data)
if it % 200 == 0:
target.load_state_dict(model.state_dict())
# models.py
class FlatExtractor(nn.Module):
'''Does nothing but pass the input on'''
def __init__(self, obs_space):
super(FlatExtractor, self).__init__()
self.n_flatten = obs_space.shape[0]
def forward(self, obs):
return obs
class DQN(nn.Module):
def __init__(self, obs_space, act_space, layer_size):
super(DQN, self).__init__()
# Feature extractor
if len(obs_space.shape) == 1:
self.feature_extractor = FlatExtractor(obs_space)
elif len(obs_space.shape) == 3:
self.feature_extractor = NatureCnn(obs_space)
else:
raise NotImplementedError("This type of environment is not supported")
# Neural network
self.net = nn.Sequential(
nn.Linear(self.feature_extractor.n_flatten, layer_size),
nn.BatchNorm1d(layer_size),
nn.ReLU(),
nn.Linear(layer_size, layer_size),
nn.BatchNorm1d(layer_size),
nn.ReLU(),
nn.Linear(layer_size, act_space.n),
)
def forward(self, obs):
return self.net(self.feature_extractor(obs))
# memory.py
import random
from collections import deque
class Memory(object):
def __init__(self, maxlen):
self.memory = deque(maxlen=maxlen)
def store(self, experience):
self.memory.append(experience)
def sample(self, n_samples):
return zip(*random.sample(self.memory, n_samples))
def __len__(self):
return len(self.memory)
# utils.py
def wrap_input(arr, device, dtype=torch.float, reshape=False):
output = torch.from_numpy(np.array(arr)).type(dtype).to(device)
if reshape:
output = output.reshape(-1, 1)
return output
def epsilon_greedy(start, end, n_steps, it):
return max(start - (start - end) * (it / n_steps), end)
Is there something that I'm greatly missing? I've tried training for longer and it doesn't change. The biggest problem seems to be that the loss explodes, and even changing the tau for hard updates didn't seem to fix it.

I had a lot of difficulty getting your code to run, so I had to comment several things out. I also commented out things that added unnecessary complexity while debugging; for instance, a simple environment like CartPole doesn't require a target network. Also, focus more on the total reward gained rather than on the loss.
A few major changes that I made were -
At the end of the iteration, the next_state should become the current_state -
obs = next_state
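For reference, in the fixed loop below this assignment goes right after the transition is stored:
memory.store([obs, action, reward, int(done), next_state])
obs = next_state  # the new observation becomes the current one
if done:
    obs = env.reset()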
I swapped your explore and exploit code
if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
state = wrap_input(obs, device).unsqueeze(0)
action = model(state).argmax().item()
else:
action = env.action_space.sample()
Your code basically starts off exploiting by taking the argmax and once the epsilon value is low enough, it starts randomly sampling. This needs to be swapped.
I replaced it with -
if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
action = env.action_space.sample()
else:
state = wrap_input(obs, device).unsqueeze(0)
action = model(state).argmax().item()
I increased your batch size. A larger batch size in CartPole speeds up training considerably -
states, actions, rewards, dones, next_states = memory.sample(128)
Also, it is a good idea to wait for your model to gain sufficient experiences before starting training -
if len(memory) > 500:
model.train()
states, actions, rewards, dones, next_states = memory.sample(128)
The other changes I made were to ease debugging.
I didn't see any use for the FlatExtractor(nn.Module) class, so I removed it and made the following change -
if len(obs_space.shape) == 1:
self.feature_extractor = env.observation_space.shape[0]
def forward(self, obs):
return self.net(obs)
I removed all instances of BatchNorm
Replaced the loss with MSELoss and removed the gradient clipping
loss_fn = nn.MSELoss()
Changed the learning rate to lr=.0001
Increased the width of your neural network -
model = DQN(env.observation_space, env.action_space, 128).to(device)
Removed the target network and its corresponding soft updates.
Added total reward tracking to check whether the algorithm is learning
tot_rew = 0
for it in range(65_000):
next_state, reward, done, info = env.step(action)
tot_rew += reward
if done:
print("tot_rew = ", tot_rew)
obs = env.reset()
tot_rew = 0
Here is the total reward I get at the end -
tot_rew = 228.0
tot_rew = 472.0
tot_rew = 243.0
tot_rew = 300.0
Here is the entire fixed code -
import gym
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
env = gym.make("CartPole-v1")
def main() -> int:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Online and offline model for learning
model = DQN(env.observation_space, env.action_space, 128).to(device)
target = DQN(env.observation_space, env.action_space, 24).to(device)
# target.eval()
# Optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=.0001)
loss_fn = nn.MSELoss()
memory = Memory(10_000)
obs = env.reset()
tot_rew = 0
for it in range(65_000):
# print("it = ", it)
# Do this for the batch norm
# model.eval()
# Maybe explore
if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
action = env.action_space.sample()
else:
state = wrap_input(obs, device).unsqueeze(0)
action = model(state).argmax().item()
# print("epsilon_greedy(1.0, .01, 15_000, it) = ", epsilon_greedy(1.0, .01, 15_000, it))
# print("check = ", model(state).detach().numpy())
# print("action = ", action)
# Act in environment and store the memory
next_state, reward, done, info = env.step(action)
tot_rew += reward
if done:
next_state = np.zeros(env.observation_space.shape)
memory.store([obs, action, reward, int(done), next_state])
done = done
obs = next_state
if done:
print("tot_rew = ", tot_rew)
obs = env.reset()
tot_rew = 0
# Train
if len(memory) > 500:
model.train()
states, actions, rewards, dones, next_states = memory.sample(128)
# Wrap the sampled values and move them to the device
states = wrap_input(states, device)
# print("states.shape = ",states.shape)
actions = wrap_input(actions, device, torch.int64, reshape=True)
next_states = wrap_input(next_states, device)
rewards = wrap_input(rewards, device, reshape=True)
dones = wrap_input(dones, device, reshape=True)
# Get current q-values
qs = model(states)
# print("qs.shape = ", qs.shape)
qs = torch.gather(qs, dim=1, index=actions)
# Compute target q-values
with torch.no_grad():
next_qs, _ = model(next_states).max(dim=1)
next_qs = next_qs.reshape(-1, 1)
target_qs = rewards + .9 * (1 - dones) * next_qs.reshape(-1, 1)
# Compute loss
loss = loss_fn(qs, target_qs)
# print("loss.shape = ", loss)
optimizer.zero_grad()
loss.backward()
# Clip gradients
# nn.utils.clip_grad_norm_(model.parameters(), 1)
# Backprop
optimizer.step()
# soft update
# with torch.no_grad():
# for target_param, local_param in zip(target.parameters(), model.parameters()):
# target_param.data.copy_(1e-2 * local_param.data + (1 - 1e-2) * target_param.data)
# if it % 200 == 0:
# target.load_state_dict(model.state_dict())
# models.py
class FlatExtractor(nn.Module):
'''Does nothing but pass the input on'''
def __init__(self, obs_space):
super(FlatExtractor, self).__init__()
self.n_flatten = 1
def forward(self, obs):
return obs
class DQN(nn.Module):
def __init__(self, obs_space, act_space, layer_size):
super(DQN, self).__init__()
# Feature extractor
if len(obs_space.shape) == 1:
self.feature_extractor = env.observation_space.shape[0]
elif len(obs_space.shape) == 3:
self.feature_extractor = NatureCnn(obs_space)
else:
raise NotImplementedError("This type of environment is not supported")
# Neural network
self.net = nn.Sequential(
nn.Linear(self.feature_extractor, layer_size),
nn.ReLU(),
nn.Linear(layer_size, layer_size),
nn.ReLU(),
nn.Linear(layer_size, act_space.n),
)
def forward(self, obs):
return self.net(obs)
# memory.py
import random
from collections import deque
class Memory(object):
def __init__(self, maxlen):
self.memory = deque(maxlen=maxlen)
def store(self, experience):
self.memory.append(experience)
def sample(self, n_samples):
return zip(*random.sample(self.memory, n_samples))
def __len__(self):
return len(self.memory)
# utils.py
def wrap_input(arr, device, dtype=torch.float, reshape=False):
output = torch.from_numpy(np.array(arr)).type(dtype).to(device)
if reshape:
output = output.reshape(-1, 1)
return output
def epsilon_greedy(start, end, n_steps, it):
return max(start - (start - end) * (it / n_steps), end)
main()

DQN Pytorch Loss keeps increasing

I am implementing a simple DQN algorithm using PyTorch to solve the CartPole environment from gym. I have been debugging for a while now, and I can't figure out why the model is not learning.
Observations:
using SmoothL1Loss performs worse than MSELoss, but the loss increases for both
a smaller LR in Adam does not help; I have tested 0.0001, 0.00025, 0.0005 and the default
Notes:
I have debugged various parts of the algorithm individually and can say with good confidence that the issue is in the learn function. I am wondering if this bug is due to me misunderstanding detach in PyTorch or some other framework mistake I'm making.
I am trying to stick as close to the original paper as possible (linked above)
References:
example: GitHub gist
example: PyTorch official
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import gym
import numpy as np
class ReplayBuffer:
def __init__(self, mem_size, input_shape, output_shape):
self.mem_counter = 0
self.mem_size = mem_size
self.input_shape = input_shape
self.actions = np.zeros(mem_size)
self.states = np.zeros((mem_size, *input_shape))
self.states_ = np.zeros((mem_size, *input_shape))
self.rewards = np.zeros(mem_size)
self.terminals = np.zeros(mem_size)
def sample(self, batch_size):
indices = np.random.choice(self.mem_size, batch_size)
return self.actions[indices], self.states[indices], \
self.states_[indices], self.rewards[indices], \
self.terminals[indices]
def store(self, action, state, state_, reward, terminal):
index = self.mem_counter % self.mem_size
self.actions[index] = action
self.states[index] = state
self.states_[index] = state_
self.rewards[index] = reward
self.terminals[index] = terminal
self.mem_counter += 1
class DeepQN(nn.Module):
def __init__(self, input_shape, output_shape, hidden_layer_dims):
super(DeepQN, self).__init__()
self.input_shape = input_shape
self.output_shape = output_shape
layers = []
layers.append(nn.Linear(*input_shape, hidden_layer_dims[0]))
for index, dim in enumerate(hidden_layer_dims[1:]):
layers.append(nn.Linear(hidden_layer_dims[index], dim))
layers.append(nn.Linear(hidden_layer_dims[-1], *output_shape))
self.layers = nn.ModuleList(layers)
self.loss = nn.MSELoss()
self.optimizer = T.optim.Adam(self.parameters())
def forward(self, states):
for layer in self.layers[:-1]:
states = F.relu(layer(states))
return self.layers[-1](states)
def learn(self, predictions, targets):
self.optimizer.zero_grad()
loss = self.loss(input=predictions, target=targets)
loss.backward()
self.optimizer.step()
return loss
class Agent:
def __init__(self, epsilon, gamma, input_shape, output_shape):
self.input_shape = input_shape
self.output_shape = output_shape
self.epsilon = epsilon
self.gamma = gamma
self.q_eval = DeepQN(input_shape, output_shape, [64])
self.memory = ReplayBuffer(10000, input_shape, output_shape)
self.batch_size = 32
self.learn_step = 0
def move(self, state):
if np.random.random() < self.epsilon:
return np.random.choice(*self.output_shape)
else:
self.q_eval.eval()
state = T.tensor([state]).float()
action = self.q_eval(state).max(axis=1)[1]
return action.item()
def sample(self):
actions, states, states_, rewards, terminals = \
self.memory.sample(self.batch_size)
actions = T.tensor(actions).long()
states = T.tensor(states).float()
states_ = T.tensor(states_).float()
rewards = T.tensor(rewards).view(self.batch_size).float()
terminals = T.tensor(terminals).view(self.batch_size).long()
return actions, states, states_, rewards, terminals
def learn(self, state, action, state_, reward, done):
self.memory.store(action, state, state_, reward, done)
if self.memory.mem_counter < self.batch_size:
return
self.q_eval.train()
self.learn_step += 1
actions, states, states_, rewards, terminals = self.sample()
indices = np.arange(self.batch_size)
q_eval = self.q_eval(states)[indices, actions]
q_next = self.q_eval(states_).detach()
q_target = rewards + self.gamma * q_next.max(axis=1)[0] * (1 - terminals)
loss = self.q_eval.learn(q_eval, q_target)
self.epsilon *= 0.9 if self.epsilon > 0.1 else 1.0
return loss.item()
def learn(env, agent, episodes=500):
print('Episode: Mean Reward: Last Loss: Mean Step')
rewards = []
losses = [0]
steps = []
num_episodes = episodes
for episode in range(num_episodes):
done = False
state = env.reset()
total_reward = 0
n_steps = 0
while not done:
action = agent.move(state)
state_, reward, done, _ = env.step(action)
loss = agent.learn(state, action, state_, reward, done)
state = state_
total_reward += reward
n_steps += 1
if loss:
losses.append(loss)
rewards.append(total_reward)
steps.append(n_steps)
if episode % (episodes // 10) == 0 and episode != 0:
print(f'{episode:5d} : {np.mean(rewards):5.2f} '
f': {np.mean(losses):5.2f}: {np.mean(steps):5.2f}')
rewards = []
losses = [0]
steps = []
print(f'{episode:5d} : {np.mean(rewards):5.2f} '
f': {np.mean(losses):5.2f}: {np.mean(steps):5.2f}')
return losses, rewards
if __name__ == '__main__':
env = gym.make('CartPole-v1')
agent = Agent(1.0, 1.0,
env.observation_space.shape,
[env.action_space.n])
learn(env, agent, 500)
The main problem, I think, is the discount factor, gamma. You are setting it to 1.0, which means that you are giving the same weight to future rewards as to the current one. Usually in reinforcement learning we care more about the immediate reward than the future, so gamma should always be less than 1.
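With your Agent constructor, gamma is the second positional argument, so the change is a single line:
agent = Agent(1.0, 0.99,
              env.observation_space.shape,
              [env.action_space.n])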
Just to give it a try I set gamma = 0.99 and ran your code:
Episode: Mean Reward: Last Loss: Mean Step
100 : 34.80 : 0.34: 34.80
200 : 40.42 : 0.63: 40.42
300 : 65.58 : 1.78: 65.58
400 : 212.06 : 9.84: 212.06
500 : 407.79 : 19.49: 407.79
As you can see the loss still increases (even if not as much as before), but so does the reward. You should keep in mind that the loss here is not a good metric for performance, because you have a moving target. You can reduce the instability of the target by using a target network. With additional parameter tuning and a target network, one could probably make the loss even more stable.
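If you want to try that, a rough sketch on top of your Agent class could look like this. AgentWithTarget, update_interval and the 100-step sync interval are just illustrative choices of mine; the rest reuses your classes and imports, and it is untested:
class AgentWithTarget(Agent):
    """Variant of the question's Agent with a separate, periodically synced target net."""
    def __init__(self, epsilon, gamma, input_shape, output_shape, update_interval=100):
        super().__init__(epsilon, gamma, input_shape, output_shape)
        self.q_target = DeepQN(input_shape, output_shape, [64])
        self.q_target.load_state_dict(self.q_eval.state_dict())
        self.update_interval = update_interval

    def learn(self, state, action, state_, reward, done):
        self.memory.store(action, state, state_, reward, done)
        if self.memory.mem_counter < self.batch_size:
            return
        self.q_eval.train()
        self.learn_step += 1
        actions, states, states_, rewards, terminals = self.sample()
        indices = np.arange(self.batch_size)
        q_eval = self.q_eval(states)[indices, actions]
        # Bootstrap from the frozen target network instead of the online network.
        q_next = self.q_target(states_).detach()
        td_target = rewards + self.gamma * q_next.max(axis=1)[0] * (1 - terminals)
        loss = self.q_eval.learn(q_eval, td_target)
        # Hard-sync the target network every update_interval learning steps.
        if self.learn_step % self.update_interval == 0:
            self.q_target.load_state_dict(self.q_eval.state_dict())
        self.epsilon *= 0.9 if self.epsilon > 0.1 else 1.0
        return loss.item()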
Also, note that in reinforcement learning the loss value is generally not as important as it is in supervised learning; a decrease in loss does not always imply an improvement in performance, and vice versa.
The problem is that the Q target is moving while the training steps happen; as the agent plays, predicting the correct sum of rewards gets extremely hard (e.g. more states and rewards explored means higher reward variance), so the loss increases. This is even clearer in more complex environments (more states, varied rewards, etc.).
At the same time the Q network is getting better at approximating the Q values for each action, so the rewards (could) increase.

Keras Double DQN average reward decreases over time and is unable to converge

I am attempting to teach a Double DQN agent to run a gridworld where there is one seeker (the agent) who tries to collect all the hiders, which are randomly spawned. Every step has a path_cost of -0.1, and if a hider is collected a reward of 1 is received. The DQN net receives an array with the shape (world_width, world_height, 1) as the state, which is a complete representation of the environment viewed from above, where empty space is described as 0, the seeker as 2, and a hider as 3. The agent is then supposed to choose one action: left, up, right, or down. An example configuration of the environment is shown in the image below.
gridworld
However, when training my agent the reward initially decreases in correlation with the decreasing exploration, so it can be assumed that when the agent follows the DQN net it performs worse than when choosing actions randomly. Here are a few examples of the reward graphs I got when training with different hyperparameters (the y-axis is total steps, where each episode is 100 steps unless it finishes early).
Reward Graph
As seen, the agent becomes worse at solving the environment, and the curve stabilizes approximately when epsilon reaches my min_epsilon (meaning almost no exploration or random moves).
I have tried different hyperparameters without any apparent difference in results, and would appreciate it if someone could give me a pointer to where the problem might be.
The hyperparameters I have mostly been using are:
wandb.config.epsilon = 1.0
wandb.config.epsilon_decay = 0.99
wandb.config.batch_size = 32
wandb.config.learning_rate = 1e-3
wandb.config.gamma = 0.8
wandb.config.min_epsilon = 1e-1
wandb.config.buffersize = 10000
wandb.config.epochs = 1
wandb.config.reward_discount = 0.01
wandb.config.episodes = 1000
And here is my code:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.optimizers import Adam
from collections import deque
from termcolor import colored
import wandb
from wandb.keras import WandbCallback
import numpy as np
import copy, os, random
from argparse import ArgumentParser
from plotter import plotter
from HNS import HNS
tf.keras.backend.set_floatx('float64')
wandb.init(name=name, project=project)
wandb.env.name = "HNS"
wandb.env.world_size = (8, 8)
wandb.env.state_dim = (8, 8, 1)
wandb.env.hider_count = 2
wandb.env.action_dim = 4
wandb.env.random_spawn = True
wandb.env.max_steps = 100
wandb.config.node = node
wandb.config.epsilon = 1.0
wandb.config.epsilon_decay = 0.99
wandb.config.batch_size = 32
wandb.config.learning_rate = 1e-3
wandb.config.gamma = 0.8
wandb.config.min_epsilon = 1e-1
wandb.config.buffersize = 10000
wandb.config.epochs = 1
wandb.config.reward_discount = 0.01
wandb.config.episodes = 1000
wandb.config.conv1_kernel = (8,8)
wandb.config.conv1_filters = 16
wandb.config.conv1_strides = 4
wandb.config.conv1_activation = "relu"
wandb.config.conv1_padding = "same"
wandb.config.conv2_kernel = (4,4)
wandb.config.conv2_filters = 32
wandb.config.conv2_strides = 4
wandb.config.conv2_activation = "relu"
wandb.config.conv2_padding = "same"
wandb.config.dense1_neurons = 16
wandb.config.dense1_activation = "relu"
wandb.config.loss = "mse"
parser = ArgumentParser()
parser.add_argument('--hider_count', type=int, default=wandb.env.hider_count)
parser.add_argument('--max_steps', type=int, default=wandb.env.max_steps)
parser.add_argument('--epsilon_decay', type=float, default=wandb.config.epsilon_decay)
parser.add_argument('--min_epsilon', type=float, default=wandb.config.min_epsilon)
parser.add_argument('--learning_rate', type=float, default=wandb.config.learning_rate)
parser.add_argument('--gamma', type=float, default=wandb.config.gamma)
parser.add_argument('--reward_discount', type=float, default=wandb.config.reward_discount)
parser.add_argument('--episodes', type=int, default=wandb.config.episodes)
parser.add_argument('--batch_size', type=int, default=wandb.config.batch_size)
args, unknown = parser.parse_known_args()
wandb.config.update(args, allow_val_change=True)
class ReplayBuffer:
def __init__(self):
self.buffer = deque(maxlen=wandb.config.buffersize)
def put(self, state, action, reward, next_state, done):
self.buffer.append([state, action, reward, next_state, done])
def sample(self):
sample = random.sample(self.buffer, wandb.config.batch_size)
states, actions, rewards, next_states, done = map(np.asarray, zip(*sample))
return states, actions, rewards, next_states, done
def size(self):
return len(self.buffer)
class ActionStatemodel:
def __init__(self):
self.epsilon = wandb.config.epsilon
self.model = self.create_model()
def create_model(self):
# Init model
model = tf.keras.Sequential()
# Set up layers
model.add(Conv2D(filters=wandb.config.conv1_filters, kernel_size=wandb.config.conv1_kernel, activation=wandb.config.conv1_activation,
strides=wandb.config.conv1_strides, padding=wandb.config.conv1_padding, name="conv_1", input_shape=wandb.env.state_dim))
model.add(Conv2D(filters=wandb.config.conv2_filters, kernel_size=wandb.config.conv2_kernel, activation=wandb.config.conv2_activation,
strides=wandb.config.conv2_strides, padding=wandb.config.conv2_padding, name="conv_2"))
model.add(Flatten())
model.add(Dense(units=wandb.config.dense1_neurons, activation=wandb.config.dense1_activation, name="dense_1"))
model.add(Dense(wandb.env.action_dim, name="dense_2"))
# Finalize model
model.compile(loss=wandb.config.loss, optimizer=Adam(wandb.config.learning_rate))
model.summary()
return model
# Get q-values from state
def predict(self, state):
return self.model.predict(state)
# Get action from
def get_action(self, state):
# Predict action
state = np.expand_dims(state, axis=0)
q_value = self.predict(state)
if np.random.random() < self.epsilon: return random.randint(0, wandb.env.action_dim - 1), 1
else: return np.argmax(q_value), 0
def train(self, states, targets):
history = self.model.fit(states, targets, epochs=wandb.config.epochs, callbacks=[WandbCallback()], verbose=2, use_multiprocessing=True)
return history.history["loss"][0]
class Agent:
def __init__(self, env):
self.env = env
self.predict_net = ActionStatemodel()
self.target_net = ActionStatemodel()
self.target_update()
self.buffer = ReplayBuffer()
# Copy weights from model to target_model
def target_update(self):
weights = self.predict_net.model.get_weights()
self.target_net.model.set_weights(weights)
def replay(self):
loss = 0
for _ in range(5):
states, actions, rewards, next_states, done = self.buffer.sample()
# Collect predicted actions from predict_net
predicted_q_values = self.predict_net.predict(next_states)
predicted_actions = np.argmax(predicted_q_values, axis=1)
# Get q values from target_net of above predicted actions
target_q_values = self.target_net.predict(next_states)
target_action_q_values = [np.take(target_q_values[i], predicted_actions[i]) for i in range(len(target_q_values))]
# Create targets based on q values, reward and done
targets = predicted_q_values.copy()
targets[range(wandb.config.batch_size), actions] = rewards + (1 - done) * target_action_q_values * args.gamma
loss += self.predict_net.train(states, targets)
return loss
def train(self):
# Save weights for heatmap rendering
# Main training loop
for ep in range(wandb.config.episodes):
# Initialization
done, total_reward, step, loss, exploration = False, 0, 0, 0, 0
state = self.env.reset()
while not done and step < wandb.env.max_steps:
# Predict and perform action
action, e = self.predict_net.get_action(state)
exploration += e
next_state, reward, done, _ = self.env.step(action)
self.buffer.put(state, action, reward * wandb.config.reward_discount, next_state, done)
total_reward += reward
if self.buffer.size() >= 1000 and step % 10 == 0:
loss = self.replay()
state = next_state
step += 1
self.target_update()
# Update epsilon
self.predict_net.epsilon = max(wandb.config.epsilon_decay * self.predict_net.epsilon, wandb.config.min_epsilon)
# Calculate weights change and log weights
pre_weights = self.get_weights(self.predict_net.model.layers)
tar_weights = self.get_weights(self.target_net.model.layers)
# LOG
print(colored("EP" + str(ep) + "-Reward: " + str(total_reward) + " Done: " + str(done), "green"))
wandb.log({"episode" : ep,
"buffersize" : self.buffer.size(),
"EpReward" : total_reward,
"epsilon" : self.predict_net.epsilon,
"done" : int(done),
"Exploration" : exploration / _,
"loss" : loss,
"pre_weights" : pre_weights,
"tar_weights" : tar_weights
})
# "weigthUpdate" : wandb.Image(neuron_map),
# Get weights and names for every layer of nn model
def get_weights(self, layers):
weights = []
names = []
for layer in layers:
wb = layer.get_weights()
if wb:
weights.append(wb[0].flatten())
names.append(layer.name)
return weights, names
if __name__ == "__main__":
env = HNS(random_spawn=wandb.env.random_spawn, world_size=wandb.env.world_size, hider_count=wandb.env.hider_count)
agent = Agent(env=env)
agent.train()
agent.target_net.model.save(os.path.join(wandb.run.dir, "model.h5"))

Deep Q - Learning for Cartpole with Tensorflow in Python

I know there are many similar topics discussed on Stack Overflow, but I have done quite a lot of research both on Stack Overflow and on the Internet and I couldn't find a solution.
I am trying to implement the classic Deep Q Learning Algorithm to solve the openAI gym's cartpole game:
OpenAI Gym Cartpole
Firstly, I created an agent that generates random weights. The results are shown in the graph below:
Amazingly, the agent managed to reach 200 steps (which is the max) in many episodes by simply generating 4 random uniform weights [w1, w2, w3, w4] from (-1.0 to 1.0) in each episode.
So, I decided to implement a simple DQN with only 4 weights and 2 biases and to make the agent learn this game over time. The weights are initialized randomly at the beginning, and back-propagation is used to update them as the agent takes steps.
I used the epsilon-greedy strategy to make the agent explore at the beginning and exploit the Q-values later on. However, the results are disappointing compared to the random agent:
I have tried to tune a lot of parameters and different architectures and the result doesn't change as much. So, my question is the following:
Question:
Did I implement DQN incorrectly, or can a simple DQN not beat CartPole? What's your experience? It does reduce the loss (error), but that doesn't guarantee a good solution.
Thanks in advance.
import tensorflow as tf
import gym
import numpy as np
import random as rand
import matplotlib.pyplot as plt
# Cartpole's Observation:
# 4 Inputs
# 2 Actions (LEFT | RIGHT)
input_size = 4
output_size = 2
# Deep Q Network Class
class DQN:
def __init__(self, var_names):
self.var_names = var_names
self._define_placeholders()
self._add_layers()
self._define_loss()
self._choose_optimizer()
self._initialize()
# Placeholders:
# Inputs: The place where we feed the Observations (States).
# Targets: Q_target = R + gamma*Q(s', a*).
def _define_placeholders(self):
self.inputs = tf.placeholder(tf.float32, shape=(None, input_size), name='inputs')
self.targets = tf.placeholder( tf.float32, shape=(None, output_size), name='targets')
# Layers:
# 4 Input Weights.
# 2 Biases.
# output = softmax(inputs*weights + biases).
# Weights and biases are initialized randomly.
def _add_layers(self):
w = tf.get_variable(name=self.var_names[0], shape=(input_size, output_size),
initializer=tf.initializers.random_uniform(minval=-1.0, maxval=1.0) )
b = tf.get_variable(name=self.var_names[1], shape=(output_size),
initializer=tf.initializers.random_uniform(minval=-1.0, maxval=1.0) )
self.outputs = tf.nn.softmax(tf.matmul(self.inputs, w) + b)
self.prediction = tf.argmax(self.outputs, 1)
# Loss = MSE.
def _define_loss(self):
self.mean_loss = tf.losses.mean_squared_error(labels=self.targets, predictions=self.outputs) / 2
# AdamOptimizer with starting learning rate: a = 0.005.
def _choose_optimizer(self):
self.optimizer = tf.train.AdamOptimizer(learning_rate=0.005).minimize(loss=self.mean_loss)
# Initializes the dqn's weights.
def _initialize(self):
initializer = tf.global_variables_initializer()
self.sess = tf.InteractiveSession()
self.sess.run(initializer)
# Get's current's DQN weights.
def get_weights(self):
return [ self.sess.run( tf.trainable_variables(var) )[0] for var in self.var_names ]
# Updates the weights of DQN.
def update_weights(self, new_weights):
variables = [tf.trainable_variables(name)[0] for name in self.var_names]
update = [ tf.assign(var, weight) for (var, weight) in zip(variables, new_weights) ]
self.sess.run(update)
# Predicts the best possible action from a state s.
# a* = argmax( Q(s) )
# Returns from Q(s), a*
def predict(self, states):
Q, actions = self.sess.run( [self.outputs, self.prediction],
feed_dict={self.inputs: states} )
return Q, actions
# It partially fits the given observations and the targets into the network.
def partial_fit(self, states, targets):
_, loss = self.sess.run( [self.optimizer, self.mean_loss],
feed_dict={self.inputs: states, self.targets: targets} )
return loss
# Replay Memory Buffer
# It stores experiences as (s,a,r,s') --> (State, Action, Reward, Next_Action).
# It generates random mini-batches of experiences from the memory.
# If the memory is full, then it deletes the oldest experiences. Experience is an step.
class ReplayMemory:
def __init__(self, mem_size):
self.mem_size = mem_size
self.experiences = []
def add_experience(self, xp):
self.experiences.append(xp)
if len(self.experiences) > self.mem_size:
self.experiences.pop(0)
def random_batch(self, batch_size):
if len(self.experiences) < batch_size:
return self.experiences
else:
return rand.sample(self.experiences, batch_size)
# The agent's class.
# It contains 2 DQNs: Online DQN for Predictions and Target DQN for the targets.
class Agent:
def __init__(self, epsilon, epsilon_decay, min_epsilon, gamma, mem_size):
self.epsilon = epsilon
self.epsilon_decay = epsilon_decay
self.min_epsilon = min_epsilon
self.gamma = gamma
self.replay_mem = ReplayMemory(mem_size)
self.online_dqn = DQN( var_names=['online_w', 'online_b'] )
self.target_dqn = DQN( var_names=['target_w', 'target_b'] )
self.state = None
def set_epsilon(self, epsilon):
self.epsilon = epsilon
def reduce_epsilon(self):
if self.epsilon > self.min_epsilon:
self.epsilon -= self.epsilon_decay
def update_state(self, state):
self.state = state
def update_memory(self, state, action, reward, next_state):
experience = (state, action, reward, next_state)
self.replay_mem.add_experience(experience)
# It updates the target network after N steps.
def update_network(self):
self.target_dqn.update_weights( self.online_dqn.get_weights() )
# Randomly chooses an action from the enviroment.
def explore(self, env):
action = env.action_space.sample()
return action
# Predicts and chooses the best possible moves from the current state.
def exploit(self):
_, action = self.online_dqn.predict(self.state)
return action[0]
# Uses Epsilon-Greedy to decide whether to explore or exploit.
# Epsilon starts with 1 and is reduced over the time.
# After the agent makes a move, he returns: state, action, reward, next_state.
def take_action(self, env):
action = None
p = rand.uniform(0.0, 1.0)
if p < self.epsilon:
action = self.explore(env)
else:
action = self.exploit()
next_state, reward, done, _ = env.step(action)
if done:
next_state = None
else:
next_state = np.reshape( next_state, (1, input_size) )
return self.state, action, reward, next_state, done
# Trains the agent.
# A random mini-batch is generated from the memory.
# We feed each experience into the DQN.
# For each
# Q(s) = Qtarget(s)
# Q(s'), a* = Qtarget(s'), argmax Q(s')
# We set targets = Q(s')
# For each action (a), reward (r), next_state (s') in the batch:
# If s' is None the GameOver. So, we set target[i] = Reward
# If s' != None, then target[i][a] = r + gamma*Q(s', 'a')
# Then, the online DQN calculates the mean squared difference of r + gamma*Q(s', 'a') - Q(s, a)
# and uses Back-Propagation to update the weights.
def train(self):
mini_batch = self.replay_mem.random_batch(batch_size=256)
batch_size = len(mini_batch)
states = np.zeros( shape=(batch_size, input_size) )
next_states = np.zeros( shape=(batch_size, input_size) )
for i in range(batch_size):
states[i] = mini_batch[i][0]
next_states[i] = mini_batch[i][3]
Q, _ = self.target_dqn.predict(states)
next_Q, next_actions = self.target_dqn.predict(next_states)
targets = Q
for i in range(batch_size):
action = mini_batch[i][1]
reward = mini_batch[i][2]
next_state = mini_batch[i][3]
if next_state is None:
targets[i][action] = reward
else:
targets[i][action] = reward + self.gamma * next_Q[i][ next_actions[i] ]
loss = self.online_dqn.partial_fit(states, targets)
return loss
def play(agent, env, episodes, N, render=False, train=True):
ep = 0
episode_steps = []
steps = 0
total_steps = 0
loss = 0
# Sets the current state as the initial.
# Cartpole spawns the agent in a random state.
agent.update_state( np.reshape( env.reset(), (1, input_size) ) )
agent.update_network()
while ep < episodes:
if render:
env.render()
# The target DQN's weights are frozen.
# The agent Updates the Target DQN's Weights after 100 steps.
if train and total_steps % N == 0:
agent.update_network()
print('---Target network updated---')
# Takes action.
state, action, reward, next_state, done = agent.take_action(env)
# Updates the memory and the current state.
agent.update_memory(state, action, reward, next_state)
agent.update_state(next_state)
steps += 1
total_steps += 1
if train:
loss = agent.train()
if done:
agent.update_state( np.reshape( env.reset(), (1, input_size) ) )
episode_steps.append(steps)
ep += 1
if train:
agent.reduce_epsilon()
print('End of episode', ep, 'Training loss =', loss, 'Steps =', steps)
steps = 0
if render:
env.close()
return episode_steps
env = gym.make('CartPole-v0')
# Training the agent.
agent = Agent(epsilon=1, epsilon_decay = 0.01, min_epsilon = 0.05, gamma=0.9, mem_size=50000)
episodes = 1000
N = 100
episode_steps = play(agent, env, episodes, N)
# Plotting the results.
# After the training is done, the steps should be maximized (up to 200)
plt.plot(episode_steps)
plt.show()
# Testing the agent.
agent.set_epsilon(0)
episodes = 1
steps = play(agent, env, episodes, N, render=True, train=False)[0]
print('\nSteps =', steps)
The algorithm works quite well. When I decided to plot the data, I used the following metric:
Rewards / Episode
Most deep reinforcement learning frameworks (e.g. tf-agents) use the mean reward (e.g. mean reward per 10 episodes), which is why their plots look so smooth. If you look at the above plot, the agent manages to get a high score most of the time.
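If you want a similarly smooth curve from raw per-episode scores, a simple moving average over the recorded episode_steps is enough; for example (the window size of 10 is an arbitrary choice):
import numpy as np

def moving_average(values, window=10):
    # Mean over a sliding window of episodes, for smoother reward plots.
    kernel = np.ones(window) / window
    return np.convolve(np.asarray(values, dtype=float), kernel, mode='valid')

# e.g. plt.plot(moving_average(episode_steps, window=10))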
Also, I decided to improve the speed of the algorithm by using numpy operations rather than for loops. You can check out my implementation here:
https://github.com/kochlisGit/Deep-Reinforcement-Learning/tree/master/Custom%20DQN

DQN algorithm does not converge on CartPole-v0

Short Description of my model
I am trying to write my own DQN algorithm in Python, using TensorFlow and following the paper (Mnih et al., 2015). In the train_DQN function I have defined the training procedure, and DQN_CartPole defines the function approximation (a simple 3-layer neural network). For the loss function, Huber loss or MSE is implemented, followed by gradient clipping (between -1 and 1). I have also implemented a soft-update method instead of a hard update of the target network, i.e. partially copying the weights of the main network.
Question
I am trying it on the CartPole environment (OpenAI gym), but the reward does not improve as it does in other people's algorithms, such as keras-rl. Any help would be appreciated.
reward over timestep
If possible, could you have a look at the source code?
DQN model: https://github.com/Rowing0914/TF_RL/blob/master/agents/DQN_model.py
Training Script: https://github.com/Rowing0914/TF_RL/blob/master/agents/DQN_train.py
Reddit post: https://www.reddit.com/r/reinforcementlearning/comments/ba7o55/question_dqn_algorithm_does_not_work_well_on/?utm_source=share&utm_medium=web2x
class Parameters:
def __init__(self, mode=None):
assert mode != None
print("Loading Params for {} Environment".format(mode))
if mode == "Atari":
self.state_reshape = (1, 84, 84, 1)
self.num_frames = 1000000
self.memory_size = 10000
self.learning_start = 10000
self.sync_freq = 1000
self.batch_size = 32
self.gamma = 0.99
self.update_hard_or_soft = "soft"
self.soft_update_tau = 1e-2
self.epsilon_start = 1.0
self.epsilon_end = 0.01
self.decay_steps = 1000
self.prioritized_replay_alpha = 0.6
self.prioritized_replay_beta_start = 0.4
self.prioritized_replay_beta_end = 1.0
self.prioritized_replay_noise = 1e-6
elif mode == "CartPole":
self.state_reshape = (1, 4)
self.num_frames = 10000
self.memory_size = 20000
self.learning_start = 100
self.sync_freq = 100
self.batch_size = 32
self.gamma = 0.99
self.update_hard_or_soft = "soft"
self.soft_update_tau = 1e-2
self.epsilon_start = 1.0
self.epsilon_end = 0.01
self.decay_steps = 500
self.prioritized_replay_alpha = 0.6
self.prioritized_replay_beta_start = 0.4
self.prioritized_replay_beta_end = 1.0
self.prioritized_replay_noise = 1e-6
class _DQN:
"""
Boilerplate for DQN Agent
"""
def __init__(self):
"""
define the deep learning model here!
"""
pass
def predict(self, sess, state):
"""
predict q-values given a state
:param sess:
:param state:
:return:
"""
return sess.run(self.pred, feed_dict={self.state: state})
def update(self, sess, state, action, Y):
feed_dict = {self.state: state, self.action: action, self.Y: Y}
_, loss = sess.run([self.train_op, self.loss], feed_dict=feed_dict)
# print(action, Y, sess.run(self.idx_flattened, feed_dict=feed_dict))
return loss
class DQN_CartPole(_DQN):
"""
DQN Agent for CartPole game
"""
def __init__(self, scope, env, loss_fn ="MSE"):
self.scope = scope
self.num_action = env.action_space.n
with tf.variable_scope(scope):
self.state = tf.placeholder(shape=[None, 4], dtype=tf.float32, name="X")
self.Y = tf.placeholder(shape=[None], dtype=tf.float32, name="Y")
self.action = tf.placeholder(shape=[None], dtype=tf.int32, name="action")
fc1 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(self.state)
fc2 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(fc1)
fc3 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(fc2)
self.pred = tf.keras.layers.Dense(self.num_action, activation=tf.nn.relu)(fc3)
# indices of the executed actions
self.idx_flattened = tf.range(0, tf.shape(self.pred)[0]) * tf.shape(self.pred)[1] + self.action
# passing [-1] to tf.reshape means flatten the array
# using tf.gather, associate Q-values with the executed actions
self.action_probs = tf.gather(tf.reshape(self.pred, [-1]), self.idx_flattened)
if loss_fn == "huber_loss":
# use huber loss
self.losses = tf.subtract(self.Y, self.action_probs)
self.loss = huber_loss(self.losses)
elif loss_fn == "MSE":
# use MSE
self.losses = tf.squared_difference(self.Y, self.action_probs)
self.loss = tf.reduce_mean(self.losses)
else:
assert False
# you can choose whatever you want for the optimiser
# self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
self.optimizer = tf.train.AdamOptimizer()
# to apply Gradient Clipping, we have to directly operate on the optimiser
# check this: https://www.tensorflow.org/api_docs/python/tf/train/Optimizer#processing_gradients_before_applying_them
self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
self.clipped_grads_and_vars = [(ClipIfNotNone(grad, -1., 1.), var) for grad, var in self.grads_and_vars]
self.train_op = self.optimizer.apply_gradients(self.clipped_grads_and_vars)
def train_DQN(main_model, target_model, env, replay_buffer, policy, params):
"""
Train DQN agent which defined above
:param main_model:
:param target_model:
:param env:
:param params:
:return:
"""
# log purpose
losses, all_rewards, cnt_action = [], [], []
episode_reward, index_episode = 0, 0
with tf.Session() as sess:
# initialise all variables used in the model
sess.run(tf.global_variables_initializer())
state = env.reset()
start = time.time()
for frame_idx in range(1, params.num_frames + 1):
action = policy.select_action(sess, target_model, state.reshape(params.state_reshape))
cnt_action.append(action)
next_state, reward, done, _ = env.step(action)
replay_buffer.add(state, action, reward, next_state, done)
state = next_state
episode_reward += reward
if done:
index_episode += 1
state = env.reset()
all_rewards.append(episode_reward)
if frame_idx > params.learning_start and len(replay_buffer) > params.batch_size:
states, actions, rewards, next_states, dones = replay_buffer.sample(params.batch_size)
next_Q = target_model.predict(sess, next_states)
Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)
loss = main_model.update(sess, states, actions, Y)
# Logging and refreshing log purpose values
losses.append(np.mean(loss))
logging(frame_idx, params.num_frames, index_episode, time.time()-start, episode_reward, np.mean(loss), cnt_action)
episode_reward = 0
cnt_action = []
start = time.time()
if frame_idx > params.learning_start and frame_idx % params.sync_freq == 0:
# soft update means we partially add the original weights of target model instead of completely
# sharing the weights among main and target models
if params.update_hard_or_soft == "hard":
sync_main_target(sess, main_model, target_model)
elif params.update_hard_or_soft == "soft":
soft_target_model_update(sess, main_model, target_model, tau=params.soft_update_tau)
return all_rewards, losses
Modification
dones -> np.logical_not(dones)
np.argmax -> np.max
separating MSE from huber_loss
Briefly looking over, it seems that the dones variable is a binary vector where 1 denotes done, and 0 denotes not-done.
You then use dones here:
Y = rewards + params.gamma * np.argmax(next_Q, axis=1) * dones
So for all terminating transitions, you add the expected cumulative reward from following the policy for the rest of the episode (which is zero). For all non-terminating transitions, you do not add the expected cumulative reward.
I think you mean to do this the other way around, perhaps swap dones in the above line of code with np.logical_not(dones)?
Also, now that I look at it, it seems there is another major problem with this line. np.argmax(next_Q, axis=1) returns the index of the maximum value along each row of next_Q, not the actual maximum value. You need np.max(next_Q, axis=1) to get the maximum expected value over the next state's actions.
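Putting both fixes together, the target line would read:
Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)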
EDIT: The loss function is also very strangely defined. You are mixing Huber loss with mean squared error. If you want to use either huber_loss or MSE, you just compute it on the difference between the expected and predicted values. You appear to be doing both, which is certainly not a commonly defined loss function. For example, to use Huber loss your model's loss should just be:
self.loss = tf.reduce_mean(huber_loss(abs(self.Y - self.action_probs)))

DQN with prioritized experience replay and target network does not improve

I tried to code a neural network to solve OpenAI's CartPole environment with TensorFlow and Keras. The network uses prioritized experience replay and a separate target network which is updated every tenth episode. Here's the code:
import numpy as np
import gym
import matplotlib.pyplot as plt
from collections import deque
from tensorflow import keras
class agent:
def __init__(self,inputs,outputs,gamma):
self.gamma = gamma
self.epsilon = 1.0
self.epsilon_decay = 0.999
self.epsilon_min = 0.01
self.inputs = inputs
self.outputs = outputs
self.network = self._build_net()
self.target_network = self._build_net()
self.replay_memory = deque(maxlen=100000)
self.target_network.set_weights(self.network.get_weights())
def _build_net(self):
model = keras.models.Sequential()
model.add(keras.layers.Dense(10,activation='relu',input_dim=self.inputs))
model.add(keras.layers.Dense(10,activation='relu'))
model.add(keras.layers.Dense(10,activation='relu'))
model.add(keras.layers.Dense(self.outputs,activation='sigmoid'))
model.compile(optimizer='adam',loss='categorical_crossentropy')
return model
def act(self,state,testing=False):
if not testing:
if self.epsilon > np.random.rand():
return np.random.randint(self.outputs)
else:
return np.argmax(self.network.predict(np.array([state])))
else:
return np.argmax(self.network.predict(np.array([state])))
def remember(self,state,action,next_state,reward,win):
self.replay_memory.append((state,action,next_state,reward,win))
def get_batch(self):
batch = []
losses = []
targets = []
if len(self.replay_memory) >= 32:
for state,action,next_state,reward,done in self.replay_memory:
target_f = np.zeros([1,self.outputs])
if done != False:
target = (reward + self.gamma * np.amax(self.target_network.predict(np.array([next_state]))[0]))
target_f[0][action] = target
targets.append(target_f)
loss = np.mean(self.network.predict(np.array([state]))-target_f)**2
losses.append(loss)
indexes = np.argsort(losses)[:32]
for indx in indexes:
batch.append((self.replay_memory[indx][0], targets[indx]))
return batch
def replay(self,batch):
for state,target in batch:
self.network.fit(np.array([state]), target, epochs=1, verbose=0)
self.epsilon = max(self.epsilon_min, self.epsilon*self.epsilon_decay)
def update_target_network(self):
self.target_network.set_weights(self.network.get_weights())
def save(self):
self.network.save_weights('./DQN.model')
def load(self):
self.network.load_weights('./DQN.model')
env = gym.make('CartPole-v0')
episodes = 1000
agent = agent(env.observation_space.shape[0],env.action_space.n,0.99)
rewards = []
state = env.reset()
for _ in range(500):
action = agent.act(state,1.0)
next_state, reward, win, _ = env.step(action)
agent.remember(state,action,next_state,reward,win)
state = next_state
if win:
state = env.reset()
for e in range(episodes):
print('episode:',e+1)
batch = agent.get_batch()
state = env.reset()
for t in range(400):
action = agent.act(state)
next_state, reward, win, _ = env.step(action)
agent.remember(state,action,next_state,reward,win)
state = next_state
if win:
break
agent.replay(batch)
print('score:',t)
print('epsilon:',agent.epsilon)
print('')
if e%10 == 0:
agent.update_target_network()
agent.save()
rewards.append(t)
plt.plot(list(range(e+1)),rewards)
plt.savefig('./reward.png')
The problem is that the agent gets worse as epsilon decreases, and when epsilon is at its lowest value the agent only gets through 7-9 steps before the pole falls, as seen in the image below. Can someone tell me why my agent isn't learning anything and how to fix it?
Your network output should use a linear activation as opposed to sigmoid, since you are bootstrapping the sum of all future rewards. The loss should be changed to MSE as well, according to the derivation of the TD error in the original paper.
Also, if the experience ended in a terminal state (done == True), the TD target for the action taken should be set equal to the reward (you are currently setting it to 0).
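A rough sketch of those two changes against your code (variable names come from your post; treat it as illustrative rather than a tested drop-in):
def _build_net(self):
    # Same layout as before, but with a linear output (raw Q-values) and MSE loss.
    model = keras.models.Sequential()
    model.add(keras.layers.Dense(10, activation='relu', input_dim=self.inputs))
    model.add(keras.layers.Dense(10, activation='relu'))
    model.add(keras.layers.Dense(10, activation='relu'))
    model.add(keras.layers.Dense(self.outputs, activation='linear'))
    model.compile(optimizer='adam', loss='mse')
    return model

And in get_batch, the terminal-state target:
target_f = np.zeros([1, self.outputs])
if done:
    target_f[0][action] = reward  # terminal: no bootstrapping, the target is just the reward
else:
    target_f[0][action] = reward + self.gamma * np.amax(
        self.target_network.predict(np.array([next_state]))[0])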
Try making the reward = -reward if done. It helps
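In your episode loop that would look roughly like this (a sketch):
action = agent.act(state)
next_state, reward, win, _ = env.step(action)
if win:
    reward = -reward  # penalize the transition that ends the episode
agent.remember(state, action, next_state, reward, win)
state = next_state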
