I tried to implement the simplest Deep Q-Learning algorithm. I think I've implemented it correctly, and I know Deep Q-Learning struggles with divergence, but the reward declines very quickly and the loss diverges. I would be grateful if someone could point out the right hyperparameters, or tell me whether I implemented the algorithm wrong. I've tried a lot of hyperparameter combinations and also varied the complexity of the QNet.
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import collections
import numpy as np
import matplotlib.pyplot as plt
import gym
from torch.nn.modules.linear import Linear
from torch.nn.modules.loss import MSELoss
class ReplayBuffer:
    def __init__(self, max_replay_size, batch_size):
        self.max_replay_size = max_replay_size
        self.batch_size = batch_size
        self.buffer = collections.deque()

    def push(self, *transition):
        if len(self.buffer) == self.max_replay_size:
            self.buffer.popleft()
        self.buffer.append(transition)

    def sample_batch(self):
        indices = np.random.choice(len(self.buffer), self.batch_size, replace = False)
        batch = [self.buffer[index] for index in indices]
        state, action, reward, next_state, done = zip(*batch)
        state = np.array(state)
        action = np.array(action)
        reward = np.array(reward)
        next_state = np.array(next_state)
        done = np.array(done)
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)

class QNet(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(QNet, self).__init__()
        self.linear1 = Linear(in_features = state_dim, out_features = 64)
        self.linear2 = Linear(in_features = 64, out_features = action_dim)

    def forward(self, x):
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        return x
def train(replay_buffer, model, target_model, discount_factor, mse, optimizer):
    state, action, reward, next_state, _ = replay_buffer.sample_batch()
    state, next_state = torch.tensor(state, dtype = torch.float), torch.tensor(next_state, dtype = torch.float)

    # Compute Q Value and Target Q Value
    q_values = model(state).gather(1, torch.tensor(action, dtype = torch.int64).unsqueeze(-1))
    with torch.no_grad():
        max_next_q_values = target_model(next_state).detach().max(1)[0]
        q_target_value = torch.tensor(reward, dtype = torch.float) + discount_factor * max_next_q_values

    optimizer.zero_grad()
    loss = mse(q_values, q_target_value.unsqueeze(1))
    loss.backward()
    optimizer.step()
    return loss.item()
def main():
    # Define Hyperparameters and Parameters
    EPISODES = 10000
    MAX_REPLAY_SIZE = 10000
    BATCH_SIZE = 32
    EPSILON = 1.0
    MIN_EPSILON = 0.05
    DISCOUNT_FACTOR = 0.95
    DECAY_RATE = 0.99
    LEARNING_RATE = 1e-3
    SYNCHRONISATION = 33
    EVALUATION = 32

    # Initialize Environment, Model, Target-Model, Optimizer, Loss Function and Replay Buffer
    env = gym.make("CartPole-v0")
    model = QNet(state_dim = env.observation_space.shape[0], action_dim = env.action_space.n)
    target_model = QNet(state_dim = env.observation_space.shape[0], action_dim = env.action_space.n)
    target_model.load_state_dict(model.state_dict())
    optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)
    mse = MSELoss()
    replay_buffer = ReplayBuffer(max_replay_size = MAX_REPLAY_SIZE, batch_size = BATCH_SIZE)

    while len(replay_buffer) != MAX_REPLAY_SIZE:
        state = env.reset()
        done = False
        while done != True:
            action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)
            replay_buffer.push(state, action, reward, next_state, done)
            state = next_state

    # Begin with the Main Loop where the QNet is trained
    count_until_synchronisation = 0
    count_until_evaluation = 0
    history = {'Episode': [], 'Reward': [], 'Loss': []}
    for episode in range(EPISODES):
        total_reward = 0.0
        total_loss = 0.0
        state = env.reset()
        iterations = 0
        done = False
        while done != True:
            count_until_synchronisation += 1
            count_until_evaluation += 1

            # Take an action
            if np.random.rand(1) < EPSILON:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    output = model(torch.tensor(state, dtype = torch.float)).numpy()
                action = np.argmax(output)

            # Observe new state and reward + store into replay_buffer
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            replay_buffer.push(state, action, reward, next_state, done)
            state = next_state

            if count_until_synchronisation % SYNCHRONISATION == 0:
                target_model.load_state_dict(model.state_dict())

            if count_until_evaluation % EVALUATION == 0:
                loss = train(replay_buffer = replay_buffer, model = model,
                             target_model = target_model, discount_factor = DISCOUNT_FACTOR,
                             mse = mse, optimizer = optimizer)
                total_loss += loss

            iterations += 1

        print(f"Episode {episode} is concluded in {iterations} iterations with a total reward of {total_reward}")

        if EPSILON > MIN_EPSILON:
            EPSILON *= DECAY_RATE

        history['Episode'].append(episode)
        history['Reward'].append(total_reward)
        history['Loss'].append(total_loss)
    # Plot the Loss + Reward per Episode
    fig, ax = plt.subplots(figsize = (10, 6))
    ax.plot(history['Episode'], history['Reward'], label = "Reward")
    ax.set_xlabel('Episodes', fontsize = 15)
    ax.set_ylabel('Total Reward per Episode', fontsize = 15)
    plt.legend(prop = {'size': 15})
    plt.show()

    fig, ax = plt.subplots(figsize = (10, 6))
    ax.plot(history['Episode'], history['Loss'], label = "Loss")
    ax.set_xlabel('Episodes', fontsize = 15)
    ax.set_ylabel('Total Loss per Episode', fontsize = 15)
    plt.legend(prop = {'size': 15})
    plt.show()

if __name__ == "__main__":
    main()
Your code looks fine; I think your hyperparameters are not ideal. I would change two, potentially three, things:
If I'm not mistaken, you update your target net every 33 steps (SYNCHRONISATION = 33). That is far too often, I think. In the original paper by Mnih et al., they do a hard update every 10k steps. Think about it: the target net is used to calculate the loss, so you essentially change the loss function every 33 steps, which can be more than once per episode.
Your replay buffer size is pretty small. I would set it to 100k or 1M, even if that is longer than what you intend to train for. If the replay buffer is too small, you will lose the older transitions, which can cause your network to "forget" things it already learned. Not sure how dramatic this is for CartPole, but maybe worth trying.
The learning rate could also be lower; I am using 1e-4 with RMSProp. Generally, changing the optimizer can also yield different results.
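A minimal sketch of how those suggestions map onto the hyperparameters in your main() (the values are just the ones mentioned above, not tuned, and the RMSProp switch is optional):

    # Illustrative settings only, following the suggestions above.
    MAX_REPLAY_SIZE = 100_000      # larger buffer so old transitions are not dropped so quickly
    SYNCHRONISATION = 10_000       # hard target-net update only every 10k environment steps
    LEARNING_RATE = 1e-4           # lower learning rate
    optimizer = optim.RMSprop(model.parameters(), lr = LEARNING_RATE)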
Hope this helps, good luck :)
Your code looks fine and well written, and the hyperparameters seem reasonable (except maybe for the target update interval, which may be too short), but I think the Q network is quite small with a single hidden dense layer.
A deeper model could likely do better (probably not more than 3-4 layers though), but you said that you already tried different network sizes.
Another thing that comes to mind is the target update. You are doing a hard update every n steps; a soft update may help a bit, but I wouldn't count on it.
You can also try lowering the learning rate a bit, but I imagine you already did that.
My suggestions are:
try less frequent target updates
try a larger network (deeper, something like 2-3 dense layers with 32 nodes each), if you haven't already
look into soft target updates (Polyak averaging and so on); see the sketch after this list
try your implementation in other simple gym environments and check if its behavior is still the same.
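For reference, here is a minimal sketch of a soft (Polyak-averaged) target update in PyTorch, written against the model/target_model pair from the question; tau is a hypothetical smoothing coefficient you would have to tune:

    # Soft target update: target <- tau * online + (1 - tau) * target.
    # Call this after every training step instead of the periodic hard load_state_dict.
    def soft_update(model, target_model, tau = 0.005):
        with torch.no_grad():
            for param, target_param in zip(model.parameters(), target_model.parameters()):
                target_param.data.mul_(1.0 - tau)
                target_param.data.add_(tau * param.data)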
Sadly DQN is not ideal and won't converge for many problems, but it should be able to solve cartpole.
This answer might be a little late for the OP, but the implementation does not take terminal values into account. In some environments (e.g. CartPole), the only signal that the agent is doing something wrong comes from the terminal value. In CartPole the agent gets a +1 reward for every step it takes, but it has no idea about time, so if you don't create zero-value targets for terminal states, the value estimate tries to converge towards an infinite sum of discounted +1 rewards for every state.
The simplest way to include the zero values for terminal states is to also sample the batch of done flags and multiply the bootstrapped part of the target by (1 - done), so that terminal transitions only keep their immediate reward.
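A minimal sketch of that change applied to the train() function from the question (only the sampling and the target computation differ; everything else stays the same):

    def train(replay_buffer, model, target_model, discount_factor, mse, optimizer):
        state, action, reward, next_state, done = replay_buffer.sample_batch()
        state = torch.tensor(state, dtype = torch.float)
        next_state = torch.tensor(next_state, dtype = torch.float)
        reward = torch.tensor(reward, dtype = torch.float)
        done = torch.tensor(done, dtype = torch.float)  # 1.0 for terminal transitions, 0.0 otherwise

        q_values = model(state).gather(1, torch.tensor(action, dtype = torch.int64).unsqueeze(-1))
        with torch.no_grad():
            max_next_q_values = target_model(next_state).max(1)[0]
            # Zero out the bootstrap term for terminal states so their target is just the reward.
            q_target_value = reward + discount_factor * max_next_q_values * (1.0 - done)

        optimizer.zero_grad()
        loss = mse(q_values, q_target_value.unsqueeze(1))
        loss.backward()
        optimizer.step()
        return loss.item()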
Related
I'm wondering why every new observation of my Pong gym environment takes so long to load onto my GPU.
I am attempting to train a Policy Gradient model to play Pong. I have included my entire code for this model below, but my question is about why it takes so long to load and process the matrices on CUDA. The Atari Pong gym environment cannot run on CUDA as far as I know, so I am converting NumPy arrays to PyTorch tensors at each reset observation and each step observation.
The observation of the Pong screen is 6400 pixels, or an 80 x 80 tensor.
The code below is my adaptation of this code, which is purely in Numpy and Python. THIS NUMPY VERSION RUNS FASTER ON MY CPU THAN MY CODE, which I attempted to rewrite for Pytorch to run on my GPU. I'm not posting to ask if I adapted every part of the original code well. I am only posting to ask why the tensors and model are so slow to get on CUDA and to run on CUDA.
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
from torch.optim import Adam
import numpy as np
import random
import time
import gym
from gym.spaces import Discrete, Box
device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
# hyperparameters
H = 200 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint?
render = False
def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
    # Build a feedforward neural network.
    layers = []
    for j in range(len(sizes)-1):
        act = activation if j < len(sizes)-2 else output_activation
        layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
    return nn.Sequential(*layers)
def reward_to_go(rews):
    n = len(rews)
    rtgs = np.zeros_like(rews)
    for i in reversed(range(n)):
        rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0)
    discounted_r = np.zeros_like(rtgs)
    running_add = 0
    for t in reversed(range(0, rtgs.size)):
        if rtgs[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
        running_add = running_add * gamma + rtgs[t]
        discounted_r[t] = running_add
    discounted_epr = discounted_r.copy()
    discounted_epr -= np.mean(discounted_epr)
    discounted_epr /= np.std(discounted_epr)
    return discounted_epr
from ale_py import ALEInterface
ale = ALEInterface()
from ale_py.roms import Pong
ale.loadROM(Pong)
env = gym.make("ALE/Pong-v5")
lr=1e-2
epochs=50
batch_size=5000
render=False
# make environment, check spaces, get obs / act dims
assert isinstance(env.observation_space, Box), \
    "This example only works for envs with continuous state spaces."
assert isinstance(env.action_space, Discrete), \
    "This example only works for envs with discrete action spaces."
obs_dim = env.observation_space.shape[0]
n_acts = env.action_space.n
# make core of policy network
logits_net = mlp([6400, 3200, 1600, 2]).cuda(device)
def prepro(I):
    I = I[35:195] # crop
    I = I[::2,::2,0] # downsample by factor of 2
    I[I == 144] = 0 # erase background (background type 1)
    I[I == 109] = 0 # erase background (background type 2)
    I[I != 0] = 1 # everything else (paddles, ball) just set to 1
    return I.ravel()
# make function to compute action distribution
def get_policy(obs):
    logits = logits_net(obs)
    return Categorical(logits=logits)

# make action selection function (outputs int actions, sampled from policy)
def get_action(obs):
    return get_policy(obs).sample().item()

# make loss function whose gradient, for the right data, is policy gradient
def compute_loss(obs, act, weights):
    logp = get_policy(obs).log_prob(act)
    return -(logp * weights).mean()
# make optimizer
optimizer = Adam(logits_net.parameters(), lr=lr)
# for training policy
def train_one_epoch():
    prev_x = None

    # make some empty lists for logging.
    batch_obs = []      # for observations
    batch_acts = []     # for actions
    batch_weights = []  # for reward-to-go weighting in policy gradient
    batch_rets = []     # for measuring episode returns
    batch_lens = []     # for measuring episode lengths

    # reset episode-specific variables
    obs = env.reset()   # first obs comes from starting distribution
    done = False        # signal from environment that episode is over
    ep_rews = []        # list for rewards accrued throughout ep

    # render first episode of each epoch
    finished_rendering_this_epoch = False

    # collect experience by acting in the environment with current policy
    while True:
        # preprocess the observation, set input to network to be difference image
        cur_x = prepro(torch.as_tensor(obs, dtype=torch.float32, device=device))
        x = cur_x - prev_x if prev_x is not None else np.zeros(6400)
        prev_x = cur_x
        act = get_action(torch.as_tensor(x, dtype=torch.float32, device=device))

        # rendering
        if (not finished_rendering_this_epoch) and render:
            env.render()

        # save obs
        batch_obs.append(obs.copy())

        # act in the environment
        obs, rew, done, _ = env.step(act)
        cur_x = prepro(torch.as_tensor(obs, dtype=torch.float32, device=device))
        x = cur_x - prev_x
        prev_x = cur_x
        act = get_action(torch.as_tensor(x, dtype=torch.float32, device=device))

        # save action, reward
        batch_acts.append(act)
        ep_rews.append(rew)

        if done:
            print("done one")
            # if episode is over, record info about episode
            ep_ret, ep_len = sum(ep_rews), len(ep_rews)
            batch_rets.append(ep_ret)
            batch_lens.append(ep_len)

            # the weight for each logprob(a_t|s_t) is reward-to-go from t
            batch_weights += list(reward_to_go(ep_rews))

            # reset episode-specific variables
            obs, done, ep_rews = env.reset(), False, []

            # won't render again this epoch
            finished_rendering_this_epoch = True

            # end experience loop if we have enough of it
            if len(batch_obs) > batch_size:
                break

    # take a single policy gradient update step
    optimizer.zero_grad()
    batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32),
                              act=torch.as_tensor(batch_acts, dtype=torch.int32),
                              weights=torch.as_tensor(batch_weights, dtype=torch.float32)
                              )
    batch_loss.backward()
    optimizer.step()
    return batch_loss, batch_rets, batch_lens
# training loop
for i in range(1):
    batch_loss, batch_rets, batch_lens = train_one_epoch()
    print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
          (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))
I tried to write a Policy Gradient algorithm for the video game Pong.
Here's the Code:
import tensorflow as tf
import gym
import numpy as np
import matplotlib.pyplot as plt
from os import getcwd
num_episodes = 1000
learning_rate = 0.01
rewards = []
env_name = 'Pong-v0'
env = gym.make(env_name)
x = tf.placeholder(tf.float32,(None,)+env.observation_space.shape)
y = tf.placeholder(tf.float32,(None,env.action_space.n))
def net(x):
    layer1 = tf.layers.flatten(x)
    layer2 = tf.layers.dense(layer1,200,activation=tf.nn.softmax)
    layer3 = tf.layers.dense(layer2,env.action_space.n,activation=tf.nn.softmax)
    return layer3
logits = net(x)
loss = tf.losses.sigmoid_cross_entropy(y,logits)
train = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
saver = tf.train.Saver()
init = tf.global_variables_initializer()
sess = tf.Session()
with tf.device('/device:GPU:0'):
    sess.run(init)
    for episode in range(num_episodes):
        print('episode:',episode+1)
        total_reward = 0
        losses = []
        training_data = []
        observation = env.reset()
        while True:
            if max(0.1, (episode+1)/num_episodes) > np.random.uniform():
                probs = sess.run(logits,feed_dict={x:[observation]})[0]
                action = np.argmax(probs)
            else:
                action = env.action_space.sample()
            onehot = np.zeros(env.action_space.n)
            onehot[action] = 1
            training_data.append([observation,onehot])
            observation, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        if total_reward >= 0:
            learning_rate = 0.01
        else:
            learning_rate = -0.01
        for sample in training_data:
            l,_ = sess.run([loss,train],feed_dict={x:[sample[0]], y:[sample[1]]})
            losses.append(l)
            print('loss:',l)
        print('average loss:',sum(losses)/len(losses))
        saver.save(sess,getcwd()+'/model.ckpt')
        rewards.append(total_reward)

plt.plot(range(episode+1),rewards)
plt.ylabel('total reward')
plt.xlabel('episodes')
plt.savefig(getcwd()+'/reward_plot.png')
But after I trained my network, the plot the script made seemed to suggest that the network got worse towards the end. Also, during the last episode the loss was the same for all training examples (~0.68), and when I try to test the network, the player's paddle just sits there motionless. Is there any way I can improve my code?
I would ask you to familiarize yourself with how to code neural networks using TensorFlow, because that is where the problem lies. You provide activation=tf.nn.softmax in both layers, but softmax should only appear in the terminal layer (since you are trying to find the maximum action probability). You can change it to tf.nn.relu in the second layer. There is a bigger problem with the learning_rate:
if total_reward >= 0:
    learning_rate = 0.01
else:
    learning_rate = -0.01
Negative learning rate makes absolutely no sense. You want the learning rate to be positive (you can use a constant 0.01 for now).
Also, another comment: you have not mentioned the observation_space shape, but I am going to assume it is a 2D matrix. You could then reshape it before feeding it into x, so you would not need to use tf.layers.flatten.
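A minimal sketch of those two fixes against the network and optimizer definitions from the question (illustrative only; the rest of the script stays unchanged):

    # ReLU in the hidden layer; keep softmax only on the output layer.
    def net(x):
        layer1 = tf.layers.flatten(x)
        layer2 = tf.layers.dense(layer1, 200, activation=tf.nn.relu)
        layer3 = tf.layers.dense(layer2, env.action_space.n, activation=tf.nn.softmax)
        return layer3

    # A single, constant, positive learning rate (drop the reward-dependent sign flip).
    learning_rate = 0.01
    train = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)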
I am trying to create a reinforcement learning agent that can buy, sell or hold stock positions. The issue I'm having is that even after over 2000 episodes, the agent still cannot learn when to buy, sell or hold. Here is an image from the 2100th episode detailing what I mean: the agent will not take any action unless it is random.
The agent learns using a replay memory and I have double and triple checked that there are no errors. Here is the code for the agent:
import numpy as np
import tensorflow as tf
import random
from collections import deque
from .agent import Agent
class Agent(Agent):
    def __init__(self, state_size = 7, window_size = 1, action_size = 3,
                 batch_size = 32, gamma=.95, epsilon=.95, epsilon_decay=.95, epsilon_min=.01,
                 learning_rate=.001, is_eval=False, model_name="", stock_name="", episode=1):
        """
        state_size: Size of the state coming from the environment
        action_size: How many decisions the algo will make in the end
        gamma: Decay rate to discount future reward
        epsilon: Rate of randomly decided action
        epsilon_decay: Rate of decrease in epsilon
        epsilon_min: The lowest epsilon can get (limit to the randomness)
        learning_rate: Progress of neural net in each iteration
        episodes: How many times data will be run through
        """
        self.state_size = state_size
        self.window_size = window_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.learning_rate = learning_rate
        self.is_eval = is_eval
        self.model_name = model_name
        self.stock_name = stock_name
        self.q_values = []
        self.layers = [150, 150, 150]

        tf.reset_default_graph()
        self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement = True))
        self.memory = deque()

        if self.is_eval:
            model_name = stock_name + "-" + str(episode)
            self._model_init()
            # "models/{}/{}/{}".format(stock_name, model_name, model_name + "-" + str(episode) + ".meta")
            self.saver = tf.train.Saver()
            self.saver.restore(self.sess, tf.train.latest_checkpoint("models/{}/{}".format(stock_name, model_name)))
            # self.graph = tf.get_default_graph()
            # names=[tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
            # self.X_input = self.graph.get_tensor_by_name("Inputs/Inputs:0")
            # self.logits = self.graph.get_tensor_by_name("Output/Add:0")
        else:
            self._model_init()
            self.sess.run(self.init)
            self.saver = tf.train.Saver()
            path = "models/{}/6".format(self.stock_name)
            self.writer = tf.summary.FileWriter(path)
            self.writer.add_graph(self.sess.graph)
    def _model_init(self):
        """
        Init tensorflow graph vars
        """
        # (1,10,9)
        with tf.device("/device:GPU:0"):
            with tf.name_scope("Inputs"):
                self.X_input = tf.placeholder(tf.float32, [None, self.state_size], name="Inputs")
                self.Y_input = tf.placeholder(tf.float32, [None, self.action_size], name="Actions")
                self.rewards = tf.placeholder(tf.float32, [None, ], name="Rewards")

            # self.lstm_cells = [tf.contrib.rnn.GRUCell(num_units=layer)
            #                    for layer in self.layers]
            # lstm_cell = tf.contrib.rnn.LSTMCell(num_units=n_neurons, use_peepholes=True)
            # gru_cell = tf.contrib.rnn.GRUCell(num_units=n_neurons)
            # self.multi_cell = tf.contrib.rnn.MultiRNNCell(self.lstm_cells)
            # self.outputs, self.states = tf.nn.dynamic_rnn(self.multi_cell, self.X_input, dtype=tf.float32)
            # self.top_layer_h_state = self.states[-1]
            # with tf.name_scope("Output"):
            #     self.out_weights=tf.Variable(tf.truncated_normal([self.layers[-1], self.action_size]))
            #     self.out_bias=tf.Variable(tf.zeros([self.action_size]))
            #     self.logits = tf.add(tf.matmul(self.top_layer_h_state,self.out_weights), self.out_bias)

            fc1 = tf.contrib.layers.fully_connected(self.X_input, 512, activation_fn=tf.nn.relu)
            fc2 = tf.contrib.layers.fully_connected(fc1, 512, activation_fn=tf.nn.relu)
            fc3 = tf.contrib.layers.fully_connected(fc2, 512, activation_fn=tf.nn.relu)
            fc4 = tf.contrib.layers.fully_connected(fc3, 512, activation_fn=tf.nn.relu)
            self.logits = tf.contrib.layers.fully_connected(fc4, self.action_size, activation_fn=None)

            with tf.name_scope("Cross_Entropy"):
                self.loss_op = tf.losses.mean_squared_error(self.Y_input, self.logits)
                self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate)
                self.train_op = self.optimizer.minimize(self.loss_op)
            # self.correct = tf.nn.in_top_k(self.logits, self.Y_input, 1)
            # self.accuracy = tf.reduce_mean(tf.cast(self., tf.float32))

        tf.summary.scalar("Reward", tf.reduce_mean(self.rewards))
        tf.summary.scalar("MSE", self.loss_op)
        # Merge all of the summaries
        self.summ = tf.summary.merge_all()
        self.init = tf.global_variables_initializer()
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon and not self.is_eval:
            prediction = random.randrange(self.action_size)
            if prediction == 1 or prediction == 2:
                print("Random")
            return prediction
        act_values = self.sess.run(self.logits, feed_dict={self.X_input: state.reshape((1, self.state_size))})
        if np.argmax(act_values[0]) == 1 or np.argmax(act_values[0]) == 2:
            pass
        return np.argmax(act_values[0])

    def replay(self, time, episode):
        print("Replaying")
        mini_batch = []
        l = len(self.memory)
        for i in range(l - self.batch_size + 1, l):
            mini_batch.append(self.memory[i])

        mean_reward = []
        x = np.zeros((self.batch_size, self.state_size))
        y = np.zeros((self.batch_size, self.action_size))
        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            target = reward
            if not done:
                self.target = reward + self.gamma * np.amax(self.sess.run(self.logits, feed_dict = {self.X_input: next_state.reshape((1, self.state_size))})[0])
            current_q = (self.sess.run(self.logits, feed_dict={self.X_input: state.reshape((1, self.state_size))}))
            current_q[0][action] = self.target
            x[i] = state
            y[i] = current_q.reshape((self.action_size))
            mean_reward.append(self.target)

        #target_f = np.array(target_f).reshape(self.batch_size - 1, self.action_size)
        #target_state = np.array(target_state).reshape(self.batch_size - 1, self.window_size, self.state_size)
        _, c, s = self.sess.run([self.train_op, self.loss_op, self.summ], feed_dict={self.X_input: x, self.Y_input: y, self.rewards: mean_reward}) # Add self.summ into the sess.run for tensorboard
        self.writer.add_summary(s, global_step=(episode+1)/(time+1))

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
Once the replay memory is greater than the batch size, it runs the replay function. The code might look a little messy, since I have been messing with it for days now trying to figure this out. Here is a screenshot of the MSE from TensorBoard.
As you can see, by the 200th episode the MSE dies out to 0 or almost 0. I'm stumped! I have no idea what is going on. Please help me figure this out. The code is posted here to see the whole thing, including the train and eval files. The main agent I have been working on is LSTM.py in the agents folder. Thanks!
As discussed in the comments of the question, this seems to be a problem of the learning rate decaying too quickly.
Essentially, with every episode you multiply your learning rate by some factor j, which means that your learning rate after n episodes/epochs will be equal to lr = initial_lr * j^n. In our example, the decay is set to 0.95, which means that after only a few iterations the learning rate will already have dropped significantly. Subsequently, the updates will only perform minute corrections, and not "learn" very significant changes anymore.
This leads to the question: why does decay make sense at all? Generally speaking, we want to reach a local optimum (potentially a very narrow one). To do so, we try to get "relatively close" to such a minimum, and then only take smaller steps that lead us to this optimum. If we just continued with the original learning rate, we might simply step over the optimal solution every time, and never reach our goal.
Visually, the problem can be summed up by this graphic:
Another method besides exponential decay is to simply decrease the learning rate by a fixed amount once the algorithm stops making any significant updates. This avoids the problem of the learning rate diminishing purely because many episodes have passed.
In your case specifically, a higher value for the decay (i.e. a slower decay) seemed to have helped already quite a lot.
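As a quick back-of-the-envelope illustration of why the decay value matters so much (my own numbers, only meant to show the scale of the effect):

    initial_lr = 0.001
    for j in (0.95, 0.999):           # fast vs. slow decay factor
        for n in (50, 200, 1000):     # number of episodes
            print(j, n, initial_lr * j ** n)
    # With j = 0.95 the rate is ~7.7e-5 after 50 episodes and ~3.5e-8 after 200 episodes,
    # i.e. effectively zero; with j = 0.999 it is still ~8.2e-4 after 200 episodes.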
The Q value in reinforcement learning does not represent 'reward' but 'return', which is the sum of the current reward and the discounted future rewards. When your model enters this 'dead end' of all-zero actions, the rewards will be zero based on your setting. Then, after a period of time, your replay memory will be full of memories of 'an action of zero gives a reward of zero', so no matter how you update your model it cannot get out of this dead end.
As #dennlinger said, you may increase your epsilon to let your model gather some fresh memories to update from; you could also use prioritized experience replay to train on 'useful' experiences (a minimal sketch of the sampling idea is below).
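For illustration only, here is a stripped-down sketch of proportional prioritized sampling (no importance-sampling weights; the class and parameter names are my own), where transitions are drawn with probability proportional to their last absolute TD error instead of uniformly:

    import numpy as np

    class ProportionalReplay:
        # Simplified prioritized replay: priority = (|TD error| + eps) ** alpha.
        def __init__(self, capacity, alpha=0.6, eps=1e-6):
            self.capacity, self.alpha, self.eps = capacity, alpha, eps
            self.data, self.priorities = [], []

        def push(self, transition, td_error=1.0):
            if len(self.data) == self.capacity:
                self.data.pop(0)
                self.priorities.pop(0)
            self.data.append(transition)
            self.priorities.append((abs(td_error) + self.eps) ** self.alpha)

        def sample(self, batch_size):
            probs = np.array(self.priorities) / np.sum(self.priorities)
            idx = np.random.choice(len(self.data), batch_size, p=probs)
            return [self.data[i] for i in idx], idx

        def update_priorities(self, idx, td_errors):
            # Refresh priorities after the transitions have been replayed.
            for i, err in zip(idx, td_errors):
                self.priorities[i] = (abs(err) + self.eps) ** self.alpha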
However, I suggest you look at the environment itself first. Your model outputs zeros because there are no better choices; is that true? As you said you were trading stocks, are you sure there is enough information to lead to a strategy with an average reward larger than zero? I think you need to think this through before doing any tuning. For example, if the stock moves up or down with a pure random 50/50 chance, then you will never find a strategy whose average reward is larger than zero.
The reinforcement learning agent might already have found the best policy, even though it's not the one you want.
Maybe my question will seem stupid.
I'm studying the Q-learning algorithm. In order to better understand it, I'm trying to remake the TensorFlow code of this FrozenLake example into Keras code.
My code:
import gym
import numpy as np
import random
from keras.layers import Dense
from keras.models import Sequential
from keras import backend as K
import matplotlib.pyplot as plt
%matplotlib inline
env = gym.make('FrozenLake-v0')
model = Sequential()
model.add(Dense(16, activation='relu', kernel_initializer='uniform', input_shape=(16,)))
model.add(Dense(4, activation='softmax', kernel_initializer='uniform'))
def custom_loss(yTrue, yPred):
    return K.sum(K.square(yTrue - yPred))
model.compile(loss=custom_loss, optimizer='sgd')
# Set learning parameters
y = .99
e = 0.1
#create lists to contain total rewards and steps per episode
jList = []
rList = []
num_episodes = 2000
for i in range(num_episodes):
    current_state = env.reset()
    rAll = 0
    d = False
    j = 0
    while j < 99:
        j+=1
        current_state_Q_values = model.predict(np.identity(16)[current_state:current_state+1], batch_size=1)
        action = np.reshape(np.argmax(current_state_Q_values), (1,))
        if np.random.rand(1) < e:
            action[0] = env.action_space.sample() #random action
        new_state, reward, d, _ = env.step(action[0])
        rAll += reward
        jList.append(j)
        rList.append(rAll)
        new_Qs = model.predict(np.identity(16)[new_state:new_state+1], batch_size=1)
        max_newQ = np.max(new_Qs)
        targetQ = current_state_Q_values
        targetQ[0,action[0]] = reward + y*max_newQ
        model.fit(np.identity(16)[current_state:current_state+1], targetQ, verbose=0, batch_size=1)
        current_state = new_state
        if d == True:
            #Reduce chance of random action as we train the model.
            e = 1./((i/50) + 10)
            break
print("Percent of succesful episodes: " + str(sum(rList)/num_episodes) + "%")
When I run it, it doesn't work well: Percent of succesful episodes: 0.052%
plt.plot(rList)
The original TensorFlow code does much better: Percent of succesful episodes: 0.352%
plt.plot(rList)
What have I done wrong?
Besides setting use_bias=False as #Maldus mentioned in the comments, another thing you can try is to start with a higher epsilon value (e.g. 0.5 or 0.75). A trick might be to only decrease the epsilon value if you reach the goal, i.e. don't decrease epsilon at the end of every episode. That way your player can keep exploring the map randomly until it starts to converge on a good route, and only then reduce the epsilon parameter.
I've actually implemented a similar model in keras in this gist using Convolutional layers instead of Dense layers. Managed to get it to work in under 2000 episodes. Might be of some help to others :)
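A minimal sketch of that kind of epsilon schedule, written as a hypothetical helper (the floor and decay factor are illustrative, not tuned):

    def update_epsilon(e, reward, min_e=0.05, decay=0.95):
        # Decay epsilon only when the episode actually reached the goal
        # (reward > 0 in FrozenLake); otherwise keep exploring at the same rate.
        if reward > 0:
            return max(min_e, e * decay)
        return e

    # usage inside the episode loop, in place of `e = 1./((i/50) + 10)`:
    # e = update_epsilon(e, reward)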
I am trying to create a policy network that plays Pong. When run, the GPU is exhausted after only three or four games, as if the data size is growing, which it shouldn't be. The program plays and stores data quickly when the optimizer run line, sess.run(optimizer, feed_dict=feed_dict), is commented out. So I believe my handling of the optimizer and the amount of data is the problem? I am new to TensorFlow and machine learning in general, so any help would be greatly appreciated. The following is a snippet of code (non-working) that gives the basic idea:
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    legal_actions = [0,3,4]
    batch_dataSingle = np.zeros((1,210,160,3))
    while not gameOver:
        indexNum = 0
        raw_RGB = [] #in format (210 x 160 x 3). This is saved so that the parameters can be reevaluated considering the entire point of play. It is reset after each point.
        reward = 0
        while reward == 0: #Play game and store screenshots until point is scored
            raw_RGB.append(ale.getScreenRGB())
            batch_dataSingle[0,:,:,:] = raw_RGB[screenNum]
            screenNum = screenNum+1
            feed_dict = {train_data_node: batch_dataSingle}
            proj_out = (sess.run(logits, feed_dict=feed_dict))
            indexMax = np.argmax(proj_out[0]) #Gives optimal action from weights
            for i in range(4):
                reward = result from running with projected action
                if reward != 0:
                    break
            #We continue to choose a new action based on the current board until a reward is received
        batch_labels = reward
        n = len(raw_RGB)
        batch_data = np.zeros((n,210,160,3))
        for i in range(n):
            batch_data[i,:,:,:] = raw_RGB[i]
        feed_dict = {train_data_node: batch_data, reward_node:batch_labels}
        sess.run(optimizer, feed_dict=feed_dict)
Optimizer algorithm:
logits = model(train_data_node, True)
loss = tf.log(tf.reduce_max(logits)) * reward_node * -1
batch = tf.Variable(0, dtype=data_type())
learning_rate = .1
optimizer = tf.train.MomentumOptimizer(learning_rate,0.9).minimize(loss, global_step=batch)