I'm trying to recreate a very simple example of Policy Gradient from its original source, Andrej Karpathy's blog. In that article, you will find an example with CartPole and Policy Gradient using a list of weights and softmax activation. Here is my recreated, very simple example of the CartPole policy gradient, which works perfectly.
import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
import copy
NUM_EPISODES = 4000
LEARNING_RATE = 0.000025
GAMMA = 0.99
# noinspection PyMethodMayBeStatic
class Agent:
def __init__(self):
self.poly = PolynomialFeatures(1)
self.w = np.random.rand(5, 2)
def policy(self, state):
z = state.dot(self.w)
exp = np.exp(z)
return exp/np.sum(exp)
def __softmax_grad(self, softmax):
s = softmax.reshape(-1,1)
return np.diagflat(s) - np.dot(s, s.T)
def grad(self, probs, action, state):
dsoftmax = self.__softmax_grad(probs)[action,:]
dlog = dsoftmax / probs[0,action]
grad = state.T.dot(dlog[None,:])
return grad
def update_with(self, grads, rewards):
for i in range(len(grads)):
            # Loop through everything that happened in the episode
# and update towards the log policy gradient times **FUTURE** reward
total_grad_effect = 0
for t, r in enumerate(rewards[i:]):
total_grad_effect += r * (GAMMA ** r)
self.w += LEARNING_RATE * grads[i] * total_grad_effect
print("Grads update: " + str(np.sum(grads[i])))
def main(argv):
env = gym.make('CartPole-v0')
np.random.seed(1)
agent = Agent()
complete_scores = []
for e in range(NUM_EPISODES):
state = env.reset()[None, :]
state = agent.poly.fit_transform(state)
rewards = []
grads = []
score = 0
while True:
probs = agent.policy(state)
action_space = env.action_space.n
action = np.random.choice(action_space, p=probs[0])
next_state, reward, done,_ = env.step(action)
next_state = next_state[None,:]
next_state = agent.poly.fit_transform(next_state.reshape(1, 4))
grad = agent.grad(probs, action, state)
grads.append(grad)
rewards.append(reward)
score += reward
state = next_state
if done:
break
agent.update_with(grads, rewards)
complete_scores.append(score)
env.close()
plt.plot(np.arange(NUM_EPISODES),
complete_scores)
plt.savefig('image1.png')
if __name__ == '__main__':
main(None)
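For reference, the update that update_with is aiming at (per its comments and the references listed further down) is the standard REINFORCE rule. As a sketch in standard notation, with $w$ standing for self.w and $G_t$ the discounted future reward:

$$\Delta w \;=\; \alpha \,\nabla_w \log \pi_w(a_t \mid s_t)\, G_t, \qquad G_t \;=\; \sum_{k \ge t} \gamma^{\,k-t}\, r_k$$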
Question
I'm trying to do almost the same example, but with sigmoid activation (just for simplicity). That is all I need to do: switch the activation in the model from softmax to sigmoid, which should work (based on the explanation below). But my Policy Gradient model doesn't learn anything and stays random. Any suggestions?
import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
NUM_EPISODES = 4000
LEARNING_RATE = 0.000025
GAMMA = 0.99
# noinspection PyMethodMayBeStatic
class Agent:
def __init__(self):
self.poly = PolynomialFeatures(1)
self.w = np.random.rand(5, 1) - 0.5
# Our policy that maps state to action parameterized by w
# noinspection PyShadowingNames
def policy(self, state):
z = np.sum(state.dot(self.w))
return self.sigmoid(z)
def sigmoid(self, x):
s = 1 / (1 + np.exp(-x))
return s
def sigmoid_grad(self, sig_x):
return sig_x * (1 - sig_x)
def grad(self, probs, action, state):
dsoftmax = self.sigmoid_grad(probs)
dlog = dsoftmax / probs
grad = state.T.dot(dlog)
grad = grad.reshape(5, 1)
return grad
def update_with(self, grads, rewards):
if len(grads) < 50:
return
for i in range(len(grads)):
# Loop through everything that happened in the episode
# and update towards the log policy gradient times **FUTURE** reward
total_grad_effect = 0
for t, r in enumerate(rewards[i:]):
total_grad_effect += r * (GAMMA ** r)
self.w += LEARNING_RATE * grads[i] * total_grad_effect
def main(argv):
env = gym.make('CartPole-v0')
np.random.seed(1)
agent = Agent()
complete_scores = []
for e in range(NUM_EPISODES):
state = env.reset()[None, :]
state = agent.poly.fit_transform(state)
rewards = []
grads = []
score = 0
while True:
probs = agent.policy(state)
action_space = env.action_space.n
action = np.random.choice(action_space, p=[1 - probs, probs])
next_state, reward, done, _ = env.step(action)
next_state = next_state[None, :]
next_state = agent.poly.fit_transform(next_state.reshape(1, 4))
grad = agent.grad(probs, action, state)
grads.append(grad)
rewards.append(reward)
score += reward
state = next_state
if done:
break
agent.update_with(grads, rewards)
complete_scores.append(score)
env.close()
plt.plot(np.arange(NUM_EPISODES),
complete_scores)
plt.savefig('image1.png')
if __name__ == '__main__':
main(None)
Plotting the scores shows that learning stays random; tuning the hyperparameters doesn't help. A sample image is below.
References:
Deep Reinforcement Learning: Pong from Pixels
An introduction to Policy Gradients with Cartpole and Doom
Deriving Policy Gradients and Implementing REINFORCE
Machine Learning Trick of the Day (5): Log Derivative Trick
UPDATE
It seems the answer below does some work, judging from the graph. But it is not the log probability, and not even the gradient of the policy, and it changes the whole purpose of policy-gradient RL. Please check the references above. Following the image, we get the next statement.
I need to take the gradient of the log of my policy (which is simply weights and a sigmoid activation). Please let me know in case of any questions.
The problem is with the grad method.
def grad(self, probs, action, state):
dsoftmax = self.sigmoid_grad(probs)
dlog = dsoftmax / probs
grad = state.T.dot(dlog)
grad = grad.reshape(5, 1)
return grad
In the original code, softmax was used along with the cross-entropy loss function. When you switch the activation to sigmoid, the proper loss function becomes binary cross-entropy. Now, the purpose of the grad method is to calculate the gradient of the loss function w.r.t. the weights. Sparing the details, the proper gradient is given by (probs - action) * state in the terminology of your program. The last thing is to add a minus sign: we want to maximize the negative of the loss function.
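For completeness, a quick sketch of where that expression comes from (writing $p = \sigma(w^\top s)$ for the probability of action 1 and $a \in \{0,1\}$ for the taken action):

$$L = -\big[a \log p + (1-a)\log(1-p)\big], \qquad \frac{\partial L}{\partial w} = (p - a)\,s,$$

so the negative of this gradient, $(a - p)\,s$, is exactly $\nabla_w \log \pi_w(a \mid s)$ for a sigmoid policy, which is what REINFORCE ascends.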
The proper grad method is thus:
def grad(self, probs, action, state):
grad = state.T.dot(probs - action)
return -grad
Another change that you might want to make is to increase the learning rate.
LEARNING_RATE = 0.0001 and NUM_EPISODES = 5000 will produce the following plot:
The convergence will be much faster if the weights are initialized using a Gaussian distribution with zero mean and small variance:
def __init__(self):
self.poly = PolynomialFeatures(1)
self.w = np.random.randn(5, 1) * 0.01
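One way to read why this helps (my interpretation, not stated in the original answer): with small initial weights, $w^\top s$ stays close to zero, so the sigmoid outputs probabilities near 0.5; both actions keep being explored from the first episodes, and the policy is not saturated toward one action before any learning has happened.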
UPDATE
Added complete code to reproduce the results:
import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
NUM_EPISODES = 5000
LEARNING_RATE = 0.0001
GAMMA = 0.99
# noinspection PyMethodMayBeStatic
class Agent:
def __init__(self):
self.poly = PolynomialFeatures(1)
self.w = np.random.randn(5, 1) * 0.01
# Our policy that maps state to action parameterized by w
# noinspection PyShadowingNames
def policy(self, state):
z = np.sum(state.dot(self.w))
return self.sigmoid(z)
def sigmoid(self, x):
s = 1 / (1 + np.exp(-x))
return s
def sigmoid_grad(self, sig_x):
return sig_x * (1 - sig_x)
def grad(self, probs, action, state):
grad = state.T.dot(probs - action)
return -grad
def update_with(self, grads, rewards):
if len(grads) < 50:
return
for i in range(len(grads)):
# Loop through everything that happened in the episode
# and update towards the log policy gradient times **FUTURE** reward
total_grad_effect = 0
for t, r in enumerate(rewards[i:]):
total_grad_effect += r * (GAMMA ** r)
self.w += LEARNING_RATE * grads[i] * total_grad_effect
def main(argv):
env = gym.make('CartPole-v0')
np.random.seed(1)
agent = Agent()
complete_scores = []
for e in range(NUM_EPISODES):
state = env.reset()[None, :]
state = agent.poly.fit_transform(state)
rewards = []
grads = []
score = 0
while True:
probs = agent.policy(state)
action_space = env.action_space.n
action = np.random.choice(action_space, p=[1 - probs, probs])
next_state, reward, done, _ = env.step(action)
next_state = next_state[None, :]
next_state = agent.poly.fit_transform(next_state.reshape(1, 4))
grad = agent.grad(probs, action, state)
grads.append(grad)
rewards.append(reward)
score += reward
state = next_state
if done:
break
agent.update_with(grads, rewards)
complete_scores.append(score)
env.close()
plt.plot(np.arange(NUM_EPISODES),
complete_scores)
plt.savefig('image1.png')
if __name__ == '__main__':
main(None)
Related
I've been trying to use a Q-Learning-based approach to CartPole, but with a neural network integrated.
Here's my code:
import gymnasium as gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from gym.utils.play import play
env = gym.make("CartPole-v1")
env2 = gym.make("CartPole-v1",render_mode="human")
inputs = tf.keras.Input(shape=(4,))
x = tf.keras.layers.Dense(5, activation=tf.nn.relu)(inputs)
x = tf.keras.layers.Dense(5, activation=tf.nn.relu)(x)
outputs = tf.keras.layers.Dense(2, activation=tf.nn.softmax)(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
epsilon = 0.5 # Chance of taking a random action
discount = 0.12
learningRate = 0.8
batchSize = 10 # Number of episodes to train on at once
epCount = 1000 # Number of episodes
decayMult = (0.001/epsilon)**(1/epCount)
disMult = (0.395/discount)**(2/epCount)
mode = 'S' # Switch between Q-Learning and SARSA-Learning
optimizer = tf.optimizers.SGD(learning_rate=learningRate)
mse_loss = tf.keras.losses.MeanSquaredError()
def getNextAction(state):
if np.random.random() > epsilon:
return tf.argmax(model(state),axis=-1).numpy()[0]
else:
return np.random.randint(2)
def customLoss(state,state2,reward,action):
oldQ = model(state)[0][action]
newQ = reward + discount*(tf.reduce_max(model(state2),axis=-1))
mse = mse_loss(tf.reshape(newQ,[1]),tf.reshape(oldQ,[1]))
return tf.keras.backend.mean(mse)
'''def updateS(state,state2,reward,action,action2):
oldSARSA = q_values[state,action]
return oldSARSA + learningRate*(reward + discount*(q_values[state2,action2]) - oldSARSA)'''
rewardArr = [] # Array of rewards
lossArr = [] # Array of losses
epArr = [] # Array of episode numbers
for _ in range(epCount):
terminated = False
observation, info = env.reset() # Initializes/resets environment, initializes observation and info values with base values
rewSum = 0
lossSum = 0
observation = tf.convert_to_tensor(observation.reshape((1,4)))
while(terminated == False):
state = observation
with tf.GradientTape() as tape:
actionIndex = getNextAction(observation)
observation, reward, terminated, truncated, info = env.step(actionIndex)
observation = tf.convert_to_tensor(observation.reshape((1,4)))
loss = customLoss(state,observation,reward,actionIndex)
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
rewSum += reward
lossSum += loss.numpy()
lossArr.append(lossSum)
rewardArr.append(rewSum)
epArr.append(_+1)
#if _ <= epCount/2:
#discount*=disMult
print(f"\rEpisode: {_+1}",end="")
'''if (_+1) % checkWait == 0:
print(f"Score: {rewardAvg/checkWait}\nLoss: {lossAvg/checkWait}\nEpisode: {_+1}")
print( "=========================")
rewardAvg = 0
lossAvg = 0'''
plt.subplot(121)
plt.plot(rewardArr,'r-',label='score',linewidth=1)
plt.xlabel('Episode')
plt.legend()
plt.subplot(122)
plt.plot(lossArr,'b-',label='loss',linewidth=1)
plt.xlabel('Episode')
plt.legend()
plt.show()
#visualization
terminated = False
observation, info = env2.reset()
observation = tf.convert_to_tensor(observation.reshape((1,4)))
while(terminated == False):
actionIndex = tf.argmax(model(observation),axis=-1).numpy()[0]
observation, reward, terminated, truncated, info = env2.step(actionIndex)
observation = tf.convert_to_tensor(observation.reshape((1,4)))
I've tried changing the discount value over time, which resulted in the loss increasing the higher my discount value went.
I've tried adding more layers and/or increasing layer dimensions, which didn't seem to change anything.
I am trying to save the model using the saver (I use the save function in the DDPG class to save), but when restoring the model, the result is far from the one I saved (I save the model when the episodic reward is zero; the restore call in the code is commented out). My code is below with all the features. I use Python 3.7, gym 0.16.0, and TensorFlow 1.13.1.
import tensorflow as tf
import numpy as np
import gym
epsiode_steps = 500
# learning rate for actor
lr_a = 0.001
# learning rate for critic
lr_c = 0.002
gamma = 0.9
alpha = 0.01
memory = 10000
batch_size = 32
render = True
class DDPG(object):
def __init__(self, no_of_actions, no_of_states, a_bound, ):
self.memory = np.zeros((memory, no_of_states * 2 + no_of_actions + 1), dtype=np.float32)
# initialize pointer to point to our experience buffer
self.pointer = 0
self.sess = tf.Session()
# initialize the variance for OU process for exploring policies
self.noise_variance = 3.0
self.no_of_actions, self.no_of_states, self.a_bound = no_of_actions, no_of_states, a_bound,
self.state = tf.placeholder(tf.float32, [None, no_of_states], 's')
self.next_state = tf.placeholder(tf.float32, [None, no_of_states], 's_')
self.reward = tf.placeholder(tf.float32, [None, 1], 'r')
with tf.variable_scope('Actor'):
self.a = self.build_actor_network(self.state, scope='eval', trainable=True)
a_ = self.build_actor_network(self.next_state, scope='target', trainable=False)
with tf.variable_scope('Critic'):
q = self.build_crtic_network(self.state, self.a, scope='eval', trainable=True)
q_ = self.build_crtic_network(self.next_state, a_, scope='target', trainable=False)
self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')
# update target value
self.soft_replace = [
[tf.assign(at, (1 - alpha) * at + alpha * ae), tf.assign(ct, (1 - alpha) * ct + alpha * ce)]
for at, ae, ct, ce in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]
q_target = self.reward + gamma * q_
# compute TD error i.e actual - predicted values
td_error = tf.losses.mean_squared_error(labels=(self.reward + gamma * q_), predictions=q)
# train the critic network with adam optimizer
self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, name="adam-ink", var_list=self.ce_params)
a_loss = - tf.reduce_mean(q)
# train the actor network with adam optimizer for minimizing the loss
self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)
tf.summary.FileWriter("logs2", self.sess.graph)
# initialize all variables
self.sess.run(tf.global_variables_initializer())
# saver
self.saver = tf.train.Saver()
# self.saver.restore(self.sess, "Pendulum/nn.ckpt")
def choose_action(self, s):
a = self.sess.run(self.a, {self.state: s[np.newaxis, :]})[0]
a = np.clip(np.random.normal(a, self.noise_variance), -2, 2)
return a
def learn(self):
# soft target replacement
self.sess.run(self.soft_replace)
indices = np.random.choice(memory, size=batch_size)
batch_transition = self.memory[indices, :]
batch_states = batch_transition[:, :self.no_of_states]
batch_actions = batch_transition[:, self.no_of_states: self.no_of_states + self.no_of_actions]
batch_rewards = batch_transition[:, -self.no_of_states - 1: -self.no_of_states]
batch_next_state = batch_transition[:, -self.no_of_states:]
self.sess.run(self.atrain, {self.state: batch_states})
self.sess.run(self.ctrain, {self.state: batch_states, self.a: batch_actions, self.reward: batch_rewards,
self.next_state: batch_next_state})
def store_transition(self, s, a, r, s_):
trans = np.hstack((s, a, [r], s_))
index = self.pointer % memory
self.memory[index, :] = trans
self.pointer += 1
if self.pointer > memory:
self.noise_variance *= 0.99995
self.learn()
def build_actor_network(self, s, scope, trainable):
# Actor DPG
with tf.variable_scope(scope):
l1 = tf.layers.dense(s, 30, activation=tf.nn.tanh, name='l1', trainable=trainable)
a = tf.layers.dense(l1, self.no_of_actions, activation=tf.nn.tanh, name='a', trainable=trainable)
return tf.multiply(a, self.a_bound, name="scaled_a")
def build_crtic_network(self, s, a, scope, trainable):
with tf.variable_scope(scope):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.no_of_states, n_l1], trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.no_of_actions, n_l1], trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
net = tf.nn.tanh(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
q = tf.layers.dense(net, 1, trainable=trainable)
return q
def save(self):
self.saver.save(self.sess, "Pendulum/nn.ckpt")
env = gym.make("Pendulum-v0")
env = env.unwrapped
env.seed(1)
no_of_states = env.observation_space.shape[0]
no_of_actions = env.action_space.shape[0]
a_bound = env.action_space.high
ddpg = DDPG(no_of_actions, no_of_states, a_bound)
total_reward = []
# set the number of episodes
no_of_episodes = 300
for i in range(no_of_episodes):
# initialize the environment
s = env.reset()
ep_reward = 0
for j in range(epsiode_steps):
env.render()
# select action by adding noise through OU process
a = ddpg.choose_action(s)
        # perform the action and move to the next state s
s_, r, done, info = env.step(a)
        # store the transition to our experience buffer
# sample some minibatch of experience and train the network
ddpg.store_transition(s, a, r, s_)
# update current state as next state
s = s_
# add episodic rewards
ep_reward += r
if int(ep_reward) == 0 and i > 150:
ddpg.save()
print("save")
quit()
if j == epsiode_steps - 1:
total_reward.append(ep_reward)
print('Episode:', i, ' Reward: %i' % int(ep_reward))
break
I solved this problem completely by rewriting the code and adding the learning function in a separate session.
I am trying to solve the CartPole problem in OpenAI Gym by training a simple 2-layer NN in PyTorch. The method used is DQN, yet results converge on a maximum score of around 8 or 9 and do not improve over time or with training. Instead, the score gets lower with training. How might this be improved, and what is wrong in the code that makes it do this? Below is the code used:
import gym
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from collections import namedtuple
import numpy as np
class network(nn.Module):
def __init__(self):
nn.Module.__init__(self)
        # the network takes the 4 components of the CartPole state as input,
        # the hidden layer then has 256 units, and the network has 2 outputs
        # (the Q-values of going left or right); the index of the output is the action
self.l1 = nn.Linear(4, 256)
self.l2 = nn.Linear(256, 2)
def forward(self, x):
# forward function defines how the model will run
x = F.relu(self.l1(x))
x = self.l2(x)
return (x)
class replay_memory():
def __init__(self, capacity):
self.capacity = capacity
self.memory = []
def save(self, transition):
# saves all transitions for the environment in a tensor
self.memory.append(transition)
if len(self.memory) > self.capacity:
del self.memory[0]
def sample(self, batch_size):
# generates a random sample from the memory
return random.sample(self.memory, batch_size)
def __len__(self):
return len(self.memory)
class agent():
def __init__(self, env, model):
self.epsilon = 1 # exploration rate
self.epsilon_min = 0.001 # smallest exploration value
self.epsilon_decay = 0.995 # rate at which exploration occurs
self.learning_rate = 0.001
def act(self, state, model):
# define actions, random or optimal based on exploration rate DOES NOT ACCOUNT FOR THE DECAY
if random.uniform(0, 1) <= self.epsilon:
action = torch.LongTensor([[random.randrange(2)]])
action_np = (action.numpy())[0][0]
else:
action = model(Variable(torch.FloatTensor([state])).type(torch.FloatTensor)).max(1)[1].view(1,1)
action_np = (action.numpy())[0][0]
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
else:
self.epsilon = self.epsilon_min
return action, action_np
def trained_act(self, episodes, network, env):
for e in range (episodes):
state = env.reset()
for t in range (200):
action, action_np = agent.act(state, network)
next_state, reward, done, info = env.step(action_np)
env.render()
if done:
break
print(t)
env.close()
def learn(batch_size, gamma, memory, optimizer):
BATCH_SIZE = batch_size
if len(memory) < BATCH_SIZE:
return
# random transition batch is taken from experience replay memory.
transitions = memory.sample(BATCH_SIZE)
batch_state, batch_action, batch_reward, batch_next_state = zip(*transitions)
batch_state = Variable(torch.cat(batch_state))
batch_action = Variable(torch.cat(batch_action))
batch_reward = Variable(torch.cat(batch_reward))
batch_next_state = Variable(torch.cat(batch_next_state))
current_q_values = network.forward(batch_state).gather(1, batch_action.unsqueeze(-1))
max_next_q_values = network.forward(batch_next_state).detach().max(1)[0]
expected_q_values = batch_reward + (gamma * max_next_q_values)
# loss is measured from error between current and newly expected Q values
loss = F.smooth_l1_loss(expected_q_values, current_q_values)
# backpropagation of loss to NN
optimizer.zero_grad()
loss.backward()
optimizer.step()
return loss
env = gym.make('CartPole-v0')
env.reset()
network = network()
agent = agent(env, network)
batch_size = 50
episode = 500
T = 200
gamma = 0.95
memory = replay_memory(100)
optimizer = optim.SGD(network.parameters(), 0.001)
l = []
s = []
for e in range (episode):
state = env.reset()
for t in range (T):
action, action_np = agent.act(state, network)
next_state, reward, done, info = env.step(action_np)
if done:
reward = -2
transition = torch.FloatTensor([state]),torch.LongTensor([action]), torch.FloatTensor([reward]),torch.FloatTensor([next_state])
memory.save(transition)
state = next_state
loss = learn(batch_size, gamma, memory, optimizer)
l.append(loss)
if done:
print('Loss = {}, Episode = {}, finsited after {} steps'.format(loss, e, t))
s.append(t)
break
I would rewrite your training algorithm to:
for e in range (episode):
state = env.reset()
done = False
t = 0
while not done:
action, action_np = agent.act(state, network)
next_state, reward, done, info = env.step(action_np)
transition = torch.FloatTensor([state]),torch.LongTensor([action]), torch.FloatTensor([reward]),torch.FloatTensor([next_state])
memory.save(transition)
state = next_state
loss = learn(batch_size, gamma, memory, optimizer)
l.append(loss)
if t < T:
t += 1
else:
done = True
if done:
print('Loss = {}, Episode = {}, finsited after {} steps'.format(loss, e, t))
s.append(t)
break
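As I read it, the main differences from the original loop are that the reward is no longer overwritten with -2 on termination, and the 200-step cap is handled explicitly with the t counter instead of relying on the inner range(T).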
Hi, I'm trying to train a DQN to solve Gym's CartPole problem.
For some reason the loss looks like this (orange line). Can y'all take a look at my code and help with this? I've played around with the hyperparameters a decent bit, so I don't think they're the issue here.
class DQN(nn.Module):
def __init__(self, input_dim, output_dim):
super(DQN, self).__init__()
self.linear1 = nn.Linear(input_dim, 16)
self.linear2 = nn.Linear(16, 32)
self.linear3 = nn.Linear(32, 32)
self.linear4 = nn.Linear(32, output_dim)
def forward(self, x):
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = F.relu(self.linear3(x))
return self.linear4(x)
final_epsilon = 0.05
initial_epsilon = 1
epsilon_decay = 5000
global steps_done
steps_done = 0
def select_action(state):
global steps_done
sample = random.random()
eps_threshold = final_epsilon + (initial_epsilon - final_epsilon) * \
math.exp(-1. * steps_done / epsilon_decay)
if sample > eps_threshold:
with torch.no_grad():
state = torch.Tensor(state)
steps_done += 1
q_calc = model(state)
node_activated = int(torch.argmax(q_calc))
return node_activated
else:
node_activated = random.randint(0,1)
steps_done += 1
return node_activated
class ReplayMemory(object): # Stores [state, reward, action, next_state, done]
def __init__(self, capacity):
self.capacity = capacity
self.memory = [[],[],[],[],[]]
def push(self, data):
"""Saves a transition."""
for idx, point in enumerate(data):
#print("Col {} appended {}".format(idx, point))
self.memory[idx].append(point)
def sample(self, batch_size):
rows = random.sample(range(0, len(self.memory[0])), batch_size)
experiences = [[],[],[],[],[]]
for row in rows:
for col in range(5):
experiences[col].append(self.memory[col][row])
return experiences
def __len__(self):
return len(self.memory[0])
input_dim, output_dim = 4, 2
model = DQN(input_dim, output_dim)
target_net = DQN(input_dim, output_dim)
target_net.load_state_dict(model.state_dict())
target_net.eval()
tau = 2
discount = 0.99
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
memory = ReplayMemory(65536)
BATCH_SIZE = 128
def optimize_model():
if len(memory) < BATCH_SIZE:
return 0
experiences = memory.sample(BATCH_SIZE)
state_batch = torch.Tensor(experiences[0])
action_batch = torch.LongTensor(experiences[1]).unsqueeze(1)
reward_batch = torch.Tensor(experiences[2])
next_state_batch = torch.Tensor(experiences[3])
done_batch = experiences[4]
pred_q = model(state_batch).gather(1, action_batch)
next_state_q_vals = torch.zeros(BATCH_SIZE)
for idx, next_state in enumerate(next_state_batch):
if done_batch[idx] == True:
next_state_q_vals[idx] = -1
else:
# .max in pytorch returns (values, idx), we only want vals
next_state_q_vals[idx] = (target_net(next_state_batch[idx]).max(0)[0]).detach()
better_pred = (reward_batch + next_state_q_vals).unsqueeze(1)
loss = F.smooth_l1_loss(pred_q, better_pred)
optimizer.zero_grad()
loss.backward()
for param in model.parameters():
param.grad.data.clamp_(-1, 1)
optimizer.step()
return loss
points = []
losspoints = []
#save_state = torch.load("models/DQN_target_11.pth")
#model.load_state_dict(save_state['state_dict'])
#optimizer.load_state_dict(save_state['optimizer'])
env = gym.make('CartPole-v0')
for i_episode in range(5000):
observation = env.reset()
episode_loss = 0
    if i_episode % tau == 0:
target_net.load_state_dict(model.state_dict())
for t in range(1000):
#env.render()
state = observation
action = select_action(observation)
observation, reward, done, _ = env.step(action)
if done:
next_state = [0,0,0,0]
else:
next_state = observation
memory.push([state, action, reward, next_state, done])
        episode_loss = episode_loss + float(optimize_model())
if done:
points.append((i_episode, t+1))
print("Episode {} finished after {} timesteps".format(i_episode, t+1))
print("Avg Loss: ", episode_loss / (t+1))
losspoints.append((i_episode, episode_loss / (t+1)))
if (i_episode % 100 == 0):
eps = final_epsilon + (initial_epsilon - final_epsilon) * \
math.exp(-1. * steps_done / epsilon_decay)
print(eps)
if ((i_episode+1) % 5001 == 0):
save = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
torch.save(save, "models/DQN_target_" + str(i_episode // 5000) + ".pth")
break
env.close()
x = [coord[0] * 100 for coord in points]
y = [coord[1] for coord in points]
x2 = [coord[0] * 100 for coord in losspoints]
y2 = [coord[1] for coord in losspoints]
plt.plot(x, y)
plt.plot(x2, y2)
plt.show()
I basically followed the tutorial pytorch has, except using the state returned by the env rather than the pixels. I also changed the replay memory because I was having issues there. Other than that, I left everything else pretty much the same.
Edit:
I tried overfitting on a small batch and the Loss looks like this without updating the target net and this when updating it
Edit 2:
This is definitely an issue with the target net, I tried removing it and loss seemed to not increase exponentially
Your tau value is too small; updating the target network that frequently makes DQN training unstable. You can try 1000 (OpenAI Baselines' DQN example) or 10000 (DeepMind's Nature paper).
DeepMind's 2015 Nature paper states:
The second modification to online Q-learning aimed at further improving the stability of our method with neural networks is to use a separate network for generating the target yj in the Q-learning update. More precisely, every C updates we clone the network Q to obtain a target network Q' and use Q' for generating the Q-learning targets yj for the following C updates to Q.
This modification makes the algorithm more stable compared to standard online Q-learning, where an update that increases Q(st,at) often also increases Q(st+1, a) for all a and hence also increases the target yj, possibly leading to oscillations or divergence of the policy. Generating the targets using the older set of parameters adds a delay between the time an update to Q is made and the time the update affects the targets yj, making divergence or oscillations much more unlikely.
Human-level control through deep reinforcement learning, Mnih et al., 2015
I've run your code with settings of tau=2, tau=10, tau=100, tau=1000, and tau=10000. The update frequency of tau=100 solves the problem (it reaches the maximum of 200 steps).
tau=2
tau=10
tau=100
tau=1000
tau=10000
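If you only want the minimal edit rather than diffing the two full listings, the change (as I read it) comes down to the target-sync schedule checked at the top of each episode. A stripped-down, self-contained sketch of just that schedule, with small linear layers standing in for the real model and target_net:

import torch.nn as nn

tau = 100                      # sync every 100 episodes instead of every 2
model = nn.Linear(4, 2)        # stand-in for the online DQN
target_net = nn.Linear(4, 2)   # stand-in for the target DQN

for i_episode in range(5000):
    if i_episode % tau == 0:
        # hard update: copy the online network's weights into the target network
        target_net.load_state_dict(model.state_dict())
    # acting, storing transitions and optimize_model() proceed exactly as in
    # the full code below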
Below is the modified version of your code.
import random
import math
import matplotlib.pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
import gym
class DQN(nn.Module):
def __init__(self, input_dim, output_dim):
super(DQN, self).__init__()
self.linear1 = nn.Linear(input_dim, 16)
self.linear2 = nn.Linear(16, 32)
self.linear3 = nn.Linear(32, 32)
self.linear4 = nn.Linear(32, output_dim)
def forward(self, x):
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = F.relu(self.linear3(x))
return self.linear4(x)
final_epsilon = 0.05
initial_epsilon = 1
epsilon_decay = 5000
global steps_done
steps_done = 0
def select_action(state):
global steps_done
sample = random.random()
eps_threshold = final_epsilon + (initial_epsilon - final_epsilon) * \
math.exp(-1. * steps_done / epsilon_decay)
if sample > eps_threshold:
with torch.no_grad():
state = torch.Tensor(state)
steps_done += 1
q_calc = model(state)
node_activated = int(torch.argmax(q_calc))
return node_activated
else:
node_activated = random.randint(0,1)
steps_done += 1
return node_activated
class ReplayMemory(object): # Stores [state, reward, action, next_state, done]
def __init__(self, capacity):
self.capacity = capacity
self.memory = [[],[],[],[],[]]
def push(self, data):
"""Saves a transition."""
for idx, point in enumerate(data):
#print("Col {} appended {}".format(idx, point))
self.memory[idx].append(point)
def sample(self, batch_size):
rows = random.sample(range(0, len(self.memory[0])), batch_size)
experiences = [[],[],[],[],[]]
for row in rows:
for col in range(5):
experiences[col].append(self.memory[col][row])
return experiences
def __len__(self):
return len(self.memory[0])
input_dim, output_dim = 4, 2
model = DQN(input_dim, output_dim)
target_net = DQN(input_dim, output_dim)
target_net.load_state_dict(model.state_dict())
target_net.eval()
tau = 100
discount = 0.99
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
memory = ReplayMemory(65536)
BATCH_SIZE = 128
def optimize_model():
if len(memory) < BATCH_SIZE:
return 0
experiences = memory.sample(BATCH_SIZE)
state_batch = torch.Tensor(experiences[0])
action_batch = torch.LongTensor(experiences[1]).unsqueeze(1)
reward_batch = torch.Tensor(experiences[2])
next_state_batch = torch.Tensor(experiences[3])
done_batch = experiences[4]
pred_q = model(state_batch).gather(1, action_batch)
next_state_q_vals = torch.zeros(BATCH_SIZE)
for idx, next_state in enumerate(next_state_batch):
if done_batch[idx] == True:
next_state_q_vals[idx] = -1
else:
# .max in pytorch returns (values, idx), we only want vals
next_state_q_vals[idx] = (target_net(next_state_batch[idx]).max(0)[0]).detach()
better_pred = (reward_batch + next_state_q_vals).unsqueeze(1)
loss = F.smooth_l1_loss(pred_q, better_pred)
optimizer.zero_grad()
loss.backward()
for param in model.parameters():
param.grad.data.clamp_(-1, 1)
optimizer.step()
return loss
points = []
losspoints = []
#save_state = torch.load("models/DQN_target_11.pth")
#model.load_state_dict(save_state['state_dict'])
#optimizer.load_state_dict(save_state['optimizer'])
env = gym.make('CartPole-v0')
for i_episode in range(5000):
observation = env.reset()
episode_loss = 0
if i_episode % tau == 0:
target_net.load_state_dict(model.state_dict())
for t in range(1000):
#env.render()
state = observation
action = select_action(observation)
observation, reward, done, _ = env.step(action)
if done:
next_state = [0,0,0,0]
else:
next_state = observation
memory.push([state, action, reward, next_state, done])
episode_loss = episode_loss + float(optimize_model())
if done:
points.append((i_episode, t+1))
print("Episode {} finished after {} timesteps".format(i_episode, t+1))
print("Avg Loss: ", episode_loss / (t+1))
losspoints.append((i_episode, episode_loss / (t+1)))
if (i_episode % 100 == 0):
eps = final_epsilon + (initial_epsilon - final_epsilon) * \
math.exp(-1. * steps_done / epsilon_decay)
print(eps)
if ((i_episode+1) % 5001 == 0):
save = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
torch.save(save, "models/DQN_target_" + str(i_episode // 5000) + ".pth")
break
env.close()
x = [coord[0] * 100 for coord in points]
y = [coord[1] for coord in points]
x2 = [coord[0] * 100 for coord in losspoints]
y2 = [coord[1] for coord in losspoints]
plt.plot(x, y)
plt.plot(x2, y2)
plt.show()
Here's the result of your plotting code.
tau=100
tau=10000
So basically, if the weights, which are initially assigned randomly, are within a range of about -0.4 to 0.4 (for w0, for example), then they will change and the accuracy will improve. However, if a weight is assigned a random number outside this range, then it won't change at all. I can't figure it out. Any suggestions would be great :)
import numpy as np
from matplotlib import pyplot as plt
class NN_model:
def __init__(self, data):
self.data = data
self.w0 = np.random.randn()
self.w1 = np.random.randn()
self.bias = np.random.randn()
self.trained = False
def sigmoid(self, x):
return 1 / (1 + np.exp(-x))
def sigmoid_p(self, x):
return self.sigmoid(x) * (1 - self.sigmoid(x))
def calculate_loss(self, pred, target):
loss = np.square(target - pred)
return loss
def training_loop(self, data):
costs = []
learning_rate = 0.005
print(self.w0)
if self.trained == True:
print('Model already trained')
pass
else:
for i in range(50):
for i in range(100000):
ri = np.random.randint(len(data))
point = data[ri]
sig_out = ((self.w0 * point[0]) + (self.w1 * point[1]) + self.bias)
pred = self.sigmoid(sig_out)
cost = self.calculate_loss(pred, point[2])
costs.append(cost)
dcost_dpred = 2 * (pred - point[2])
dpred_dsigout = self.sigmoid_p(sig_out)
dsigout_dw0 = point[0]
dsigout_dw1 = point[1]
dsigout_dbias = 1
dcost_dw0 = dcost_dpred * dpred_dsigout * dsigout_dw0
dcost_dw1 = dcost_dpred * dpred_dsigout * dsigout_dw1
dcost_dbias = dcost_dpred * dpred_dsigout * dsigout_dbias
self.w0 += - learning_rate * dcost_dw0
self.w1 += - learning_rate * dcost_dw0
self.bias += - learning_rate * dcost_dbias
print(self.w0, self.w1, self.bias)
#-0.0752623452445784 0.2447376547554179 4.032995041915469
#-0.3042823068224879 0.015717693177505765 18.643149928253827
self.trained = True
plt.plot(costs)
plt.show()
def predict(self, test_data):
if self.trained == True:
pred = self.sigmoid( (test_data[0] * self.w0) + (test_data[1] * self.w1) + self.bias )
print(pred)
if pred > 0.5:
print('Woman')
else:
print('Man')
else:
print('Error: Model has not been trained yet')
You have to have an intuitive understanding of how neural networks work behind the scenes. There is no rule of thumb for deciding the best initial values for your weights.
It might be that some initial weights pull you toward good accuracy really fast, but in your example it seems that, with weights drawn from a wide range and a learning rate of 0.005, it simply takes a long time until your weights move appreciably. The learning rate therefore also plays a crucial role; try tweaking that as well.
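To make the "weights barely move" behaviour concrete, here is a small illustration of my own (reusing the question's sigmoid_p): every weight update is scaled by sigmoid_p(sig_out), and that factor collapses once the pre-activation drifts away from zero, so with learning_rate = 0.005 the per-step changes become negligible.

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_p(x):
    # derivative of the sigmoid, as in the question's code
    return sigmoid(x) * (1 - sigmoid(x))

# The update to w0/w1/bias is proportional to sigmoid_p(sig_out);
# the factor shrinks rapidly as |sig_out| grows.
for z in [0.0, 2.0, 5.0, 10.0, 20.0]:
    print(f"sig_out = {z:5.1f}  ->  sigmoid_p = {sigmoid_p(z):.2e}")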