Machine Learning reward artificially capping - python

So when I run this, it works perfectly, however, for some reason the reward caps at 200. I'm not sure what could be causing this. I'm new to machine learning and this is my first project, so sorry if I am missing something stupid.I hypothesize that done is triggering before I want it too, but playing with that hasn't led to anything. Thanks so much.
import gym
import tensorflow as tf
import numpy as np
import os
import sys
env = gym.make('CartPole-v0')
discount_rate=.95
# TODO Build the policy gradient neural network
class Agent:
def __init__(self, num_actions, state_size):
initializer = tf.contrib.layers.xavier_initializer()
self.input_layer = tf.placeholder(dtype=tf.float32, shape=[None, state_size])
# Neural net starts here
hidden_layer = tf.layers.dense(self.input_layer, 8, activation=tf.nn.relu, kernel_initializer=initializer)
hidden_layer_2 = tf.layers.dense(hidden_layer, 8, activation=tf.nn.relu, kernel_initializer=initializer)
# Output of neural net
out = tf.layers.dense(hidden_layer_2, num_actions, activation=None)
self.outputs = tf.nn.softmax(out)
self.choice = tf.argmax(self.outputs, axis=1)
# Training Procedure
self.rewards = tf.placeholder(shape=[None, ], dtype=tf.float32)
self.actions = tf.placeholder(shape=[None, ], dtype=tf.int32)
one_hot_actions = tf.one_hot(self.actions, num_actions)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=one_hot_actions)
self.loss = tf.reduce_mean(cross_entropy * self.rewards)
self.gradients = tf.gradients(self.loss, tf.trainable_variables())
# Create a placeholder list for gradients
self.gradients_to_apply = []
for index, variable in enumerate(tf.trainable_variables()):
gradient_placeholder = tf.placeholder(tf.float32)
self.gradients_to_apply.append(gradient_placeholder)
# Create the operation to update gradients with the gradients placeholder.
optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)
self.update_gradients =
optimizer.apply_gradients(zip(self.gradients_to_apply, tf.trainable_variables()))
def discount_normalize_rewards(rewards):
discounted_rewards = np.zeros_like(rewards)
total_rewards = 0
for i in reversed(range(len(rewards))):
total_rewards = total_rewards * discount_rate + rewards[i]
discounted_rewards[i] = total_rewards
discounted_rewards -= np.mean(discounted_rewards)
discounted_rewards /= np.std(discounted_rewards)
return discounted_rewards
#initialize the training loop
tf.reset_default_graph()
# Modify these to match shape of actions and states in your environment
num_actions = 2
state_size = 4
path = "./cartpole-pg/"
training_episodes = 1000
max_steps_per_episode = 20000
episode_batch_size = 5
agent = Agent(num_actions, state_size)
init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=2)
if not os.path.exists(path):
os.makedirs(path)
with tf.Session() as sess:
sess.run(init)
total_episode_rewards = []
# Create a buffer of 0'd gradients
gradient_buffer = sess.run(tf.trainable_variables())
for index, gradient in enumerate(gradient_buffer):
gradient_buffer[index] = gradient * 0
for episode in range(training_episodes):
state = env.reset()
episode_history = []
episode_rewards = 0
for step in range(max_steps_per_episode):
if episode % 100 == 0:
env.render()
# Get weights for each action
action_probabilities = sess.run(agent.outputs, feed_dict={agent.input_layer: [state]})
action_choice = np.random.choice(range(num_actions), p=action_probabilities[0])
state_next, reward, done, _ = env.step(action_choice)
episode_history.append([state, action_choice, reward, state_next])
state = state_next
episode_rewards += reward
if done:
total_episode_rewards.append(episode_rewards)
episode_history = np.array(episode_history)
episode_history[:,2] = discount_normalize_rewards(episode_history[:,2])
ep_gradients = sess.run(agent.gradients, feed_dict={agent.input_layer: np.vstack(episode_history[:, 0]),
agent.actions: episode_history[:, 1],
agent.rewards: episode_history[:, 2]})
# add the gradients to the grad buffer:
for index, gradient in enumerate(ep_gradients):
gradient_buffer[index] += gradient
break
if episode % episode_batch_size == 0:
feed_dict_gradients = dict(zip(agent.gradients_to_apply, gradient_buffer))
sess.run(agent.update_gradients, feed_dict=feed_dict_gradients)
for index, gradient in enumerate(gradient_buffer):
gradient_buffer[index] = gradient * 0
if episode % 1 == 0:
saver.save(sess, path + "pg-checkpoint", episode)
print("Reward: " + str(total_episode_rewards[-1:]))
env.close()

Episodes for Cartpole terminate when the pole falls and at 200 successful steps. See the max_episode_steps in the linked file if you want to change this. The reason there is a 200 step max is to make evaluating trials easier (ie you always get episode ends so you can evaluate episode stats) and so that the environment doesn't get stuck in a never ending trial.
register(
id='CartPole-v0',
entry_point='gym.envs.classic_control:CartPoleEnv',
max_episode_steps=200,
reward_threshold=195.0,)

Related

Why isn't my CartPole code working? I'm using a neural network with a loss function based off of the Bellman equation

I've been trying to use a Q-Learning based approach to CartPole, but with integration of neural networks.
Here's my code:
import gymnasium as gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from gym.utils.play import play
env = gym.make("CartPole-v1")
env2 = gym.make("CartPole-v1",render_mode="human")
inputs = tf.keras.Input(shape=(4,))
x = tf.keras.layers.Dense(5, activation=tf.nn.relu)(inputs)
x = tf.keras.layers.Dense(5, activation=tf.nn.relu)(x)
outputs = tf.keras.layers.Dense(2, activation=tf.nn.softmax)(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
epsilon = 0.5 # Chance of taking a random action
discount = 0.12
learningRate = 0.8
batchSize = 10 # Number of episodes to train on at once
epCount = 1000 # Number of episodes
decayMult = (0.001/epsilon)**(1/epCount)
disMult = (0.395/discount)**(2/epCount)
mode = 'S' # Switch between Q-Learning and SARSA-Learning
optimizer = tf.optimizers.SGD(learning_rate=learningRate)
mse_loss = tf.keras.losses.MeanSquaredError()
def getNextAction(state):
if np.random.random() > epsilon:
return tf.argmax(model(state),axis=-1).numpy()[0]
else:
return np.random.randint(2)
def customLoss(state,state2,reward,action):
oldQ = model(state)[0][action]
newQ = reward + discount*(tf.reduce_max(model(state2),axis=-1))
mse = mse_loss(tf.reshape(newQ,[1]),tf.reshape(oldQ,[1]))
return tf.keras.backend.mean(mse)
'''def updateS(state,state2,reward,action,action2):
oldSARSA = q_values[state,action]
return oldSARSA + learningRate*(reward + discount*(q_values[state2,action2]) - oldSARSA)'''
rewardArr = [] # Array of rewards
lossArr = [] # Array of losses
epArr = [] # Array of episode numbers
for _ in range(epCount):
terminated = False
observation, info = env.reset() # Initializes/resets environment, initializes observation and info values with base values
rewSum = 0
lossSum = 0
observation = tf.convert_to_tensor(observation.reshape((1,4)))
while(terminated == False):
state = observation
with tf.GradientTape() as tape:
actionIndex = getNextAction(observation)
observation, reward, terminated, truncated, info = env.step(actionIndex)
observation = tf.convert_to_tensor(observation.reshape((1,4)))
loss = customLoss(state,observation,reward,actionIndex)
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
rewSum += reward
lossSum += loss.numpy()
lossArr.append(lossSum)
rewardArr.append(rewSum)
epArr.append(_+1)
#if _ <= epCount/2:
#discount*=disMult
print(f"\rEpisode: {_+1}",end="")
'''if (_+1) % checkWait == 0:
print(f"Score: {rewardAvg/checkWait}\nLoss: {lossAvg/checkWait}\nEpisode: {_+1}")
print( "=========================")
rewardAvg = 0
lossAvg = 0'''
plt.subplot(121)
plt.plot(rewardArr,'r-',label='score',linewidth=1)
plt.xlabel('Episode')
plt.legend()
plt.subplot(122)
plt.plot(lossArr,'b-',label='loss',linewidth=1)
plt.xlabel('Episode')
plt.legend()
plt.show()
#visualization
terminated = False
observation, info = env2.reset()
observation = tf.convert_to_tensor(observation.reshape((1,4)))
while(terminated == False):
actionIndex = tf.argmax(model(observation),axis=-1).numpy()[0]
observation, reward, terminated, truncated, info = env2.step(actionIndex)
observation = tf.convert_to_tensor(observation.reshape((1,4)))
I've tried changing discount value over time, which resulted in an increasing loss the higher my discount value.
I've tried adding more layers and/or increasing layer dimensions, which didn't seem to change anything.

How can I save DDPG model?

I try to save the model using the saver method (I use the save function in the DDPG class to save), but when restoring the model, the result is far from the one I saved (I save the model when the episodic award is zero, the restor method in the code is commented out ) My code is below with all the features. I use Python 3.7, gym 0.16.0 and TensorFlow version 1.13.1
import tensorflow as tf
import numpy as np
import gym
epsiode_steps = 500
# learning rate for actor
lr_a = 0.001
# learning rate for critic
lr_c = 0.002
gamma = 0.9
alpha = 0.01
memory = 10000
batch_size = 32
render = True
class DDPG(object):
def __init__(self, no_of_actions, no_of_states, a_bound, ):
self.memory = np.zeros((memory, no_of_states * 2 + no_of_actions + 1), dtype=np.float32)
# initialize pointer to point to our experience buffer
self.pointer = 0
self.sess = tf.Session()
# initialize the variance for OU process for exploring policies
self.noise_variance = 3.0
self.no_of_actions, self.no_of_states, self.a_bound = no_of_actions, no_of_states, a_bound,
self.state = tf.placeholder(tf.float32, [None, no_of_states], 's')
self.next_state = tf.placeholder(tf.float32, [None, no_of_states], 's_')
self.reward = tf.placeholder(tf.float32, [None, 1], 'r')
with tf.variable_scope('Actor'):
self.a = self.build_actor_network(self.state, scope='eval', trainable=True)
a_ = self.build_actor_network(self.next_state, scope='target', trainable=False)
with tf.variable_scope('Critic'):
q = self.build_crtic_network(self.state, self.a, scope='eval', trainable=True)
q_ = self.build_crtic_network(self.next_state, a_, scope='target', trainable=False)
self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')
# update target value
self.soft_replace = [
[tf.assign(at, (1 - alpha) * at + alpha * ae), tf.assign(ct, (1 - alpha) * ct + alpha * ce)]
for at, ae, ct, ce in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]
q_target = self.reward + gamma * q_
# compute TD error i.e actual - predicted values
td_error = tf.losses.mean_squared_error(labels=(self.reward + gamma * q_), predictions=q)
# train the critic network with adam optimizer
self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, name="adam-ink", var_list=self.ce_params)
a_loss = - tf.reduce_mean(q)
# train the actor network with adam optimizer for minimizing the loss
self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)
tf.summary.FileWriter("logs2", self.sess.graph)
# initialize all variables
self.sess.run(tf.global_variables_initializer())
# saver
self.saver = tf.train.Saver()
# self.saver.restore(self.sess, "Pendulum/nn.ckpt")
def choose_action(self, s):
a = self.sess.run(self.a, {self.state: s[np.newaxis, :]})[0]
a = np.clip(np.random.normal(a, self.noise_variance), -2, 2)
return a
def learn(self):
# soft target replacement
self.sess.run(self.soft_replace)
indices = np.random.choice(memory, size=batch_size)
batch_transition = self.memory[indices, :]
batch_states = batch_transition[:, :self.no_of_states]
batch_actions = batch_transition[:, self.no_of_states: self.no_of_states + self.no_of_actions]
batch_rewards = batch_transition[:, -self.no_of_states - 1: -self.no_of_states]
batch_next_state = batch_transition[:, -self.no_of_states:]
self.sess.run(self.atrain, {self.state: batch_states})
self.sess.run(self.ctrain, {self.state: batch_states, self.a: batch_actions, self.reward: batch_rewards,
self.next_state: batch_next_state})
def store_transition(self, s, a, r, s_):
trans = np.hstack((s, a, [r], s_))
index = self.pointer % memory
self.memory[index, :] = trans
self.pointer += 1
if self.pointer > memory:
self.noise_variance *= 0.99995
self.learn()
def build_actor_network(self, s, scope, trainable):
# Actor DPG
with tf.variable_scope(scope):
l1 = tf.layers.dense(s, 30, activation=tf.nn.tanh, name='l1', trainable=trainable)
a = tf.layers.dense(l1, self.no_of_actions, activation=tf.nn.tanh, name='a', trainable=trainable)
return tf.multiply(a, self.a_bound, name="scaled_a")
def build_crtic_network(self, s, a, scope, trainable):
with tf.variable_scope(scope):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.no_of_states, n_l1], trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.no_of_actions, n_l1], trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
net = tf.nn.tanh(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
q = tf.layers.dense(net, 1, trainable=trainable)
return q
def save(self):
self.saver.save(self.sess, "Pendulum/nn.ckpt")
env = gym.make("Pendulum-v0")
env = env.unwrapped
env.seed(1)
no_of_states = env.observation_space.shape[0]
no_of_actions = env.action_space.shape[0]
a_bound = env.action_space.high
ddpg = DDPG(no_of_actions, no_of_states, a_bound)
total_reward = []
# set the number of episodes
no_of_episodes = 300
for i in range(no_of_episodes):
# initialize the environment
s = env.reset()
ep_reward = 0
for j in range(epsiode_steps):
env.render()
# select action by adding noise through OU process
a = ddpg.choose_action(s)
# peform the action and move to the next state s
s_, r, done, info = env.step(a)
# store the the transition to our experience buffer
# sample some minibatch of experience and train the network
ddpg.store_transition(s, a, r, s_)
# update current state as next state
s = s_
# add episodic rewards
ep_reward += r
if int(ep_reward) == 0 and i > 150:
ddpg.save()
print("save")
quit()
if j == epsiode_steps - 1:
total_reward.append(ep_reward)
print('Episode:', i, ' Reward: %i' % int(ep_reward))
break
I solved this problem completely by rewriting the code and adding the learning function in a separate session

Score being minimised by pytorch NN with the Cartpole problem

I am trying to solve the CartPole problem in openAI gym by training a simple 2 layer NN in pytorch. The method used is DQN yet results converge on a maximum score of around 8 or 9 and is not seen to improve over time or with training. Instead the score gets lower with training. How this might be improved/what is wrong in the code that is making it do this? Below is the code used:
import gym
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from collections import namedtuple
import numpy as np
class network(nn.Module):
def __init__(self):
nn.Module.__init__(self)
# network takes 4 inputs (state, action, next_state, reward), hidden layer then has
# 256 inputs and the network has 2 outputs (the q value of going left or right)
# in this network the index of the output references the action.
self.l1 = nn.Linear(4, 256)
self.l2 = nn.Linear(256, 2)
def forward(self, x):
# forward function defines how the model will run
x = F.relu(self.l1(x))
x = self.l2(x)
return (x)
class replay_memory():
def __init__(self, capacity):
self.capacity = capacity
self.memory = []
def save(self, transition):
# saves all transitions for the environment in a tensor
self.memory.append(transition)
if len(self.memory) > self.capacity:
del self.memory[0]
def sample(self, batch_size):
# generates a random sample from the memory
return random.sample(self.memory, batch_size)
def __len__(self):
return len(self.memory)
class agent():
def __init__(self, env, model):
self.epsilon = 1 # exploration rate
self.epsilon_min = 0.001 # smallest exploration value
self.epsilon_decay = 0.995 # rate at which exploration occurs
self.learning_rate = 0.001
def act(self, state, model):
# define actions, random or optimal based on exploration rate DOES NOT ACCOUNT FOR THE DECAY
if random.uniform(0, 1) <= self.epsilon:
action = torch.LongTensor([[random.randrange(2)]])
action_np = (action.numpy())[0][0]
else:
action = model(Variable(torch.FloatTensor([state])).type(torch.FloatTensor)).max(1)[1].view(1,1)
action_np = (action.numpy())[0][0]
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
else:
self.epsilon = self.epsilon_min
return action, action_np
def trained_act(self, episodes, network, env):
for e in range (episodes):
state = env.reset()
for t in range (200):
action, action_np = agent.act(state, network)
next_state, reward, done, info = env.step(action_np)
env.render()
if done:
break
print(t)
env.close()
def learn(batch_size, gamma, memory, optimizer):
BATCH_SIZE = batch_size
if len(memory) < BATCH_SIZE:
return
# random transition batch is taken from experience replay memory.
transitions = memory.sample(BATCH_SIZE)
batch_state, batch_action, batch_reward, batch_next_state = zip(*transitions)
batch_state = Variable(torch.cat(batch_state))
batch_action = Variable(torch.cat(batch_action))
batch_reward = Variable(torch.cat(batch_reward))
batch_next_state = Variable(torch.cat(batch_next_state))
current_q_values = network.forward(batch_state).gather(1, batch_action.unsqueeze(-1))
max_next_q_values = network.forward(batch_next_state).detach().max(1)[0]
expected_q_values = batch_reward + (gamma * max_next_q_values)
# loss is measured from error between current and newly expected Q values
loss = F.smooth_l1_loss(expected_q_values, current_q_values)
# backpropagation of loss to NN
optimizer.zero_grad()
loss.backward()
optimizer.step()
return loss
env = gym.make('CartPole-v0')
env.reset()
network = network()
agent = agent(env, network)
batch_size = 50
episode = 500
T = 200
gamma = 0.95
memory = replay_memory(100)
optimizer = optim.SGD(network.parameters(), 0.001)
l = []
s = []
for e in range (episode):
state = env.reset()
for t in range (T):
action, action_np = agent.act(state, network)
next_state, reward, done, info = env.step(action_np)
if done:
reward = -2
transition = torch.FloatTensor([state]),torch.LongTensor([action]), torch.FloatTensor([reward]),torch.FloatTensor([next_state])
memory.save(transition)
state = next_state
loss = learn(batch_size, gamma, memory, optimizer)
l.append(loss)
if done:
print('Loss = {}, Episode = {}, finsited after {} steps'.format(loss, e, t))
s.append(t)
break
I would rewrite your training algorithm to:
for e in range (episode):
state = env.reset()
done = False
t = 0
while not done:
action, action_np = agent.act(state, network)
next_state, reward, done, info = env.step(action_np)
transition = torch.FloatTensor([state]),torch.LongTensor([action]), torch.FloatTensor([reward]),torch.FloatTensor([next_state])
memory.save(transition)
state = next_state
loss = learn(batch_size, gamma, memory, optimizer)
l.append(loss)
if t < T:
t += 1
else:
done = True
if done:
print('Loss = {}, Episode = {}, finsited after {} steps'.format(loss, e, t))
s.append(t)
break

Adding neurons to Adam optimizer state in Pytorch

I've posted the following to the Pytorch discussion board too. I'm trying to keep the learning-rates per parameter for the already existing parameters when adding more neurons (to existing layers, not new layers) to a network. I’ve written the following class which allows me to add neurons to hidden layers during training:
import torch
import torch.nn as nn
class DQN(nn.Module):
def __init__(self, num_inputs, hidden, num_actions, non_linearity):
super(DQN, self).__init__()
self.num_inputs = num_inputs
self.hidden = hidden
self.num_actions = num_actions
self.non_linearity = non_linearity
self.layers = nn.ModuleList()
self.layers.append(nn.Linear(num_inputs, self.hidden[0]))
previous = self.hidden[0]
for hidden_layer_size in self.hidden[1:]:
self.layers.append(nn.Linear(previous, hidden_layer_size))
previous = hidden_layer_size
self.layers.append(nn.Linear(previous, num_actions))
def forward(self, x):
for i in range(len(self.layers) - 1):
x = self.non_linearity(self.layers[i](x))
return self.layers[-1](x)
def increase_capacity(self, increment):
for i in range(len(self.hidden)):
self.hidden[i] += increment[i]
bias = self.layers[0].bias.data
weight = self.layers[0].weight.data
self.layers[0] = nn.Linear(self.num_inputs, self.hidden[0])
if increment[0]>0:
self.layers[0].weight.data[0:-increment[0],:] = weight
self.layers[0].bias.data[0:-increment[0]] = bias
else:
self.layers[0].weight.data[0:,:] = weight
self.layers[0].weight.data = bias
for i in range(1, len(self.layers) - 1):
bias = self.layers[i].bias.data
weight = self.layers[i].weight.data
self.layers[i] = nn.Linear(self.hidden[i-1], self.hidden[i])
if increment[i] > 0:
if increment[i-1] >0:
self.layers[i].bias.data[0:-increment[i]] = bias
self.layers[i].weight.data[0:-increment[i],0:-increment[i-1]] = weight
else:
self.layers[i].bias.data[0:-increment[i]] = bias
self.layers[i].weight.data[0:-increment[i],0:] = weight
else:
if increment[i-1] >0:
self.layers[i].bias.data = bias
self.layers[i].weight.data[0:,0:-increment[i-1]] = weight
else:
self.layers[i].bias.data = bias
self.layers[i].weight.data[0:,0:] = weight
bias = self.layers[-1].bias.data
weight = self.layers[-1].weight.data
self.layers[-1] = nn.Linear(self.hidden[-1], self.num_actions)
if increment[-1] >0:
self.layers[-1].bias.data = bias
self.layers[-1].weight.data[:,0:-increment[-1]] = weight
else:
self.layers[-1].bias.data = bias
self.layers[-1].weight.data[:,0:] = weight
def act(self, state, epsilon, mask):
if np.random.rand() > epsilon:
state = torch.tensor([state], dtype=torch.float32, device=device)
mask = torch.tensor([mask], dtype=torch.float32, device=device)
q_values = self.forward(state) + mask
action = q_values.max(1)[1].view(1, 1).item()
else:
action = np.random.randint(self.num_actions)
return action
Now I’ve written a little sanity check (whether it leads to sanity is questionable at this point): a network with 2 layers with both 1 neuron should fail to learn the x-or function, whereas a network where 4 neurons have been added should. If I initialise a new optimiser this indeed works. The optimiser I use is Adam, which keeps track of learning-rates per parameter. I’d like to keep the learning-rates of Adam for the weights and biases that already existed before I add additional neurons. The following is my failed attempt to doing so:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
# Credits to Alvations
def generate_zero():
return random.uniform(0, 49) / 100
def generate_one():
return random.uniform(50, 100) / 100
def generate_xor_XY(num_data_points):
Xs, Ys = [], []
for _ in range(num_data_points):
# xor(0, 0) -> 0
Xs.append([generate_zero(), generate_zero()]); Ys.append([0])
# xor(1, 0) -> 1
Xs.append([generate_one(), generate_zero()]); Ys.append([1])
# xor(0, 1) -> 1
Xs.append([generate_zero(), generate_one()]); Ys.append([1])
# xor(1, 1) -> 0
Xs.append([generate_one(), generate_one()]); Ys.append([0])
return Xs, Ys
# Initialisation
network = DQN(2,[1,1],1,F.relu)
# optimizer = optim.Adam(network.parameters(), amsgrad=False)
optimizer = optim.Adam(network.parameters(), amsgrad=True)
criterion = nn.MSELoss()
# Train 50000 steps to show 1 neuron cannot solve x-or task
for i in range(50000):
optimizer.zero_grad()
Xs, Ys = generate_xor_XY(1)
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys, dtype=torch.float)
prediction = network(Xs)
loss = criterion(prediction, Ys)
loss.backward()
optimizer.step()
print(network(torch.tensor([[1,0],[0,1],[1,1],[0,0]], dtype=torch.float)))
print(loss)
# Add 5 neurons to first layer
capacity = [4,4]
network.increase_capacity(capacity)
# Uncomment the following line and comment the lines following it for normal initialisation.
# optimizer = optim.Adam(network.parameters(), amsgrad=True)
nw_param = [p for p in network.parameters()]
new_param_group = []
layer_idx = 0
for idx, group in enumerate(optimizer.param_groups):
for idx_p, p in enumerate(group['params']):
# Save previous information
prev_grad = p.grad
old_p = copy.deepcopy(p)
old_state = copy.copy(optimizer.state[p])
old_step = old_state['step']
old_exp_avg = old_state['exp_avg']
old_exp_avg_sq = old_state['exp_avg_sq']
old_max_exp_avg_sq = old_state['max_exp_avg_sq']
# Remove old parameter from state
optimizer.state.pop(p)
# Weights
if p.dim()>1:
p = nn.Parameter(nw_param[layer_idx])
p.grad = torch.zeros_like(p)
new_exp_avg = torch.torch.zeros_like(p)
new_exp_avg_sq = torch.torch.zeros_like(p)
new_max_exp_avg_sq = torch.torch.zeros_like(p)
p.grad[0:prev_grad.size(0),0:prev_grad.size(1)] = prev_grad
optimizer.state[p]['step'] = old_step
optimizer.state[p]['exp_avg'] = new_exp_avg
optimizer.state[p]['exp_avg'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_exp_avg
optimizer.state[p]['exp_avg_sq'] = new_exp_avg_sq
optimizer.state[p]['exp_avg_sq'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'] = new_max_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_max_exp_avg_sq
new_param_group.append(p)
# Biases
else:
p = nn.Parameter(nw_param[layer_idx])
p.grad = torch.zeros_like(p)
new_exp_avg = torch.zeros_like(p)
new_exp_avg_sq = torch.zeros_like(p)
new_max_exp_avg_sq = torch.zeros_like(p)
p.grad[0:prev_grad.size(0)] = prev_grad
optimizer.state[p]['step'] = old_step
optimizer.state[p]['exp_avg'] = new_exp_avg
optimizer.state[p]['exp_avg'][0:prev_grad.size(0)] = old_exp_avg
optimizer.state[p]['exp_avg_sq'] = new_exp_avg_sq
optimizer.state[p]['exp_avg_sq'][0:prev_grad.size(0)] = old_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'] = new_max_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'][0:prev_grad.size(0)] = old_max_exp_avg_sq
new_param_group.append(p)
layer_idx += 1
optimizer.param_groups[0]['params'] = new_param_group
print(network)
# Train 50000 steps to show by adding neurons the task can be solved
for i in range(50000):
optimizer.zero_grad()
Xs, Ys = generate_xor_XY(1)
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys, dtype=torch.float)
prediction = network(Xs)
loss = criterion(prediction, Ys)
loss.backward()
optimizer.step()
print(network(torch.tensor([[1,0],[0,1],[1,1],[0,0]], dtype=torch.float)))
print(loss)
I’m trying to get the same optimizer state, but with additional parameters for the added neurons. This seems like a convoluted way of doing it (and it doesn’t work:p). Does anyone know of an (easier) way to do this or see where I’m going wrong?

DQN TensorFlow code runs out of memory very quickly

I am trying to train a turtle bot simulation using a DQN. Turtle bot is supposed to find a target in a maze. It is fairly simple and it is converging. My problem is that after a couple of runs the training will get extremely slow. It is fast at the beginning but it gets very slow after 50ish runs. I have check the problem, my CPU is not even used 50% but my memory is eaten up and about 98% of memory is occupied. Somewhere in my code I am leaking memory and I think t is in the initialization of my DQN agent. Can you please guide me on what is the problem and how can I fix it.
Thanks a lot.
Here is the training code which is based on DQN with priority buffer:
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import gym_gazebo
import tensorflow as tf
import numpy as np
import time
import random
from random import *
import cv2
from gym import wrappers
from skimage import transform
import datetime
import liveplot
from dqn_agent_withTarget import DQNAgent
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
#--------------------------------------------------------------------------------------------------------------------------------------
def render():
render_skip = 0 #Skip first X episodes.
render_interval = 50 #Show render Every Y episodes.
render_episodes = 10 #Show Z episodes every rendering.
if (x%render_interval == 0) and (x != 0) and (x > render_skip):
env.render()
elif ((x-render_episodes)%render_interval == 0) and (x != 0) and (x > render_skip) and (render_episodes < x):
env.render(close=True)
#--------------------------------------------------------------------------------------------------------------------------------------
if __name__ == '__main__':
#------------------------------------------------------------------------
env = gym.make('GazeboCircuit2TurtlebotLidar-v0')
outdir = '/tmp/gazebo_gym_experiments'
env = gym.wrappers.Monitor(env, outdir, force=True)
plotter = liveplot.LivePlot(outdir)
last_time_steps = np.ndarray(0)
start_time = time.time()
total_episodes = 1000
max_steps = 200
highest_reward = 0
gamma = 0.95
num_actions = 3
action_space = [0,1,2]
tf.reset_default_graph() # Reset training graph
myinit = tf.global_variables_initializer()# Initialize training network
#tf.logging.set_verbosity(tf.logging.INFO)
tf.logging.set_verbosity(tf.logging.ERROR)
#------------------------------------------------------------------------
agent = DQNAgent(action_space,"GazeboCircuit2TurtlebotLidar-v0")
agent.exploration = 1
cv2.namedWindow("window", 1)
x_val = np.random.rand(4096,256).astype(np.float32)
agent.W_fc1.load(x_val, session=agent.sess)
for e in range(total_episodes):
# reset
linecount = 0
terminal= False
win = 0
frame = 0
loss = 0.0
Q_max = 0.0
steps = 0
reward_t= 0.0
env.reset()
cumulated_rewards = 0
agent.exploration *= 0.9
if agent.exploration<0.1:
agent.exploration=0.1
_, reward, terminal, info = env.step(0)
linecount += 1
print( "Time %s, %s" %(linecount,datetime.datetime.now()))
img_tmp = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
linecount += 1
print( "Time %s, %s" %(linecount,datetime.datetime.now()))
state_t_1 = tf.image.convert_image_dtype(img_tmp, dtype=tf.float32)
state_t_1 = tf.reshape(state_t_1,(-1,32,32,4))
while (not terminal):
steps += 1
state_t = state_t_1
# execute action in environment
action_t = agent.select_action(state_t, agent.exploration)
_, reward_t, terminal, info = env.step(action_t)
#print("step: ", steps, "action: ",action_t ,"reward: ", reward_t)
print(action_t , end="")
img_tmp = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
state_t_1 = tf.image.convert_image_dtype(img_tmp, dtype=tf.float32)
state_t_1 = tf.reshape(state_t_1,(-1,32,32,4))
# store experience
agent.store_experience(state_t, action_t, reward_t, state_t_1, terminal)
# experience replay
agent.experience_replay()
#print(agent.sess.run(agent.W_fc1))
# for log
frame += 1
loss += agent.current_loss
Q_max += np.max(agent.Q_values(state_t))
cumulated_rewards += reward_t
print(" ")
print("episodes:",e," steps:",steps," loss:",'{0:.2f}'.format(loss/(steps+1)), " terminal:",terminal, " exploration_factor:",agent.exploration , " reward:", '{0:.2f}'.format(cumulated_rewards))
plotter.plot(env)
#print("EPOCH: {:03d}/{:03d} | WIN: {:03d} | LOSS: {:.4f} | Q_MAX: {:.4f}".format(
# e, total_episodes - 1, cumulated_rewards, loss / frame, Q_max / frame))
env._flush(force=True)
# save model
weights=agent.sess.run(agent.W_fc1)
print(weights)
weights_tmp = cv2.resize(weights, (256,256), interpolation=cv2.INTER_NEAREST)
weights_image = tf.image.convert_image_dtype(weights_tmp, dtype=tf.float32)
cv2.imshow("window",agent.sess.run(weights_image))
cv2.waitKey(1)
# save model
agent.save_model()
env.close()
And here is the DQN agent code: (I think the problem is in initilizer of DQN agent code)
from collections import deque
import os
import numpy as np
import tensorflow as tf
class DQNAgent:
"""
Multi Layer Perceptron with Experience Replay
"""
def __init__(self, enable_actions, environment_name):
# parameters
self.name = os.path.splitext(os.path.basename(__file__))[0]
self.environment_name = environment_name
self.enable_actions = enable_actions
self.n_actions = len(self.enable_actions)
self.minibatch_size = 64
self.replay_memory_size = 1000
self.learning_rate = 0.001
self.discount_factor = 0.9
self.exploration = 1.0
self.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
self.model_name = "{}.ckpt".format(self.environment_name)
# replay memory
self.D = deque(maxlen=self.replay_memory_size)
# model
self.init_model()
# variables
self.current_loss = 0.0
def init_model(self):
#policy##################################################################################
# input layer (32 x 32 x 4)
self.x = tf.placeholder(tf.float32, [None, 32, 32,4])
# convolution layer
self.W_cv1 = tf.Variable(tf.truncated_normal([5, 5, 4, 4], stddev=0.01))#4filters
self.b_cv1 = tf.Variable(tf.zeros([4]))
self.c_cv1 = tf.nn.conv2d(self.x, self.W_cv1, strides=[1, 1, 1, 1], padding='SAME')
self.h_cv1 = tf.nn.relu(self.c_cv1 + self.b_cv1)
# flatten (4096)
self.x_flat = tf.reshape(self.h_cv1, [-1,4096])
# fully connected layer [1,256]
self.W_fc1 = tf.Variable(tf.truncated_normal([4096, 256], stddev=0.01))
self.b_fc1 = tf.Variable(tf.zeros([256]))
self.h_fc1 = tf.nn.relu(tf.matmul(self.x_flat, self.W_fc1) + self.b_fc1)
# fully connected layer [1,32]
self.W_fc2 = tf.Variable(tf.truncated_normal([256,32], stddev=0.01))
self.b_fc2 = tf.Variable(tf.zeros([32]))
self.h_fc2 = tf.nn.relu(tf.matmul(self.h_fc1, self.W_fc2) + self.b_fc2)
# output layer (n_actions)
self.W_out = tf.Variable(tf.truncated_normal([32, self.n_actions], stddev=0.01))
self.b_out = tf.Variable(tf.zeros([self.n_actions]))
self.y = tf.matmul(self.h_fc2, self.W_out) + self.b_out
# loss function
self.y_ = tf.placeholder(tf.float32, [None, self.n_actions])
self.loss = tf.reduce_mean(tf.square(self.y_ - self.y))
# train operation
optimizer = tf.train.AdamOptimizer(self.learning_rate) #changed from RMS to Adam
self.training = optimizer.minimize(self.loss)
#target######################################################################################
# input layer (32 x 32 x 4)
self.x_t = tf.placeholder(tf.float32, [None, 32, 32,4])
# convolution layer
self.W_cv1_t = tf.Variable(tf.truncated_normal([5, 5, 4, 4], stddev=0.01))#4filters
self.b_cv1_t = tf.Variable(tf.zeros([4]))
self.c_cv1_t = tf.nn.conv2d(self.x, self.W_cv1, strides=[1, 1, 1, 1], padding='SAME')
self.h_cv1_t = tf.nn.relu(self.c_cv1 + self.b_cv1)
# flatten (4096)
self.x_flat_t = tf.reshape(self.h_cv1, [-1,4096])
# fully connected layer [1,256]
self.W_fc1_t = tf.Variable(tf.truncated_normal([4096, 256], stddev=0.01))
self.b_fc1_t = tf.Variable(tf.zeros([256]))
self.h_fc1_t = tf.nn.relu(tf.matmul(self.x_flat, self.W_fc1) + self.b_fc1)
# fully connected layer [1,32]
self.W_fc2_t = tf.Variable(tf.truncated_normal([256,32], stddev=0.01))
self.b_fc2_t = tf.Variable(tf.zeros([32]))
self.h_fc2_t = tf.nn.relu(tf.matmul(self.h_fc1, self.W_fc2) + self.b_fc2)
# output layer (n_actions)
self.W_out_t = tf.Variable(tf.truncated_normal([32, self.n_actions], stddev=0.01))
self.b_out_t = tf.Variable(tf.zeros([self.n_actions]))
self.y_t = tf.matmul(self.h_fc2, self.W_out) + self.b_out
# loss function
self.y__t = tf.placeholder(tf.float32, [None, self.n_actions])
self.loss_t = tf.reduce_mean(tf.square(self.y_ - self.y))
# train operation
optimizer_t = tf.train.AdamOptimizer(self.learning_rate) #changed from RMS to Adam
self.training_t = optimizer.minimize(self.loss)
#general################################################################################
# saver
self.saver = tf.train.Saver()
# session
self.sess = tf.Session()
self.sess.run(tf.global_variables_initializer())
def Q_values(self, state):
# Q(state, action) of all actions
#print("QQQ VALUES______________________________________________",self.sess.run(state))
x_tmp = self.sess.run(state)
return self.sess.run(self.y, feed_dict={self.x: x_tmp})#[0]
def select_action(self, state, epsilon):
if np.random.rand() <= epsilon:
# random
return np.random.choice(self.enable_actions)
else:
# max_action Q(state, action)
#print("G" , end="")
return self.enable_actions[np.argmax(self.Q_values(state))]
def store_experience(self, state, action, reward, state_1, terminal):
self.D.append((state, action, reward, state_1, terminal))
def experience_replay(self):
state_minibatch = []
y_minibatch = []
# sample random minibatch
minibatch_size = min(len(self.D), self.minibatch_size)
minibatch_indexes = np.random.randint(0, len(self.D), minibatch_size)
for j in minibatch_indexes:
state_j, action_j, reward_j, state_j_1, terminal = self.D[j]
action_j_index = self.enable_actions.index(action_j)
y_j = self.Q_values(state_j)[0]
if terminal:
y_j[action_j_index] = reward_j
else:
# reward_j + gamma * max_action' Q(state', action')
y_j[action_j_index] = reward_j + self.discount_factor * np.max(self.Q_values(state_j_1)) # NOQA
x_tmp = self.sess.run(state_j)
y_j=np.reshape(y_j,(1,3))
state_minibatch.append(x_tmp[0])
y_minibatch.append(y_j[0])
# training
self.sess.run(self.training, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})
# for log
self.current_loss = self.sess.run(self.loss, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})
def load_model(self, model_path=None):
if model_path:
# load from model_path
self.saver.restore(self.sess, model_path)
else:
# load from checkpoint
checkpoint = tf.train.get_checkpoint_state(self.model_dir)
if checkpoint and checkpoint.model_checkpoint_path:
self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
def save_model(self):
self.saver.save(self.sess, os.path.join(self.model_dir, self.model_name))
Thanks for you help.

Categories

Resources