DQN TensorFlow code runs out of memory very quickly - python

I am trying to train a turtle bot simulation using a DQN. Turtle bot is supposed to find a target in a maze. It is fairly simple and it is converging. My problem is that after a couple of runs the training will get extremely slow. It is fast at the beginning but it gets very slow after 50ish runs. I have check the problem, my CPU is not even used 50% but my memory is eaten up and about 98% of memory is occupied. Somewhere in my code I am leaking memory and I think t is in the initialization of my DQN agent. Can you please guide me on what is the problem and how can I fix it.
Thanks a lot.
Here is the training code which is based on DQN with priority buffer:
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import gym_gazebo
import tensorflow as tf
import numpy as np
import time
import random
from random import *
import cv2
from gym import wrappers
from skimage import transform
import datetime
import liveplot
from dqn_agent_withTarget import DQNAgent
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
#--------------------------------------------------------------------------------------------------------------------------------------
def render():
render_skip = 0 #Skip first X episodes.
render_interval = 50 #Show render Every Y episodes.
render_episodes = 10 #Show Z episodes every rendering.
if (x%render_interval == 0) and (x != 0) and (x > render_skip):
env.render()
elif ((x-render_episodes)%render_interval == 0) and (x != 0) and (x > render_skip) and (render_episodes < x):
env.render(close=True)
#--------------------------------------------------------------------------------------------------------------------------------------
if __name__ == '__main__':
#------------------------------------------------------------------------
env = gym.make('GazeboCircuit2TurtlebotLidar-v0')
outdir = '/tmp/gazebo_gym_experiments'
env = gym.wrappers.Monitor(env, outdir, force=True)
plotter = liveplot.LivePlot(outdir)
last_time_steps = np.ndarray(0)
start_time = time.time()
total_episodes = 1000
max_steps = 200
highest_reward = 0
gamma = 0.95
num_actions = 3
action_space = [0,1,2]
tf.reset_default_graph() # Reset training graph
myinit = tf.global_variables_initializer()# Initialize training network
#tf.logging.set_verbosity(tf.logging.INFO)
tf.logging.set_verbosity(tf.logging.ERROR)
#------------------------------------------------------------------------
agent = DQNAgent(action_space,"GazeboCircuit2TurtlebotLidar-v0")
agent.exploration = 1
cv2.namedWindow("window", 1)
x_val = np.random.rand(4096,256).astype(np.float32)
agent.W_fc1.load(x_val, session=agent.sess)
for e in range(total_episodes):
# reset
linecount = 0
terminal= False
win = 0
frame = 0
loss = 0.0
Q_max = 0.0
steps = 0
reward_t= 0.0
env.reset()
cumulated_rewards = 0
agent.exploration *= 0.9
if agent.exploration<0.1:
agent.exploration=0.1
_, reward, terminal, info = env.step(0)
linecount += 1
print( "Time %s, %s" %(linecount,datetime.datetime.now()))
img_tmp = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
linecount += 1
print( "Time %s, %s" %(linecount,datetime.datetime.now()))
state_t_1 = tf.image.convert_image_dtype(img_tmp, dtype=tf.float32)
state_t_1 = tf.reshape(state_t_1,(-1,32,32,4))
while (not terminal):
steps += 1
state_t = state_t_1
# execute action in environment
action_t = agent.select_action(state_t, agent.exploration)
_, reward_t, terminal, info = env.step(action_t)
#print("step: ", steps, "action: ",action_t ,"reward: ", reward_t)
print(action_t , end="")
img_tmp = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
state_t_1 = tf.image.convert_image_dtype(img_tmp, dtype=tf.float32)
state_t_1 = tf.reshape(state_t_1,(-1,32,32,4))
# store experience
agent.store_experience(state_t, action_t, reward_t, state_t_1, terminal)
# experience replay
agent.experience_replay()
#print(agent.sess.run(agent.W_fc1))
# for log
frame += 1
loss += agent.current_loss
Q_max += np.max(agent.Q_values(state_t))
cumulated_rewards += reward_t
print(" ")
print("episodes:",e," steps:",steps," loss:",'{0:.2f}'.format(loss/(steps+1)), " terminal:",terminal, " exploration_factor:",agent.exploration , " reward:", '{0:.2f}'.format(cumulated_rewards))
plotter.plot(env)
#print("EPOCH: {:03d}/{:03d} | WIN: {:03d} | LOSS: {:.4f} | Q_MAX: {:.4f}".format(
# e, total_episodes - 1, cumulated_rewards, loss / frame, Q_max / frame))
env._flush(force=True)
# save model
weights=agent.sess.run(agent.W_fc1)
print(weights)
weights_tmp = cv2.resize(weights, (256,256), interpolation=cv2.INTER_NEAREST)
weights_image = tf.image.convert_image_dtype(weights_tmp, dtype=tf.float32)
cv2.imshow("window",agent.sess.run(weights_image))
cv2.waitKey(1)
# save model
agent.save_model()
env.close()
And here is the DQN agent code: (I think the problem is in initilizer of DQN agent code)
from collections import deque
import os
import numpy as np
import tensorflow as tf
class DQNAgent:
"""
Multi Layer Perceptron with Experience Replay
"""
def __init__(self, enable_actions, environment_name):
# parameters
self.name = os.path.splitext(os.path.basename(__file__))[0]
self.environment_name = environment_name
self.enable_actions = enable_actions
self.n_actions = len(self.enable_actions)
self.minibatch_size = 64
self.replay_memory_size = 1000
self.learning_rate = 0.001
self.discount_factor = 0.9
self.exploration = 1.0
self.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
self.model_name = "{}.ckpt".format(self.environment_name)
# replay memory
self.D = deque(maxlen=self.replay_memory_size)
# model
self.init_model()
# variables
self.current_loss = 0.0
def init_model(self):
#policy##################################################################################
# input layer (32 x 32 x 4)
self.x = tf.placeholder(tf.float32, [None, 32, 32,4])
# convolution layer
self.W_cv1 = tf.Variable(tf.truncated_normal([5, 5, 4, 4], stddev=0.01))#4filters
self.b_cv1 = tf.Variable(tf.zeros([4]))
self.c_cv1 = tf.nn.conv2d(self.x, self.W_cv1, strides=[1, 1, 1, 1], padding='SAME')
self.h_cv1 = tf.nn.relu(self.c_cv1 + self.b_cv1)
# flatten (4096)
self.x_flat = tf.reshape(self.h_cv1, [-1,4096])
# fully connected layer [1,256]
self.W_fc1 = tf.Variable(tf.truncated_normal([4096, 256], stddev=0.01))
self.b_fc1 = tf.Variable(tf.zeros([256]))
self.h_fc1 = tf.nn.relu(tf.matmul(self.x_flat, self.W_fc1) + self.b_fc1)
# fully connected layer [1,32]
self.W_fc2 = tf.Variable(tf.truncated_normal([256,32], stddev=0.01))
self.b_fc2 = tf.Variable(tf.zeros([32]))
self.h_fc2 = tf.nn.relu(tf.matmul(self.h_fc1, self.W_fc2) + self.b_fc2)
# output layer (n_actions)
self.W_out = tf.Variable(tf.truncated_normal([32, self.n_actions], stddev=0.01))
self.b_out = tf.Variable(tf.zeros([self.n_actions]))
self.y = tf.matmul(self.h_fc2, self.W_out) + self.b_out
# loss function
self.y_ = tf.placeholder(tf.float32, [None, self.n_actions])
self.loss = tf.reduce_mean(tf.square(self.y_ - self.y))
# train operation
optimizer = tf.train.AdamOptimizer(self.learning_rate) #changed from RMS to Adam
self.training = optimizer.minimize(self.loss)
#target######################################################################################
# input layer (32 x 32 x 4)
self.x_t = tf.placeholder(tf.float32, [None, 32, 32,4])
# convolution layer
self.W_cv1_t = tf.Variable(tf.truncated_normal([5, 5, 4, 4], stddev=0.01))#4filters
self.b_cv1_t = tf.Variable(tf.zeros([4]))
self.c_cv1_t = tf.nn.conv2d(self.x, self.W_cv1, strides=[1, 1, 1, 1], padding='SAME')
self.h_cv1_t = tf.nn.relu(self.c_cv1 + self.b_cv1)
# flatten (4096)
self.x_flat_t = tf.reshape(self.h_cv1, [-1,4096])
# fully connected layer [1,256]
self.W_fc1_t = tf.Variable(tf.truncated_normal([4096, 256], stddev=0.01))
self.b_fc1_t = tf.Variable(tf.zeros([256]))
self.h_fc1_t = tf.nn.relu(tf.matmul(self.x_flat, self.W_fc1) + self.b_fc1)
# fully connected layer [1,32]
self.W_fc2_t = tf.Variable(tf.truncated_normal([256,32], stddev=0.01))
self.b_fc2_t = tf.Variable(tf.zeros([32]))
self.h_fc2_t = tf.nn.relu(tf.matmul(self.h_fc1, self.W_fc2) + self.b_fc2)
# output layer (n_actions)
self.W_out_t = tf.Variable(tf.truncated_normal([32, self.n_actions], stddev=0.01))
self.b_out_t = tf.Variable(tf.zeros([self.n_actions]))
self.y_t = tf.matmul(self.h_fc2, self.W_out) + self.b_out
# loss function
self.y__t = tf.placeholder(tf.float32, [None, self.n_actions])
self.loss_t = tf.reduce_mean(tf.square(self.y_ - self.y))
# train operation
optimizer_t = tf.train.AdamOptimizer(self.learning_rate) #changed from RMS to Adam
self.training_t = optimizer.minimize(self.loss)
#general################################################################################
# saver
self.saver = tf.train.Saver()
# session
self.sess = tf.Session()
self.sess.run(tf.global_variables_initializer())
def Q_values(self, state):
# Q(state, action) of all actions
#print("QQQ VALUES______________________________________________",self.sess.run(state))
x_tmp = self.sess.run(state)
return self.sess.run(self.y, feed_dict={self.x: x_tmp})#[0]
def select_action(self, state, epsilon):
if np.random.rand() <= epsilon:
# random
return np.random.choice(self.enable_actions)
else:
# max_action Q(state, action)
#print("G" , end="")
return self.enable_actions[np.argmax(self.Q_values(state))]
def store_experience(self, state, action, reward, state_1, terminal):
self.D.append((state, action, reward, state_1, terminal))
def experience_replay(self):
state_minibatch = []
y_minibatch = []
# sample random minibatch
minibatch_size = min(len(self.D), self.minibatch_size)
minibatch_indexes = np.random.randint(0, len(self.D), minibatch_size)
for j in minibatch_indexes:
state_j, action_j, reward_j, state_j_1, terminal = self.D[j]
action_j_index = self.enable_actions.index(action_j)
y_j = self.Q_values(state_j)[0]
if terminal:
y_j[action_j_index] = reward_j
else:
# reward_j + gamma * max_action' Q(state', action')
y_j[action_j_index] = reward_j + self.discount_factor * np.max(self.Q_values(state_j_1)) # NOQA
x_tmp = self.sess.run(state_j)
y_j=np.reshape(y_j,(1,3))
state_minibatch.append(x_tmp[0])
y_minibatch.append(y_j[0])
# training
self.sess.run(self.training, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})
# for log
self.current_loss = self.sess.run(self.loss, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})
def load_model(self, model_path=None):
if model_path:
# load from model_path
self.saver.restore(self.sess, model_path)
else:
# load from checkpoint
checkpoint = tf.train.get_checkpoint_state(self.model_dir)
if checkpoint and checkpoint.model_checkpoint_path:
self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
def save_model(self):
self.saver.save(self.sess, os.path.join(self.model_dir, self.model_name))
Thanks for you help.

Related

NameError: name 'self' is not defined when running code in Python 3.7 for Carla Vehicle Simulator

I've tried to run this code on Carla's command prompt window but it generates the error NameError: name self is not defined here:
self.model.compile(optimizer='adam', loss=self.conditional_loss,
metrics=[self.conditional_loss,
keras.metrics.sparse_categorical_accuracy],
run_eagerly=True)
But if i remove this then it would generate a warning saying that i should either run this in eagerly mode or graph mode with Keras's model.fit embedded into the code to continue to train the model. Any help would be fine, thanks!
Full Code for reference:
import glob
import os
import sys
import random
import time
import numpy as np
import cv2
import math
from collections import deque
from keras.applications.xception import Xception
from keras.layers import Dense, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.models import Model
from keras.callbacks import TensorBoard
import tensorflow as tf
import keras.backend.tensorflow_backend as backend
import tensorflow.compat.v1 as tf
from threading import Thread
from tqdm import tqdm
try:
sys.path.append(glob.glob('../carla/dist/carla-*%d.%d-%s.egg' % (
sys.version_info.major,
sys.version_info.minor,
'win-amd64' if os.name == 'nt' else 'linux-x86_64'))[0])
except IndexError:
pass
import carla
SHOW_PREVIEW = False
IM_WIDTH = 640
IM_HEIGHT = 480
SECONDS_PER_EPISODE = 10
REPLAY_MEMORY_SIZE = 5_000
MIN_REPLAY_MEMORY_SIZE = 1_000
MINIBATCH_SIZE = 16
PREDICTION_BATCH_SIZE = 1
TRAINING_BATCH_SIZE = MINIBATCH_SIZE // 4
UPDATE_TARGET_EVERY = 5
MODEL_NAME = "Xception"
MEMORY_FRACTION = 0.4
MIN_REWARD = -200
EPISODES = 100
DISCOUNT = 0.99
epsilon = 1
EPSILON_DECAY = 0.95 ## 0.9975 99975
MIN_EPSILON = 0.001
AGGREGATE_STATS_EVERY = 10
# Own Tensorboard class
class ModifiedTensorBoard(TensorBoard):
# Overriding init to set initial step and writer (we want one log file for all .fit() calls)
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.step = 1
# Overriding this method to stop creating default log writer
def set_model(self, model):
pass
# Overrided, saves logs with our step number
# (otherwise every .fit() will start writing from 0th step)
def on_epoch_end(self, epoch, logs=None):
self.update_stats(**logs)
# Overrided
# We train for one batch only, no need to save anything at epoch end
def on_batch_end(self, batch, logs=None):
pass
# Overrided, so won't close writer
def on_train_end(self, _):
pass
# Custom method for saving own metrics
# Creates writer, writes custom metrics and closes writer
def update_stats(self, **stats):
self._write_logs(stats, self.step)
def _write_logs(self, logs, index):
with self.writer.as_default():
for name, value in logs.items():
tf.summary.scalar(name, value, step=index)
self.step += 1
self.writer.flush()
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
model = tf.keras.models.Sequential([
tf.keras.layers.Flatten(input_shape=(28, 28)),
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(10)
])
predictions = model(x_train[:1]).numpy()
predictions
tf.nn.softmax(predictions).numpy()
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss_fn(y_train[:1], predictions).numpy()
model.compile(optimizer='adam',
loss=loss_fn,
metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5)
self.model.compile(optimizer='adam', loss=self.conditional_loss,
metrics=[self.conditional_loss,
keras.metrics.sparse_categorical_accuracy],
run_eagerly=True
)
class CarEnv:
SHOW_CAM = SHOW_PREVIEW
STEER_AMT = 1.0
im_width = IM_WIDTH
im_height = IM_HEIGHT
front_camera = None
def __init__(self):
self.client = carla.Client("localhost", 2000)
self.client.set_timeout(2.0)
self.world = self.client.get_world()
self.blueprint_library = self.world.get_blueprint_library()
self.model_3 = self.blueprint_library.filter("model3")[0]
def reset(self):
self.collision_hist = []
self.actor_list = []
self.transform = random.choice(self.world.get_map().get_spawn_points())
self.vehicle = self.world.spawn_actor(self.model_3, self.transform)
self.actor_list.append(self.vehicle)
self.rgb_cam = self.blueprint_library.find('sensor.camera.rgb')
self.rgb_cam.set_attribute("image_size_x", f"{self.im_width}")
self.rgb_cam.set_attribute("image_size_y", f"{self.im_height}")
self.rgb_cam.set_attribute("fov", f"110")
transform = carla.Transform(carla.Location(x=2.5, z=0.7))
self.sensor = self.world.spawn_actor(self.rgb_cam, transform, attach_to=self.vehicle)
self.actor_list.append(self.sensor)
self.sensor.listen(lambda data: self.process_img(data))
self.vehicle.apply_control(carla.VehicleControl(throttle=0.0, brake=0.0))
time.sleep(4)
colsensor = self.blueprint_library.find("sensor.other.collision")
self.colsensor = self.world.spawn_actor(colsensor, transform, attach_to=self.vehicle)
self.actor_list.append(self.colsensor)
self.colsensor.listen(lambda event: self.collision_data(event))
while self.front_camera is None:
time.sleep(0.01)
self.episode_start = time.time()
self.vehicle.apply_control(carla.VehicleControl(throttle=0.0, brake=0.0))
return self.front_camera
def collision_data(self, event):
self.collision_hist.append(event)
def process_img(self, image):
i = np.array(image.raw_data)
#print(i.shape)
i2 = i.reshape((self.im_height, self.im_width, 4))
i3 = i2[:, :, :3]
if self.SHOW_CAM:
cv2.imshow("", i3)
cv2.waitKey(1)
self.front_camera = i3
def step(self, action):
if action == 0:
self.vehicle.apply_control(carla.VehicleControl(throttle=1.0, steer=-1*self.STEER_AMT))
elif action == 1:
self.vehicle.apply_control(carla.VehicleControl(throttle=1.0, steer= 0))
elif action == 2:
self.vehicle.apply_control(carla.VehicleControl(throttle=1.0, steer=1*self.STEER_AMT))
v = self.vehicle.get_velocity()
kmh = int(3.6 * math.sqrt(v.x**2 + v.y**2 + v.z**2))
if len(self.collision_hist) != 0:
done = True
reward = -200
elif kmh < 50:
done = False
reward = -1
else:
done = False
reward = 1
if self.episode_start + SECONDS_PER_EPISODE < time.time():
done = True
return self.front_camera, reward, done, None
class DQNAgent:
def __init__(self):
self.model = self.create_model()
self.target_model = self.create_model()
self.target_model.set_weights(self.model.get_weights())
self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
self.tensorboard = ModifiedTensorBoard(log_dir=f"logs/{MODEL_NAME}-{int(time.time())}")
self.target_update_counter = 0
self.graph = tf.get_default_graph()
self.terminate = False
self.last_logged_episode = 0
self.training_initialized = False
def create_model(self):
base_model = Xception(weights=None, include_top=False, input_shape=(IM_HEIGHT, IM_WIDTH,3))
x = base_model.output
x = GlobalAveragePooling2D()(x)
predictions = Dense(3, activation="linear")(x)
model = Model(inputs=base_model.input, outputs=predictions)
model.compile(loss="mse", optimizer=Adam(lr=0.001), metrics=["accuracy"])
return model
def update_replay_memory(self, transition):
# transition = (current_state, action, reward, new_state, done)
self.replay_memory.append(transition)
def train(self):
if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
return
minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)
current_states = np.array([transition[0] for transition in minibatch])/255
with self.graph.as_default():
current_qs_list = self.model.predict(current_states, PREDICTION_BATCH_SIZE)
new_current_states = np.array([transition[3] for transition in minibatch])/255
with self.graph.as_default():
future_qs_list = self.target_model.predict(new_current_states, PREDICTION_BATCH_SIZE)
X = []
y = []
for index, (current_state, action, reward, new_state, done) in enumerate(minibatch):
if not done:
max_future_q = np.max(future_qs_list[index])
new_q = reward + DISCOUNT * max_future_q
else:
new_q = reward
current_qs = current_qs_list[index]
current_qs[action] = new_q
X.append(current_state)
y.append(current_qs)
log_this_step = False
if self.tensorboard.step > self.last_logged_episode:
log_this_step = True
self.last_log_episode = self.tensorboard.step
with self.graph.as_default():
self.model.fit(np.array(X)/255, np.array(y), batch_size=TRAINING_BATCH_SIZE, verbose=0, shuffle=False, callbacks=[self.tensorboard] if log_this_step else None)
if log_this_step:
self.target_update_counter += 1
if self.target_update_counter > UPDATE_TARGET_EVERY:
self.target_model.set_weights(self.model.get_weights())
self.target_update_counter = 0
def get_qs(self, state):
return self.model.predict(np.array(state).reshape(-1, *state.shape)/255)[0]
def train_in_loop(self):
X = np.random.uniform(size=(1, IM_HEIGHT, IM_WIDTH, 3)).astype(np.float32)
y = np.random.uniform(size=(1, 3)).astype(np.float32)
with self.graph.as_default():
self.model.fit(X,y, verbose=False, batch_size=1)
self.training_initialized = True
while True:
if self.terminate:
return
self.train()
time.sleep(0.01)
if __name__ == '__main__':
FPS = 60
# For stats
ep_rewards = [-200]
# For more repetitive results
random.seed(1)
np.random.seed(1)
tf.set_random_seed(1)
# Memory fraction, used mostly when training multiple agents
#backend.set_session(tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)))
#tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction)
# Create models folder
if not os.path.isdir('models'):
os.makedirs('models')
# Create agent and environment
agent = DQNAgent()
env = CarEnv()
# Start training thread and wait for training to be initialized
trainer_thread = Thread(target=agent.train_in_loop, daemon=True)
trainer_thread.start()
while not agent.training_initialized:
time.sleep(0.01)
# Initialize predictions - forst prediction takes longer as of initialization that has to be done
# It's better to do a first prediction then before we start iterating over episode steps
agent.get_qs(np.ones((env.im_height, env.im_width, 3)))
# Iterate over episodes
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
#try:
env.collision_hist = []
# Update tensorboard step every episode
agent.tensorboard.step = episode
# Restarting episode - reset episode reward and step number
episode_reward = 0
step = 1
# Reset environment and get initial state
current_state = env.reset()
# Reset flag and start iterating until episode ends
done = False
episode_start = time.time()
# Play for given number of seconds only
while True:
# This part stays mostly the same, the change is to query a model for Q values
if np.random.random() > epsilon:
# Get action from Q table
action = np.argmax(agent.get_qs(current_state))
else:
# Get random action
action = np.random.randint(0, 3)
# This takes no time, so we add a delay matching 60 FPS (prediction above takes longer)
time.sleep(1/FPS)
new_state, reward, done, _ = env.step(action)
# Transform new continous state to new discrete state and count reward
episode_reward += reward
# Every step we update replay memory
agent.update_replay_memory((current_state, action, reward, new_state, done))
current_state = new_state
step += 1
if done:
break
# End of episode - destroy agents
for actor in env.actor_list:
actor.destroy()
# Append episode reward to a list and log stats (every given number of episodes)
ep_rewards.append(episode_reward)
if not episode % AGGREGATE_STATS_EVERY or episode == 1:
average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:])
min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)
# Save model, but only when min reward is greater or equal a set value
if min_reward >= MIN_REWARD:
agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')
# Decay epsilon
if epsilon > MIN_EPSILON:
epsilon *= EPSILON_DECAY
epsilon = max(MIN_EPSILON, epsilon)
# Set termination flag for training thread and wait for it to finish
agent.terminate = True
trainer_thread.join()
agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

How can I save DDPG model?

I try to save the model using the saver method (I use the save function in the DDPG class to save), but when restoring the model, the result is far from the one I saved (I save the model when the episodic award is zero, the restor method in the code is commented out ) My code is below with all the features. I use Python 3.7, gym 0.16.0 and TensorFlow version 1.13.1
import tensorflow as tf
import numpy as np
import gym
epsiode_steps = 500
# learning rate for actor
lr_a = 0.001
# learning rate for critic
lr_c = 0.002
gamma = 0.9
alpha = 0.01
memory = 10000
batch_size = 32
render = True
class DDPG(object):
def __init__(self, no_of_actions, no_of_states, a_bound, ):
self.memory = np.zeros((memory, no_of_states * 2 + no_of_actions + 1), dtype=np.float32)
# initialize pointer to point to our experience buffer
self.pointer = 0
self.sess = tf.Session()
# initialize the variance for OU process for exploring policies
self.noise_variance = 3.0
self.no_of_actions, self.no_of_states, self.a_bound = no_of_actions, no_of_states, a_bound,
self.state = tf.placeholder(tf.float32, [None, no_of_states], 's')
self.next_state = tf.placeholder(tf.float32, [None, no_of_states], 's_')
self.reward = tf.placeholder(tf.float32, [None, 1], 'r')
with tf.variable_scope('Actor'):
self.a = self.build_actor_network(self.state, scope='eval', trainable=True)
a_ = self.build_actor_network(self.next_state, scope='target', trainable=False)
with tf.variable_scope('Critic'):
q = self.build_crtic_network(self.state, self.a, scope='eval', trainable=True)
q_ = self.build_crtic_network(self.next_state, a_, scope='target', trainable=False)
self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')
# update target value
self.soft_replace = [
[tf.assign(at, (1 - alpha) * at + alpha * ae), tf.assign(ct, (1 - alpha) * ct + alpha * ce)]
for at, ae, ct, ce in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]
q_target = self.reward + gamma * q_
# compute TD error i.e actual - predicted values
td_error = tf.losses.mean_squared_error(labels=(self.reward + gamma * q_), predictions=q)
# train the critic network with adam optimizer
self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, name="adam-ink", var_list=self.ce_params)
a_loss = - tf.reduce_mean(q)
# train the actor network with adam optimizer for minimizing the loss
self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)
tf.summary.FileWriter("logs2", self.sess.graph)
# initialize all variables
self.sess.run(tf.global_variables_initializer())
# saver
self.saver = tf.train.Saver()
# self.saver.restore(self.sess, "Pendulum/nn.ckpt")
def choose_action(self, s):
a = self.sess.run(self.a, {self.state: s[np.newaxis, :]})[0]
a = np.clip(np.random.normal(a, self.noise_variance), -2, 2)
return a
def learn(self):
# soft target replacement
self.sess.run(self.soft_replace)
indices = np.random.choice(memory, size=batch_size)
batch_transition = self.memory[indices, :]
batch_states = batch_transition[:, :self.no_of_states]
batch_actions = batch_transition[:, self.no_of_states: self.no_of_states + self.no_of_actions]
batch_rewards = batch_transition[:, -self.no_of_states - 1: -self.no_of_states]
batch_next_state = batch_transition[:, -self.no_of_states:]
self.sess.run(self.atrain, {self.state: batch_states})
self.sess.run(self.ctrain, {self.state: batch_states, self.a: batch_actions, self.reward: batch_rewards,
self.next_state: batch_next_state})
def store_transition(self, s, a, r, s_):
trans = np.hstack((s, a, [r], s_))
index = self.pointer % memory
self.memory[index, :] = trans
self.pointer += 1
if self.pointer > memory:
self.noise_variance *= 0.99995
self.learn()
def build_actor_network(self, s, scope, trainable):
# Actor DPG
with tf.variable_scope(scope):
l1 = tf.layers.dense(s, 30, activation=tf.nn.tanh, name='l1', trainable=trainable)
a = tf.layers.dense(l1, self.no_of_actions, activation=tf.nn.tanh, name='a', trainable=trainable)
return tf.multiply(a, self.a_bound, name="scaled_a")
def build_crtic_network(self, s, a, scope, trainable):
with tf.variable_scope(scope):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.no_of_states, n_l1], trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.no_of_actions, n_l1], trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
net = tf.nn.tanh(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
q = tf.layers.dense(net, 1, trainable=trainable)
return q
def save(self):
self.saver.save(self.sess, "Pendulum/nn.ckpt")
env = gym.make("Pendulum-v0")
env = env.unwrapped
env.seed(1)
no_of_states = env.observation_space.shape[0]
no_of_actions = env.action_space.shape[0]
a_bound = env.action_space.high
ddpg = DDPG(no_of_actions, no_of_states, a_bound)
total_reward = []
# set the number of episodes
no_of_episodes = 300
for i in range(no_of_episodes):
# initialize the environment
s = env.reset()
ep_reward = 0
for j in range(epsiode_steps):
env.render()
# select action by adding noise through OU process
a = ddpg.choose_action(s)
# peform the action and move to the next state s
s_, r, done, info = env.step(a)
# store the the transition to our experience buffer
# sample some minibatch of experience and train the network
ddpg.store_transition(s, a, r, s_)
# update current state as next state
s = s_
# add episodic rewards
ep_reward += r
if int(ep_reward) == 0 and i > 150:
ddpg.save()
print("save")
quit()
if j == epsiode_steps - 1:
total_reward.append(ep_reward)
print('Episode:', i, ' Reward: %i' % int(ep_reward))
break
I solved this problem completely by rewriting the code and adding the learning function in a separate session

IndexError: index 82459 is out of bounds for axis 0 with size 82459

I am trying to run code (found here) for a visual question generation model. I am running the code using Windows Subsystem for Linux, in an Anaconda virtual environment for Python 2.7. I am using Tensorflow v1.3.0, as I experienced issues using more recent versions of Tensorflow -- the repository is relatively old.
I am receiving the following error (full traceback included):
Traceback (most recent call last):
File "main.py", line 70, in <module>
tf.app.run()
File "/home/username/anaconda2/envs/py27/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "main.py", line 64, in main
model.train()
File "/home/username/VQG-tensorflow/question_generator.py", line 124, in train
feats = self.img_feature[img_list,:]
IndexError: index 82459 is out of bounds for axis 0 with size 82459
I've included the source code for main.py and question_generator.py below. Obviously, the program is trying to access an index that doesn't exist. I can't figure out what would make it behave this way. Similar questions to this one (like this and this) were not helpful. I tried padding the array using the numpy.pad method, but that only led to a different and related error:
ValueError: Cannot feed value of shape (256, 4097) for Tensor u'Placeholder:0', which has shape '(256, 4096)'
Any and all help is greatly appreciated!
Source code for main.py:
#-*- coding: utf-8 -*-
import math
import os
import tensorflow as tf
import numpy as np
import cPickle
import skimage
import pprint
import tensorflow.python.platform
from keras.preprocessing import sequence
from data_loader import *
import vgg19
import question_generator
flags = tf.app.flags
pp = pprint.PrettyPrinter().pprint
tf.app.flags.DEFINE_string('input_img_h5', './data_img.h5', 'path to the h5file containing the image feature')
tf.app.flags.DEFINE_string('input_ques_h5', './data_prepro.h5', 'path to the h5file containing the preprocessed dataset')
tf.app.flags.DEFINE_string('input_json', './data_prepro.json', 'path to the json file containing additional info and vocab')
tf.app.flags.DEFINE_string('model_path', './models/', 'where should we save')
tf.app.flags.DEFINE_string('vgg_path', './vgg16.tfmodel', 'momentum for adam')
tf.app.flags.DEFINE_string('gpu_fraction', '2/3', 'define the gpu fraction used')
tf.app.flags.DEFINE_string('test_image_path', './assets/demo.jpg', 'the image you want to generate question')
tf.app.flags.DEFINE_string('test_model_path', './models/model-250', 'model we saved')
tf.app.flags.DEFINE_integer('batch_size', 256, 'tch_size for each iterations')
tf.app.flags.DEFINE_integer('dim_embed', 512, 'word embedding size')
tf.app.flags.DEFINE_integer('dim_hidden', 512, 'hidden size')
tf.app.flags.DEFINE_integer('dim_image', 4096, 'dimension of output from fc7')
tf.app.flags.DEFINE_integer('img_norm', 1, 'do normalization on image or not')
tf.app.flags.DEFINE_integer('maxlen', 26, 'max length of question')
tf.app.flags.DEFINE_integer('n_epochs', 250, 'how many epochs are we going to train')
tf.app.flags.DEFINE_float('learning_rate', '0.001', 'learning rate for adam')
tf.app.flags.DEFINE_float('momentum', 0.9, 'momentum for adam')
tf.app.flags.DEFINE_boolean('is_train', 'True', 'momentum for adam')
conf = flags.FLAGS
def calc_gpu_fraction(fraction_string):
idx, num = fraction_string.split('/')
idx, num = float(idx), float(num)
fraction = 1 / (num - idx + 1)
print " [*] GPU : %.4f" % fraction
return fraction
def main(_):
attrs = conf.__dict__['__flags']
pp(attrs)
dataset, img_feature, train_data = get_data(conf.input_json, conf.input_img_h5, conf.input_ques_h5, conf.img_norm)
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=calc_gpu_fraction(conf.gpu_fraction))
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
model = question_generator.Question_Generator(sess, conf, dataset, img_feature, train_data)
if conf.is_train:
model.build_model()
model.train()
else:
model.build_generator()
model.test(test_image_path=conf.test_image_path, model_path=conf.test_model_path, maxlen=26)
if __name__ == '__main__':
tf.app.run()
Source code for question_generation.py:
import os
import tensorflow as tf
import numpy as np
import tensorflow.python.platform
from keras.preprocessing import sequence
from data_loader import *
import vgg19
tf.pack = tf.stack
tf.select = tf.where
tf.batch_matmul = tf.matmul
class Question_Generator():
def __init__(self, sess, conf, dataset, img_feature, train_data):
self.sess = sess
self.dataset = dataset
self.img_feature = img_feature
self.train_data = train_data
self.dim_image = conf.dim_image
self.dim_embed = conf.dim_embed
self.dim_hidden = conf.dim_hidden
self.batch_size = conf.batch_size
self.maxlen = conf.maxlen
self.n_lstm_steps = conf.maxlen+2
self.model_path = conf.model_path
if conf.is_train:
self.n_epochs = conf.n_epochs
self.learning_rate = conf.learning_rate
self.num_train = train_data['question'].shape[0] # total number of data
self.n_words = len(dataset['ix_to_word'].keys()) # vocabulary_size
# word embedding
self.Wemb = tf.Variable(tf.random_uniform([self.n_words, self.dim_embed], -0.1, 0.1), name='Wemb')
self.bemb = tf.Variable(tf.random_uniform([self.dim_embed], -0.1, 0.1), name='bemb')
# LSTM
self.lstm = tf.contrib.rnn.BasicLSTMCell(self.dim_hidden)
#self.lstm = tf.nn.rnn_cell.BasicLSTMCell(self.dim_hidden)
# fc7 encoder
self.encode_img_W = tf.Variable(tf.random_uniform([self.dim_image, self.dim_hidden], -0.1, 0.1), name='encode_img_W')
self.encode_img_b = tf.Variable(tf.random_uniform([self.dim_hidden], -0.1, 0.1), name='encode_img_b')
# feat -> word
self.embed_word_W = tf.Variable(tf.random_uniform([self.dim_hidden, self.n_words], -0.1, 0.1), name='embed_word_W')
self.embed_word_b = tf.Variable(tf.random_uniform([self.n_words], -0.1, 0.1), name='embed_word_b')
def build_model(self):
self.image = tf.placeholder(tf.float32, [self.batch_size, self.dim_image])
self.question = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])
self.mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])
image_emb = tf.nn.xw_plus_b(self.image, self.encode_img_W, self.encode_img_b) # (batch_size, dim_hidden)
state = self.lstm.zero_state(self.batch_size,tf.float32)
loss = 0.0
with tf.variable_scope("RNN"):
for i in range(self.n_lstm_steps):
if i == 0:
current_emb = image_emb
else:
tf.get_variable_scope().reuse_variables()
current_emb = tf.nn.embedding_lookup(self.Wemb, self.question[:,i-1]) + self.bemb
# LSTM
output, state = self.lstm(current_emb, state)
if i > 0:
# ground truth
labels = tf.expand_dims(self.question[:, i], 1)
indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
concated = tf.concat([indices, labels], 1)
#concated = tf.concat(1, [indices, labels])
onehot_labels = tf.sparse_to_dense(
concated, tf.pack([self.batch_size, self.n_words]), 1.0, 0.0)
# predict word
logit_words = tf.nn.xw_plus_b(output, self.embed_word_W, self.embed_word_b)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit_words, labels=onehot_labels)
cross_entropy = cross_entropy * self.mask[:,i]
current_loss = tf.reduce_sum(cross_entropy)
loss = loss + current_loss
self.loss = loss / tf.reduce_sum(self.mask[:,1:])
def build_generator(self):
self.image = tf.placeholder(tf.float32, [1, self.dim_image]) # only one image
image_emb = tf.nn.xw_plus_b(self.image, self.encode_img_W, self.encode_img_b)
state = tf.zeros([1, self.lstm.state_size])
self.generated_words = []
with tf.variable_scope("RNN"):
output, state = self.lstm(image_emb, state)
last_word = tf.nn.embedding_lookup(self.Wemb, [0]) + self.bemb
for i in range(self.maxlen):
tf.get_variable_scope().reuse_variables()
output, state = self.lstm(last_word, state)
logit_words = tf.nn.xw_plus_b(output, self.embed_word_W, self.embed_word_b)
max_prob_word = tf.argmax(logit_words, 1)
last_word = tf.nn.embedding_lookup(self.Wemb, max_prob_word)
last_word += self.bemb
self.generated_words.append(max_prob_word)
def train(self):
index = np.arange(self.num_train)
np.random.shuffle(index)
questions = self.train_data['question'][index,:]
img_list = self.train_data['img_list'][index]
print("img feature length: " + str(len(self.img_feature)))
print("img list: " + str(img_list))
#self.img_feature = np.pad(self.img_feature, (0,1),'constant', constant_values=(0,0)) #pad array to prevent bug
print("img feature length: " + str(len(self.img_feature)))
feats = self.img_feature[img_list,:]
self.saver = tf.train.Saver(max_to_keep=50)
train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
tf.initialize_all_variables().run()
for epoch in range(self.n_epochs):
counter = 0
for start, end in zip( \
range(0, len(feats), self.batch_size),
range(self.batch_size, len(feats), self.batch_size)
):
current_feats = feats[start:end]
current_questions = questions[start:end]
current_question_matrix = sequence.pad_sequences(current_questions, padding='post', maxlen=self.maxlen+1)
current_question_matrix = np.hstack( [np.full( (len(current_question_matrix),1), 0), current_question_matrix] ).astype(int)
current_mask_matrix = np.zeros((current_question_matrix.shape[0], current_question_matrix.shape[1]))
nonzeros = np.array( map(lambda x: (x != 0).sum()+2, current_question_matrix ))
# +2 -> #START# and '.'
for ind, row in enumerate(current_mask_matrix):
row[:nonzeros[ind]] = 1
_, loss_value = self.sess.run([train_op, self.loss], feed_dict={
self.image: current_feats,
self.question : current_question_matrix,
self.mask : current_mask_matrix
})
if np.mod(counter, 100) == 0:
print "Epoch: ", epoch, " batch: ", counter ," Current Cost: ", loss_value
counter = counter + 1
if np.mod(epoch, 25) == 0:
print "Epoch ", epoch, " is done. Saving the model ... "
self.save_model(epoch)
def test(self, test_image_path, model_path, maxlen):
ixtoword = self.dataset['ix_to_word']
images = tf.placeholder("float32", [1, 224, 224, 3])
image_val = read_image(test_image_path)
vgg = vgg19.Vgg19()
with tf.name_scope("content_vgg"):
vgg.build(images)
fc7 = self.sess.run(vgg.relu7, feed_dict={images:image_val})
saver = tf.train.Saver()
saver.restore(self.sess, model_path)
generated_word_index = self.sess.run(self.generated_words, feed_dict={self.image:fc7})
generated_word_index = np.hstack(generated_word_index)
generated_sentence = ''
for x in generated_word_index:
if x==0:
break
word = ixtoword[str(x)]
generated_sentence = generated_sentence + ' ' + word
print ' '
print '--------------------------------------------------------------------------------------------------------'
print generated_sentence
def save_model(self, epoch):
if not os.path.exists(self.model_path):
os.makedirs(self.model_path)
self.saver.save(self.sess, os.path.join(self.model_path, 'model'), global_step=epoch)
This is a really basic problem. What you didn't understand when you were running this code is that arrays (lists in Python) are 0-indexed. If you have a list of length n, then when you try to access the nth element in that list, you will receive an index error.

Adding neurons to Adam optimizer state in Pytorch

I've posted the following to the Pytorch discussion board too. I'm trying to keep the learning-rates per parameter for the already existing parameters when adding more neurons (to existing layers, not new layers) to a network. I’ve written the following class which allows me to add neurons to hidden layers during training:
import torch
import torch.nn as nn
class DQN(nn.Module):
def __init__(self, num_inputs, hidden, num_actions, non_linearity):
super(DQN, self).__init__()
self.num_inputs = num_inputs
self.hidden = hidden
self.num_actions = num_actions
self.non_linearity = non_linearity
self.layers = nn.ModuleList()
self.layers.append(nn.Linear(num_inputs, self.hidden[0]))
previous = self.hidden[0]
for hidden_layer_size in self.hidden[1:]:
self.layers.append(nn.Linear(previous, hidden_layer_size))
previous = hidden_layer_size
self.layers.append(nn.Linear(previous, num_actions))
def forward(self, x):
for i in range(len(self.layers) - 1):
x = self.non_linearity(self.layers[i](x))
return self.layers[-1](x)
def increase_capacity(self, increment):
for i in range(len(self.hidden)):
self.hidden[i] += increment[i]
bias = self.layers[0].bias.data
weight = self.layers[0].weight.data
self.layers[0] = nn.Linear(self.num_inputs, self.hidden[0])
if increment[0]>0:
self.layers[0].weight.data[0:-increment[0],:] = weight
self.layers[0].bias.data[0:-increment[0]] = bias
else:
self.layers[0].weight.data[0:,:] = weight
self.layers[0].weight.data = bias
for i in range(1, len(self.layers) - 1):
bias = self.layers[i].bias.data
weight = self.layers[i].weight.data
self.layers[i] = nn.Linear(self.hidden[i-1], self.hidden[i])
if increment[i] > 0:
if increment[i-1] >0:
self.layers[i].bias.data[0:-increment[i]] = bias
self.layers[i].weight.data[0:-increment[i],0:-increment[i-1]] = weight
else:
self.layers[i].bias.data[0:-increment[i]] = bias
self.layers[i].weight.data[0:-increment[i],0:] = weight
else:
if increment[i-1] >0:
self.layers[i].bias.data = bias
self.layers[i].weight.data[0:,0:-increment[i-1]] = weight
else:
self.layers[i].bias.data = bias
self.layers[i].weight.data[0:,0:] = weight
bias = self.layers[-1].bias.data
weight = self.layers[-1].weight.data
self.layers[-1] = nn.Linear(self.hidden[-1], self.num_actions)
if increment[-1] >0:
self.layers[-1].bias.data = bias
self.layers[-1].weight.data[:,0:-increment[-1]] = weight
else:
self.layers[-1].bias.data = bias
self.layers[-1].weight.data[:,0:] = weight
def act(self, state, epsilon, mask):
if np.random.rand() > epsilon:
state = torch.tensor([state], dtype=torch.float32, device=device)
mask = torch.tensor([mask], dtype=torch.float32, device=device)
q_values = self.forward(state) + mask
action = q_values.max(1)[1].view(1, 1).item()
else:
action = np.random.randint(self.num_actions)
return action
Now I’ve written a little sanity check (whether it leads to sanity is questionable at this point): a network with 2 layers with both 1 neuron should fail to learn the x-or function, whereas a network where 4 neurons have been added should. If I initialise a new optimiser this indeed works. The optimiser I use is Adam, which keeps track of learning-rates per parameter. I’d like to keep the learning-rates of Adam for the weights and biases that already existed before I add additional neurons. The following is my failed attempt to doing so:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
# Credits to Alvations
def generate_zero():
return random.uniform(0, 49) / 100
def generate_one():
return random.uniform(50, 100) / 100
def generate_xor_XY(num_data_points):
Xs, Ys = [], []
for _ in range(num_data_points):
# xor(0, 0) -> 0
Xs.append([generate_zero(), generate_zero()]); Ys.append([0])
# xor(1, 0) -> 1
Xs.append([generate_one(), generate_zero()]); Ys.append([1])
# xor(0, 1) -> 1
Xs.append([generate_zero(), generate_one()]); Ys.append([1])
# xor(1, 1) -> 0
Xs.append([generate_one(), generate_one()]); Ys.append([0])
return Xs, Ys
# Initialisation
network = DQN(2,[1,1],1,F.relu)
# optimizer = optim.Adam(network.parameters(), amsgrad=False)
optimizer = optim.Adam(network.parameters(), amsgrad=True)
criterion = nn.MSELoss()
# Train 50000 steps to show 1 neuron cannot solve x-or task
for i in range(50000):
optimizer.zero_grad()
Xs, Ys = generate_xor_XY(1)
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys, dtype=torch.float)
prediction = network(Xs)
loss = criterion(prediction, Ys)
loss.backward()
optimizer.step()
print(network(torch.tensor([[1,0],[0,1],[1,1],[0,0]], dtype=torch.float)))
print(loss)
# Add 5 neurons to first layer
capacity = [4,4]
network.increase_capacity(capacity)
# Uncomment the following line and comment the lines following it for normal initialisation.
# optimizer = optim.Adam(network.parameters(), amsgrad=True)
nw_param = [p for p in network.parameters()]
new_param_group = []
layer_idx = 0
for idx, group in enumerate(optimizer.param_groups):
for idx_p, p in enumerate(group['params']):
# Save previous information
prev_grad = p.grad
old_p = copy.deepcopy(p)
old_state = copy.copy(optimizer.state[p])
old_step = old_state['step']
old_exp_avg = old_state['exp_avg']
old_exp_avg_sq = old_state['exp_avg_sq']
old_max_exp_avg_sq = old_state['max_exp_avg_sq']
# Remove old parameter from state
optimizer.state.pop(p)
# Weights
if p.dim()>1:
p = nn.Parameter(nw_param[layer_idx])
p.grad = torch.zeros_like(p)
new_exp_avg = torch.torch.zeros_like(p)
new_exp_avg_sq = torch.torch.zeros_like(p)
new_max_exp_avg_sq = torch.torch.zeros_like(p)
p.grad[0:prev_grad.size(0),0:prev_grad.size(1)] = prev_grad
optimizer.state[p]['step'] = old_step
optimizer.state[p]['exp_avg'] = new_exp_avg
optimizer.state[p]['exp_avg'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_exp_avg
optimizer.state[p]['exp_avg_sq'] = new_exp_avg_sq
optimizer.state[p]['exp_avg_sq'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'] = new_max_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_max_exp_avg_sq
new_param_group.append(p)
# Biases
else:
p = nn.Parameter(nw_param[layer_idx])
p.grad = torch.zeros_like(p)
new_exp_avg = torch.zeros_like(p)
new_exp_avg_sq = torch.zeros_like(p)
new_max_exp_avg_sq = torch.zeros_like(p)
p.grad[0:prev_grad.size(0)] = prev_grad
optimizer.state[p]['step'] = old_step
optimizer.state[p]['exp_avg'] = new_exp_avg
optimizer.state[p]['exp_avg'][0:prev_grad.size(0)] = old_exp_avg
optimizer.state[p]['exp_avg_sq'] = new_exp_avg_sq
optimizer.state[p]['exp_avg_sq'][0:prev_grad.size(0)] = old_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'] = new_max_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'][0:prev_grad.size(0)] = old_max_exp_avg_sq
new_param_group.append(p)
layer_idx += 1
optimizer.param_groups[0]['params'] = new_param_group
print(network)
# Train 50000 steps to show by adding neurons the task can be solved
for i in range(50000):
optimizer.zero_grad()
Xs, Ys = generate_xor_XY(1)
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys, dtype=torch.float)
prediction = network(Xs)
loss = criterion(prediction, Ys)
loss.backward()
optimizer.step()
print(network(torch.tensor([[1,0],[0,1],[1,1],[0,0]], dtype=torch.float)))
print(loss)
I’m trying to get the same optimizer state, but with additional parameters for the added neurons. This seems like a convoluted way of doing it (and it doesn’t work:p). Does anyone know of an (easier) way to do this or see where I’m going wrong?

Machine Learning reward artificially capping

So when I run this, it works perfectly, however, for some reason the reward caps at 200. I'm not sure what could be causing this. I'm new to machine learning and this is my first project, so sorry if I am missing something stupid.I hypothesize that done is triggering before I want it too, but playing with that hasn't led to anything. Thanks so much.
import gym
import tensorflow as tf
import numpy as np
import os
import sys
env = gym.make('CartPole-v0')
discount_rate=.95
# TODO Build the policy gradient neural network
class Agent:
def __init__(self, num_actions, state_size):
initializer = tf.contrib.layers.xavier_initializer()
self.input_layer = tf.placeholder(dtype=tf.float32, shape=[None, state_size])
# Neural net starts here
hidden_layer = tf.layers.dense(self.input_layer, 8, activation=tf.nn.relu, kernel_initializer=initializer)
hidden_layer_2 = tf.layers.dense(hidden_layer, 8, activation=tf.nn.relu, kernel_initializer=initializer)
# Output of neural net
out = tf.layers.dense(hidden_layer_2, num_actions, activation=None)
self.outputs = tf.nn.softmax(out)
self.choice = tf.argmax(self.outputs, axis=1)
# Training Procedure
self.rewards = tf.placeholder(shape=[None, ], dtype=tf.float32)
self.actions = tf.placeholder(shape=[None, ], dtype=tf.int32)
one_hot_actions = tf.one_hot(self.actions, num_actions)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=one_hot_actions)
self.loss = tf.reduce_mean(cross_entropy * self.rewards)
self.gradients = tf.gradients(self.loss, tf.trainable_variables())
# Create a placeholder list for gradients
self.gradients_to_apply = []
for index, variable in enumerate(tf.trainable_variables()):
gradient_placeholder = tf.placeholder(tf.float32)
self.gradients_to_apply.append(gradient_placeholder)
# Create the operation to update gradients with the gradients placeholder.
optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)
self.update_gradients =
optimizer.apply_gradients(zip(self.gradients_to_apply, tf.trainable_variables()))
def discount_normalize_rewards(rewards):
discounted_rewards = np.zeros_like(rewards)
total_rewards = 0
for i in reversed(range(len(rewards))):
total_rewards = total_rewards * discount_rate + rewards[i]
discounted_rewards[i] = total_rewards
discounted_rewards -= np.mean(discounted_rewards)
discounted_rewards /= np.std(discounted_rewards)
return discounted_rewards
#initialize the training loop
tf.reset_default_graph()
# Modify these to match shape of actions and states in your environment
num_actions = 2
state_size = 4
path = "./cartpole-pg/"
training_episodes = 1000
max_steps_per_episode = 20000
episode_batch_size = 5
agent = Agent(num_actions, state_size)
init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=2)
if not os.path.exists(path):
os.makedirs(path)
with tf.Session() as sess:
sess.run(init)
total_episode_rewards = []
# Create a buffer of 0'd gradients
gradient_buffer = sess.run(tf.trainable_variables())
for index, gradient in enumerate(gradient_buffer):
gradient_buffer[index] = gradient * 0
for episode in range(training_episodes):
state = env.reset()
episode_history = []
episode_rewards = 0
for step in range(max_steps_per_episode):
if episode % 100 == 0:
env.render()
# Get weights for each action
action_probabilities = sess.run(agent.outputs, feed_dict={agent.input_layer: [state]})
action_choice = np.random.choice(range(num_actions), p=action_probabilities[0])
state_next, reward, done, _ = env.step(action_choice)
episode_history.append([state, action_choice, reward, state_next])
state = state_next
episode_rewards += reward
if done:
total_episode_rewards.append(episode_rewards)
episode_history = np.array(episode_history)
episode_history[:,2] = discount_normalize_rewards(episode_history[:,2])
ep_gradients = sess.run(agent.gradients, feed_dict={agent.input_layer: np.vstack(episode_history[:, 0]),
agent.actions: episode_history[:, 1],
agent.rewards: episode_history[:, 2]})
# add the gradients to the grad buffer:
for index, gradient in enumerate(ep_gradients):
gradient_buffer[index] += gradient
break
if episode % episode_batch_size == 0:
feed_dict_gradients = dict(zip(agent.gradients_to_apply, gradient_buffer))
sess.run(agent.update_gradients, feed_dict=feed_dict_gradients)
for index, gradient in enumerate(gradient_buffer):
gradient_buffer[index] = gradient * 0
if episode % 1 == 0:
saver.save(sess, path + "pg-checkpoint", episode)
print("Reward: " + str(total_episode_rewards[-1:]))
env.close()
Episodes for Cartpole terminate when the pole falls and at 200 successful steps. See the max_episode_steps in the linked file if you want to change this. The reason there is a 200 step max is to make evaluating trials easier (ie you always get episode ends so you can evaluate episode stats) and so that the environment doesn't get stuck in a never ending trial.
register(
id='CartPole-v0',
entry_point='gym.envs.classic_control:CartPoleEnv',
max_episode_steps=200,
reward_threshold=195.0,)

Categories

Resources