gym MultiDiscrete space and A2C stable-baseline - python

I am implementing an RL agent based on A2C from stable-baselines3 on a gym environment with MultiDiscrete observation and action spaces.
I get the following error when learning:
RuntimeError: Class values must be smaller than num_classes.
This is a typical PyTorch error, but I do not see where it originates. I attach my code below.
Before the code, let me explain the idea. We train a custom environment with several machines (we start by training with only two), where we need to decide the production rate of each machine before it breaks. The action space also includes the decision of scheduling the maintenance at some time distance, and, for each machine, whether that machine is to be maintained.
Hence, the observation space is the consumption state of each machine plus the time distance of the scheduled maintenance (which can also be "not scheduled"), whereas the action space is the production rate for each machine, a maintenance decision for each machine, and a call-to-schedule.
A reward is given when the total production exceeds a threshold, and negative rewards are the costs of maintenance and scheduling.
Now, I know this is a big problem and we will need to reduce these spaces, but the actual issue is this PyTorch error, and I do not see where it comes from. A2C handles MultiDiscrete spaces for both observations and actions, so I do not know the origin of the error. We set up an A2C algorithm with an MlpPolicy and try to train the policy on this environment.
I attach the code.
from gym import Env
from gym.spaces import MultiDiscrete
import numpy as np
from numpy.random import poisson
import random
from functools import reduce
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Flatten
# from tensorflow.keras.optimizers import Adam
from stable_baselines3 import A2C
from stable_baselines3.common.env_checker import check_env
class MaintenanceEnv(Env):
    def __init__(self, max_machine_states_vec, production_rates_vec, production_threshold, scheduling_horizon, operations_horizon=100):
        """
        Returns:
            self.action_space is a vector with the maximum production rate for each machine, a binary call-to-maintenance and a binary call-to-schedule
        """
        num_machines = len(max_machine_states_vec)
        assert len(max_machine_states_vec) == len(production_rates_vec), "Machine states and production rates have different cardinality"
        self.action_space = MultiDiscrete(production_rates_vec + num_machines*[2] + [2])  ### Action space is the production rate from 0 to N for each machine, the maintenance decision per machine and the choice of scheduling
        self.observation_space = MultiDiscrete(max_machine_states_vec + [scheduling_horizon+2])  ### Observation space is 0,...,L for each machine + the scheduling state including "ns" (None = "ns")
        # Initial state
        self.state = num_machines*[0] + [0]
        # Episode length
        self.operations_horizon = operations_horizon
        self.time_to_finish = operations_horizon
        self.scheduling_horizon = scheduling_horizon
        self.max_states = max_machine_states_vec
        self.production_threshold = production_threshold

    def step(self, action):
        """
        Notes: Schedule state
        """
        num_machines = len(self.max_states)
        maintenance_distance_index = -1
        reward = 0
        done = False
        info = {}
        ### Cost parameters
        cost_setup_schedule = 5
        cost_preventive_maintenance = 10
        cost_corrective_maintenance = 50
        reward_excess_on_production = 5
        cost_production_deficit = 10
        cost_fixed_penalty = 10
        failure_reward = -10**6
        amount_produced = 0
        ### Errors
        if action[maintenance_distance_index] == 1 and self.state[-1] != self.scheduling_horizon + 1:  # Case when you schedule a repair, but it is already scheduled. Not possible.
            reward = failure_reward  ### It should not be possible
            done = True
            return self.state, reward, done, info
        if self.state[-1] == 0:
            for pos in range(num_machines):
                if action[num_machines + pos] == 1 and self.state[maintenance_distance_index] > 0:  ### Case when maintenance is applied, but the schedule is not involved yet. Not possible.
                    reward = failure_reward  ### It should not be possible
                    done = True
                    return self.state, reward, done, info
        for pos in range(num_machines):
            if self.state[pos] == self.max_states[pos] and action[pos] > 0:  # Case when the machine is broken, but it is producing
                reward = failure_reward  ### It should not be possible
                done = True
                return self.state, reward, done, info
        if self.state[maintenance_distance_index] == 0:
            for pos in range(num_machines):
                if action[num_machines+pos] == 1 and action[pos] > 0:  ### Case when it is maintenance time but the machine to be maintained keeps working. Not possible
                    reward = failure_reward  ### It should not be possible
                    done = True
                    return self.state, reward, done, info
        ### State update
        for pos in range(num_machines):
            if self.state[pos] < self.max_states[pos] and self.state[maintenance_distance_index] > 0:  ### The machine is in production, the state update includes the produced amount
                # self.state[pos] = min(self.max_states[pos], self.state[pos] + poisson(action[pos] / self.action_space[pos]))  ### Temporary: consumption drawn from a Poisson distribution depending on the production rate
                self.state[pos] = min(self.max_states[pos], self.state[pos] + action[pos])  ### Temporary: consumption rate is deterministic
                amount_produced += action[pos]
        if amount_produced >= self.production_threshold:
            reward += reward_excess_on_production * (amount_produced - self.production_threshold)
        else:
            reward -= cost_production_deficit * (self.production_threshold - amount_produced)
        reward -= cost_fixed_penalty
        if action[maintenance_distance_index] == 1 and self.state[maintenance_distance_index] == self.scheduling_horizon + 1:  ### You call a schedule when the state is not scheduled
            self.state[maintenance_distance_index] = self.scheduling_horizon
            reward -= cost_setup_schedule
        elif self.state[maintenance_distance_index] > 0 and self.state[maintenance_distance_index] <= self.scheduling_horizon:  ### You reduce the distance from the scheduled maintenance
            self.state[maintenance_distance_index] -= 1
        for pos in range(num_machines):  ### Case when we are repairing the machines: pay the cost of the repair and set them as new
            if action[num_machines+pos] == 1:
                if self.state[pos] < self.max_states[pos]:
                    reward -= cost_preventive_maintenance
                elif self.state[pos] == self.max_states[pos]:
                    reward -= cost_corrective_maintenance
                self.state[pos] = 0
        if self.state[maintenance_distance_index] == 0:  ### When maintenance has been performed, reset the scheduling state to "not scheduled"
            self.state[maintenance_distance_index] = self.scheduling_horizon + 1
        ### Time threshold
        if self.time_to_finish > 0:
            self.time_to_finish -= 1
        else:
            done = True
        # Return step information
        return self.state, reward, done, info

    def render(self):
        # Implement viz
        pass

    def reset(self):
        # Reset the state
        num_machines = len(self.max_states)
        self.state = np.array(num_machines*[0] + [0])
        self.time_to_finish = self.operations_horizon
        return self.state
def build_model(states, actions):
    model = Sequential()
    model.add(Dense(24, activation='relu', input_shape=states))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model
if __name__ == "__main__":
    ### GLOBAL CONSTANTS AND PARAMETERS
    NUMBER_MACHINES = 2
    FAILURE_STATE_LIMIT = 8
    MAXIMUM_PRODUCTION_RATE = 5
    SCHEDULING_HORIZON = 4
    PRODUCTION_THRESHOLD = 20
    machine_states = NUMBER_MACHINES * [4]
    failure_states = NUMBER_MACHINES * [FAILURE_STATE_LIMIT]
    production_rates = NUMBER_MACHINES * [MAXIMUM_PRODUCTION_RATE]
    ### Setting environment
    env = MaintenanceEnv(failure_states, production_rates, PRODUCTION_THRESHOLD, SCHEDULING_HORIZON)
    model = A2C("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=10000)
    obs = env.reset()
    for i in range(1000):
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        # env.render()
        if done:
            obs = env.reset()
I have the feeling it is due to the MultiDiscrete spaces, but I would appreciate some help. Thanks :)

As per the docs (https://www.gymlibrary.dev/api/spaces/#discrete), Discrete(n) takes values from 0 to n-1 by default; note that it does not go up to n.
The MultiDiscrete space is built on top of Discrete and has the same behavior. Hence, if an action or observation reaches the value n in some dimension, we get this error. A simple way to solve it is to ensure that the values never leave the bounds (from 0 up to n-1 inclusive) while taking a step.
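To make this concrete with the constants from the question (FAILURE_STATE_LIMIT = 8, SCHEDULING_HORIZON = 4): in step() a machine state is allowed to reach max_machine_states_vec[pos] itself, so either clamp the state at n-1 or declare each observation dimension one unit larger than the largest value it can take. A minimal sketch of the second option (the obs_space / old_space names are only for illustration):
import numpy as np
from gym.spaces import MultiDiscrete

max_machine_states_vec = [8, 8]   # FAILURE_STATE_LIMIT for two machines, as in the question
scheduling_horizon = 4

# Each machine state can reach its maximum value itself, so the space must be one larger.
obs_space = MultiDiscrete([m + 1 for m in max_machine_states_vec] + [scheduling_horizon + 2])
old_space = MultiDiscrete(max_machine_states_vec + [scheduling_horizon + 2])

print(obs_space.contains(np.array([8, 8, 5])))  # True: the maximum values are now legal
print(old_space.contains(np.array([8, 8, 5])))  # False: 8 is outside Discrete(8), i.e. 0..7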

Related

OpenAI Gym with Neat experiment not working in Jupyter

import os
import gym
import neat
import neat.config
import configparser
import multiprocessing
import graphviz
import numpy as np
def eval_genome(genome, config):
    # Creates 'FeedForward' network with the evolved genome and the requested configuration
    network = neat.nn.FeedForwardNetwork.create(genome, config)
    fits = []
    runs_per_network = 10
    for _ in range(runs_per_network):
        observations = env.reset()
        fitness = 0
        for _ in range(500):
            action = network.activate(observations)  # The observation is used as the input of the neural network
            action = int(np.argmax(action))          # For the action, we take the one with the highest probability
            observations, reward, done, _ = env.step(action)  # We then take the action and get a reward
            fitness += reward
            if done:
                fits.append(fitness)
                break
        fits.append(fitness)
    # The genome's fitness is its worst performance across all runs per network
    return min(fits)

# Evaluation of 100 episodes
def eval_winner_network(winner, config):
    # Creates 'FeedForward' network with the evolved genome and the requested configuration
    winner_network = neat.nn.FeedForwardNetwork.create(winner, config)
    fits = []
    for _ in range(100):  # number of episodes
        observations = env.reset()
        fitness = 0
        for _ in range(500):
            action = winner_network.activate(observations)
            action = int(np.argmax(action))
            observations, reward, done, _ = env.step(action)
            fitness += reward
            if done:
                fits.append(fitness)
                break
        fits.append(fitness)
    print("Average fitness throughout 100 episodes is observed at: {}".format(np.mean(fits)))
    if np.mean(fits) >= 500:  # task solved if total reward equates to 500 over 100 consecutive trials
        print(" + Task solved + ")
    else:
        print(" - Task unsolved - ")

# Final product for animation
def viz_winner_network(winner, config):
    winner_network = neat.nn.FeedForwardNetwork.create(winner, config)
    for _ in range(3):
        observations = env.reset()
        for _ in range(500):  # episode length
            env.render()
            action = winner_network.activate(observations)
            action = int(np.argmax(action))
            observations, _, done, _ = env.step(action)
            if done:
                break

# Load CartPole env
env = gym.make('CartPole-v1')

# Load NEAT configuration file
local_dir = os.path.dirname('C:\\Users\\User')
config_path = os.path.join(local_dir, 'cartpole.config')
config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                     neat.DefaultSpeciesSet, neat.DefaultStagnation,
                     config_path)
population = neat.Population(config)

# Add statistics reporters
population.add_reporter(neat.StdOutReporter(True))  # Add a stdout reporter to show progress in the terminal
stats = neat.StatisticsReporter()
population.add_reporter(stats)

# Run parallel execution over available processors
pe = neat.ParallelEvaluator(multiprocessing.cpu_count(), eval_genome)
It is at this point, when running the next line, that the output gets stuck. I am unsure whether this is Jupyter-related or whether I have simply made a mistake somewhere along the way. I have been trying to debug this for two days to no avail.
# Runs NEAT and returns the best network
winner = population.run(pe.evaluate)
So, running this code, I am now left with
****** Running generation 0 ******
and the file continues to run, and has been running for roughly an hour on quite a competent machine.
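One thing worth checking (this is an assumption on my part, not something confirmed in the question): neat.ParallelEvaluator runs eval_genome in worker processes via Python's multiprocessing, and on Windows or inside Jupyter the spawned workers must be able to import eval_genome. Running the evaluation from a notebook, or without an if __name__ == "__main__" guard, is a common way for such a run to hang at generation 0. A minimal sketch of a guarded entry point, assuming the code above is moved into a plain .py script:
# Hypothetical guarded entry point (assumes the script above runs as a .py file,
# with eval_genome defined at module level so the worker processes can import it).
if __name__ == "__main__":
    pe = neat.ParallelEvaluator(multiprocessing.cpu_count(), eval_genome)
    winner = population.run(pe.evaluate)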

DQN's Q-loss converged but it performed poorly

I am trying to code my own DQN in Python using PyTorch, and I am testing it on the CartPole environment.
Although the Q-loss converged, the model performed poorly.
A replay buffer of size 2000 was used, along with double networks: I updated the target network every time the q_net network had been updated 100 times. (q_net is the network used to decide which action to choose.)
I tried different network architectures and different combinations of hyper-parameters, but the model still performed poorly; the pole just kept swaying and never stayed balanced.
I appreciate your help, sincerely!!
Here is the figure of the Q-loss result, with a closer look (figures omitted).
Here is the code for taking actions.
def take_action(self, state):
    if self.learn_count <= 10:
        self.eposilon = 0.1
    else:
        self.eposilon = 0.9
    decision = np.random.choice([0, 1], p=[1 - self.eposilon, self.eposilon])
    if decision == 1:
        # final_decision = self.q_net(state.cuda().detach()).argmax()
        final_decision = self.q_net(torch.Tensor(state).to(self.device))
        final_decision = torch.max(final_decision, 0)[1].data.numpy()
    else:
        final_decision = np.random.choice([i for i in range(self.action_space)])
    return final_decision.item()
Here is the code for training the network.
def update_nn(self):
    if self.learn_count % 100 == 0:
        self.synchronous_NN()
    for i in range(self.num_epoches):
        self.learn_count = self.learn_count + 1
        self.training_nn()
    return None

def training_nn(self):
    index = random.sample(range(self.replay_buffer.shape[0]), self.minibatch_size)
    chosen_sample = self.replay_buffer[index, :]
    last_state = copy.deepcopy(chosen_sample[np.isnan(chosen_sample[:, -1:]).squeeze(), :])
    not_last_state = copy.deepcopy(chosen_sample[~np.isnan(chosen_sample[:, -1:]).squeeze(), :])
    input_not_last_state = torch.FloatTensor(not_last_state[:, :4]).to(self.device)
    action_index = torch.LongTensor(not_last_state[:, 4].reshape(-1, 1)).to(self.device)
    action_value = self.q_net(input_not_last_state).gather(1, action_index)
    max_action_value = not_last_state[:, 5] + self.gamma * self.fixed_q_net(input_not_last_state).detach().max(1).values.numpy()
    last_state = np.nan_to_num(last_state)
    input_last_state = torch.FloatTensor(last_state[:, :4]).to(self.device)
    last_action_index = torch.LongTensor(last_state[:, 4].reshape(-1, 1)).to(self.device)
    last_action_value = self.q_net(input_last_state).gather(1, last_action_index)
    last_max_action_value = last_state[:, 5]
    X = torch.cat([action_value, last_action_value])
    y = torch.FloatTensor(np.hstack([max_action_value, last_max_action_value]).reshape(-1, 1)).detach()
    loss = self.loss(X, y)
    self.optimizer.zero_grad()  # reset the gradient to zero
    loss.backward()
    self.optimizer.step()       # execute back propagation for one step
    self.loss_curve.append(loss)
    return None
Here is the part that plays the episodes:
def start_to_play(self):
    agent = Agent()
    agent.initial_Q_network(2)
    agent.initial_replay_buffer()
    self.env = gym.make('CartPole-v0')
    self.env = self.env.unwrapped
    for i in range(self.episode):
        if i % 50 == 1:
            agent.save_model(i)
        step = 0
        state = self.env.reset()
        ep_r = 0
        while True:
            action = agent.take_action(state)
            observation, reward, done, info = self.env.step(action)
            self.env.render()
            # next_state = self.capture_state()
            next_state = observation
            x, x_dot, theta, theta_dot = observation
            r1 = (self.env.x_threshold - abs(x)) / self.env.x_threshold - 0.8
            r2 = (self.env.theta_threshold_radians - abs(theta)) / self.env.theta_threshold_radians - 0.5
            reward = r1 + r2
            ep_r = reward + ep_r
            if done:
                reward = reward - 20
                state1_np = np.array(state)
                state2_np = np.array([np.nan, np.nan, np.nan, np.nan])
                agent.add_replay_buffer(np.hstack([state1_np, np.array([action, reward]), state2_np]))
                if agent.replay_buffer.shape[0] > 300:
                    agent.update_nn()
                print(i, step, round(ep_r, 2))
                break
            else:
                state1_np = np.array(state)
                state2_np = np.array(next_state)
                agent.add_replay_buffer(np.hstack([state1_np, np.array([action, reward]), state2_np]))
                if agent.replay_buffer.shape[0] > 300:
                    agent.update_nn()
            state = next_state
            step = step + 1
    self.plot_curve(agent)
    return None
Thanks for your time!!
Loss in reinforcement learning is not very important. In fact, you can see agents with very good performance that nevertheless have bad neural-network losses.
I suggest you look into the hyperparameters of other implementations, since CartPole has been solved countless times with multiple algorithms.
The first thing that might be wrong is the experience replay buffer, which is too small. But then again, look into other DQN implementations for CartPole and try to use the same hyperparameters they are using.
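For reference, here is a rough sketch of the kind of hyperparameters many CartPole DQN implementations land on; the numbers are my own illustrative assumptions, not values taken from any specific reference implementation:
# Illustrative ballpark values only (assumptions, not from a specific implementation).
HYPERPARAMS = {
    "replay_buffer_size": 50_000,   # considerably larger than 2000
    "minibatch_size": 64,
    "gamma": 0.99,
    "learning_rate": 1e-3,
    "epsilon_start": 1.0,           # start fully exploratory and anneal downwards
    "epsilon_end": 0.05,
    "epsilon_decay_steps": 10_000,
    "target_update_every": 500,     # gradient steps between target-network syncs
}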

DQN predicts same action value for every state (cart pole)

I'm trying to implement a DQN. As a warm-up I want to solve CartPole-v0 with an MLP consisting of two hidden layers along with input and output layers. The input is a 4-element array [cart position, cart velocity, pole angle, pole angular velocity] and the output is an action value for each action (left or right). I am not exactly implementing the DQN from the "Playing Atari with DRL" paper (no frame stacking for inputs, etc.). I also made a few non-standard choices, like putting done and the target network's prediction of the action value in the experience replay, but those choices shouldn't affect learning.
In any case I'm having a lot of trouble getting the thing to work. No matter how long I train the agent, it keeps predicting a higher value for one action over the other, for example Q(s, Right) > Q(s, Left) for all states s. Below are my learning code, my network definition, and some results I get from training.
class DQN:
    def __init__(self, env, steps_per_episode=200):
        self.env = env
        self.agent_network = MlpPolicy(self.env)
        self.target_network = MlpPolicy(self.env)
        self.target_network.load_state_dict(self.agent_network.state_dict())
        self.target_network.eval()
        self.optimizer = torch.optim.RMSprop(
            self.agent_network.parameters(), lr=0.005, momentum=0.95
        )
        self.replay_memory = ReplayMemory()
        self.gamma = 0.99
        self.steps_per_episode = steps_per_episode
        self.random_policy_stop = 1000
        self.start_learning_time = 1000
        self.batch_size = 32

    def learn(self, episodes):
        time = 0
        for episode in tqdm(range(episodes)):
            state = self.env.reset()
            for step in range(self.steps_per_episode):
                if time < self.random_policy_stop:
                    action = self.env.action_space.sample()
                else:
                    action = select_action(self.env, time, state, self.agent_network)
                new_state, reward, done, _ = self.env.step(action)
                target_value_pred = predict_target_value(
                    new_state, reward, done, self.target_network, self.gamma
                )
                experience = Experience(
                    state, action, reward, new_state, done, target_value_pred
                )
                self.replay_memory.append(experience)
                if time > self.start_learning_time:  # learning step
                    experience_batch = self.replay_memory.sample(self.batch_size)
                    target_preds = extract_value_predictions(experience_batch)
                    agent_preds = agent_batch_preds(
                        experience_batch, self.agent_network
                    )
                    loss = torch.square(agent_preds - target_preds).sum()
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    if time % 1_000 == 0:  # how frequently to update target net
                        self.target_network.load_state_dict(self.agent_network.state_dict())
                        self.target_network.eval()
                state = new_state
                time += 1
                if done:
                    break

def agent_batch_preds(experience_batch: list, agent_network: MlpPolicy):
    """
    Calculate the agent action value estimates using the old states and the
    actual actions that the agent took at that step.
    """
    old_states = extract_old_states(experience_batch)
    actions = extract_actions(experience_batch)
    agent_preds = agent_network(old_states)
    experienced_action_values = agent_preds.index_select(1, actions).diag()
    return experienced_action_values

def extract_actions(experience_batch: list) -> list:
    """
    Extract the list of actions from the experience replay batch and torchify.
    """
    actions = [exp.action for exp in experience_batch]
    actions = torch.tensor(actions)
    return actions

class MlpPolicy(nn.Module):
    """
    This class implements the MLP which will be used as the Q network. I only
    intend to solve classic control problems with this.
    """
    def __init__(self, env):
        super(MlpPolicy, self).__init__()
        self.env = env
        self.input_dim = self.env.observation_space.shape[0]
        self.output_dim = self.env.action_space.n
        self.fc1 = nn.Linear(self.input_dim, 32)
        self.fc2 = nn.Linear(32, 128)
        self.fc3 = nn.Linear(128, 32)
        self.fc4 = nn.Linear(32, self.output_dim)

    def forward(self, x):
        if type(x) != torch.Tensor:
            x = torch.tensor(x).float()
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x
Learning results (training output omitted):
Here I'm seeing one action always valued over the other (Q(s, Right) > Q(s, Left)). It's also clear that the network is predicting the same action values for every state.
Does anyone have an idea about what's going on? I've done a lot of debugging and careful reading of the original papers (I also thought about "normalizing" the observation space, even though the velocities can be infinite) and could be missing something obvious at this point. I can include more code for the helper functions if that would be useful.
There was nothing wrong with the network definition. It turns out the learning rate was too high; reducing it to 0.00025 (as in the original Nature paper introducing the DQN) led to an agent that can solve CartPole-v0.
That said, the learning algorithm was incorrect. In particular, I was using the wrong target action-value predictions: the algorithm laid out above does not use the most recent version of the target network to make predictions. This leads to poor results as training progresses because the agent learns from stale target data. The fix is to put just (s, a, r, s', done) into the replay memory and then make target predictions using the most up-to-date version of the target network when sampling a mini-batch. See the code below for an updated learning loop.
def learn(self, episodes):
    time = 0
    for episode in tqdm(range(episodes)):
        state = self.env.reset()
        for step in range(self.steps_per_episode):
            if time < self.random_policy_stop:
                action = self.env.action_space.sample()
            else:
                action = select_action(self.env, time, state, self.agent_network)
            new_state, reward, done, _ = self.env.step(action)
            experience = Experience(state, action, reward, new_state, done)
            self.replay_memory.append(experience)
            if time > self.start_learning_time:  # learning step
                experience_batch = self.replay_memory.sample(self.batch_size)
                target_preds = target_batch_preds(
                    experience_batch, self.target_network, self.gamma
                )
                agent_preds = agent_batch_preds(
                    experience_batch, self.agent_network
                )
                loss = torch.square(agent_preds - target_preds).sum()
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                if time % 1_000 == 0:  # how frequently to update target net
                    self.target_network.load_state_dict(self.agent_network.state_dict())
                    self.target_network.eval()
            state = new_state
            time += 1
            if done:
                break
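The helper target_batch_preds is referenced above but not shown. A minimal sketch of what it could look like, assuming each Experience now stores (state, action, reward, new_state, done) as described (this is my own illustration, not the answerer's code):
import numpy as np
import torch

def target_batch_preds(experience_batch, target_network, gamma):
    # Hypothetical helper: r + gamma * max_a' Q_target(s', a'),
    # using the bare reward for terminal transitions.
    new_states = torch.as_tensor(np.array([exp.new_state for exp in experience_batch]), dtype=torch.float32)
    rewards = torch.as_tensor(np.array([exp.reward for exp in experience_batch]), dtype=torch.float32)
    dones = torch.as_tensor(np.array([float(exp.done) for exp in experience_batch]), dtype=torch.float32)
    with torch.no_grad():
        next_values = target_network(new_states).max(dim=1).values
    return rewards + gamma * (1.0 - dones) * next_values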

Reinforcement learning with Super Mario not learning

After experimenting with CartPole from OpenAI Gym, I wanted to try something a bit more advanced in reinforcement learning and moved on to Super Mario Bros NES. Using a similar setup doesn't seem to yield the results I was expecting, though, and Mario doesn't seem to learn anything.
I have been using retro gym to emulate the environment and writing some custom wrappers for actions, observations and reward (the code for these is at the bottom), and these seem to produce what I expected. For reinforcement learning I have been using TensorFlow for the network model and the DQN agent from Keras-RL with an epsilon-greedy Q policy, and after many hours of training Mario is still unable to jump high enough to get over the initial obstacles.
Code for reinforcement learning:
import retro
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from rl.agents import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
from wrapper import wrapper

env = retro.make('SuperMarioBros-Nes', use_restricted_actions=retro.Actions.DISCRETE)
env = wrapper(env)

states = 70
actions = 4
train_size = 5000000

def build_model(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, 10, 7)))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

def build_agent(model, actions):
    policy = EpsGreedyQPolicy()
    memory = SequentialMemory(limit=train_size, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, nb_actions=actions, nb_steps_warmup=100000, target_model_update=1e-2)
    return dqn

model = build_model(states, actions)
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=train_size, visualize=True, verbose=1)
dqn.save_weights('weights/MarioWeights.h5f', overwrite=True)
I have tried tuning the network architecture, the target update rate, different policies, exploration vs. exploitation, the observation space and the reward, but little seems to change and the reward converges to very low values after a while.
Code for wrappers:
class ObservationWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.env = env

    def observation(self, obs):
        ### STATE MODIFICATION ###
        ram = self.env.get_ram()                     # Grab RAM data
        tiles = SMB.get_tiles(ram)                   # Grab dict with all tiles on the screen
        t_list = list(tiles.values())                # Convert to list
        ct_list = np.asarray(self.chunkify(t_list))  # Convert to array with image dimensions
        try:
            m_x = int(np.where(ct_list == 170)[1])
        except:
            m_x = 3
        return ct_list[4:14, m_x:m_x+7]              # Crop to ROI

    def chunkify(self, list, n=16):  # Function to extract rows, or chunks, of 16 values from the image
        c_list = []
        row = []
        count = 0
        for i in list:
            row.append(i.value)
            count += 1
            if count % 16 == 0:
                c_list.append(row)
                row = []
        return c_list

class ActionWrapper(gym.ActionWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.action_list = [0, 6, 18, 24]  # , 3] # NOOP, RIGHT, JUMP, RIGHT + JUMP, LEFT

    def action(self, act):
        return self.action_list[act]

class BasicWrapper(gym.Wrapper):
    def __init__(self, env):
        super().__init__(env)
        self.env = env
        self.total_reward = 0
        self.max_reward = 0
        self.frame = 0
        self.mario_old_pos = [40, 176]  # Starting position of Mario on env.reset()
        self.t_old = 400                # Counting down from 400

    def reset_init(self):
        self.total_reward = 0
        self.max_reward = 0
        self.frame = 0
        self.mario_old_pos = [40, 176]
        self.t_old = 400

    def check_alive(self, ram):  # Check if dying or dead
        if SMB._is_alive(ram):
            return 0
        else:
            return -15

    def step(self, action):
        next_state, reward, done, info = self.env.step(action)
        ram = self.env.get_ram()
        done = SMB._got_flag(ram)                         # Check if Mario finished the level and got the flag
        mario_pos = SMB.get_mario_location_in_level(ram)  # Grab Mario's location in the level
        vx = (int(mario_pos[0]) - int(self.mario_old_pos[0])) * 2
        if vx > 10:
            vx = 0
        self.mario_old_pos = mario_pos
        d = self.check_alive(ram)
        timer = info.get('time')
        c = timer - self.t_old
        self.t_old = timer
        reward = vx + vy + c + d
        self.total_reward += reward
        if self.total_reward > self.max_reward:
            self.max_reward = self.total_reward
        # Reset if Mario is stuck
        if self.frame > 100:
            if d < 0 or (self.total_reward < self.max_reward - 0.1*self.total_reward):
                self.reset_init()
                self.env.reset()
        self.frame += 1
        return next_state, reward, done, info
The reward here is stolen from https://github.com/Kautenja/gym-super-mario-bros, while the observations are inspired by https://chrispresso.io/AI_Learns_To_Play_SMB_Using_GA_And_NN.
I feel like I am missing something essential. Any suggestions for how to proceed?
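As an aside, the wrapper helper imported from wrapper.py at the top is not included in the question. A plausible sketch of how the three wrappers above might be composed (an assumption on my part, since the actual helper is not shown):
def wrapper(env):
    # Hypothetical composition; the question's actual wrapper() is not shown.
    env = BasicWrapper(env)        # custom reward, death and "stuck" handling
    env = ObservationWrapper(env)  # crop the tile map to a region around Mario
    env = ActionWrapper(env)       # restrict to NOOP / RIGHT / JUMP / RIGHT+JUMP
    return env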

DQN with prioritized experience replay and target network does not improve

I tried to code a neural network to solve OpenAI's CartPole environment with TensorFlow and Keras. The network uses prioritized experience replay and a separate target network which is updated every tenth episode. Here's the code:
import numpy as np
import gym
import matplotlib.pyplot as plt
from collections import deque
from tensorflow import keras
class agent:
    def __init__(self, inputs, outputs, gamma):
        self.gamma = gamma
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.inputs = inputs
        self.outputs = outputs
        self.network = self._build_net()
        self.target_network = self._build_net()
        self.replay_memory = deque(maxlen=100000)
        self.target_network.set_weights(self.network.get_weights())

    def _build_net(self):
        model = keras.models.Sequential()
        model.add(keras.layers.Dense(10, activation='relu', input_dim=self.inputs))
        model.add(keras.layers.Dense(10, activation='relu'))
        model.add(keras.layers.Dense(10, activation='relu'))
        model.add(keras.layers.Dense(self.outputs, activation='sigmoid'))
        model.compile(optimizer='adam', loss='categorical_crossentropy')
        return model

    def act(self, state, testing=False):
        if not testing:
            if self.epsilon > np.random.rand():
                return np.random.randint(self.outputs)
            else:
                return np.argmax(self.network.predict(np.array([state])))
        else:
            return np.argmax(self.network.predict(np.array([state])))

    def remember(self, state, action, next_state, reward, win):
        self.replay_memory.append((state, action, next_state, reward, win))

    def get_batch(self):
        batch = []
        losses = []
        targets = []
        if len(self.replay_memory) >= 32:
            for state, action, next_state, reward, done in self.replay_memory:
                target_f = np.zeros([1, self.outputs])
                if done != False:
                    target = (reward + self.gamma * np.amax(self.target_network.predict(np.array([next_state]))[0]))
                    target_f[0][action] = target
                targets.append(target_f)
                loss = np.mean(self.network.predict(np.array([state])) - target_f)**2
                losses.append(loss)
            indexes = np.argsort(losses)[:32]
            for indx in indexes:
                batch.append((self.replay_memory[indx][0], targets[indx]))
        return batch

    def replay(self, batch):
        for state, target in batch:
            self.network.fit(np.array([state]), target, epochs=1, verbose=0)
        self.epsilon = max(self.epsilon_min, self.epsilon*self.epsilon_decay)

    def update_target_network(self):
        self.target_network.set_weights(self.network.get_weights())

    def save(self):
        self.network.save_weights('./DQN.model')

    def load(self):
        self.network.load_weights('./DQN.model')

env = gym.make('CartPole-v0')
episodes = 1000
agent = agent(env.observation_space.shape[0], env.action_space.n, 0.99)
rewards = []

state = env.reset()
for _ in range(500):
    action = agent.act(state, 1.0)
    next_state, reward, win, _ = env.step(action)
    agent.remember(state, action, next_state, reward, win)
    state = next_state
    if win:
        state = env.reset()

for e in range(episodes):
    print('episode:', e+1)
    batch = agent.get_batch()
    state = env.reset()
    for t in range(400):
        action = agent.act(state)
        next_state, reward, win, _ = env.step(action)
        agent.remember(state, action, next_state, reward, win)
        state = next_state
        if win:
            break
    agent.replay(batch)
    print('score:', t)
    print('epsilon:', agent.epsilon)
    print('')
    if e % 10 == 0:
        agent.update_target_network()
        agent.save()
    rewards.append(t)
plt.plot(list(range(e+1)), rewards)
plt.savefig('./reward.png')
The problem is that the agent gets worse as epsilon decreases, and when epsilon is at its lowest value the agent only gets through 7-9 steps before the pole falls (see the image below, omitted here). Can someone tell me why my agent isn't learning anything and how to fix it?
Your network output should use a linear activation instead of sigmoid, since you are bootstrapping a sum of all future rewards. The loss should be changed to MSE as well, following the derivation of the TD error in the original paper.
Also, if the experience ended in a terminal state (done == True), the TD target for the action taken should be set equal to the reward (you are currently setting it to 0).
Try making the reward = -reward if done. It helps.
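A minimal sketch of the suggested changes (illustrative only, assuming the same Keras setup as the question; build_q_net and td_target are hypothetical helper names, not the asker's code):
from tensorflow import keras
import numpy as np

def build_q_net(inputs, outputs):
    # Linear output head and MSE loss instead of sigmoid + categorical cross-entropy.
    model = keras.models.Sequential([
        keras.layers.Dense(10, activation='relu', input_dim=inputs),
        keras.layers.Dense(10, activation='relu'),
        keras.layers.Dense(outputs, activation='linear'),
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

def td_target(reward, next_state, done, target_network, gamma):
    # Bootstrap only on non-terminal transitions; use the bare reward at terminal states.
    if done:
        return reward
    return reward + gamma * np.amax(target_network.predict(np.array([next_state]))[0])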
