I used the gym library to try to get this model to learn, but I don't think it is learning from experience. Something is wrong, but I can't figure out what.
I have played with DISCOUNT, LEARNING_RATE, and DISCRETE_OS_SIZE and still nothing. Do I have to create a neural network for this example, or can I just use the formula to derive the Q values?
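For reference, the update rule the code below applies to each Q-table entry is the standard tabular Q-learning update (with \alpha = LEARNING_RATE and \gamma = DISCOUNT):

Q(s, a) \leftarrow (1 - \alpha)\, Q(s, a) + \alpha \left( r + \gamma \max_{a'} Q(s', a') \right)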
import gym
import numpy as np
LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 25000
env = gym.make("MountainCar-v0")
DISCRETE_OS_SIZE = [20, 20]
discrete_os_win_size = (env.observation_space.high - env.observation_space.low)/DISCRETE_OS_SIZE
q_tables = np.random.uniform(low = -2, high = 0, size = (DISCRETE_OS_SIZE + [env.action_space.n]))
def get_discrete_state(state):
discrete_state = (state - env.observation_space.low)/DISCRETE_OS_SIZE
return tuple(discrete_state.astype(np.int)) # we use this tuple to look up the 3 Q values for the available actions in the q-table
# Exploration settings
epsilon = 1 # not a constant, going to be decayed
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES//2
epsilon_decay_value = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)
SHOW_EVERY = 1000
done = False
for episode in range(EPISODES):
discrete_state = get_discrete_state(env.reset())
done = False
if episode % SHOW_EVERY == 0:
render = True
print(episode)
else:
render = False
while not done:
if np.random.random() > epsilon:
# Get action from Q table
action = np.argmax(q_tables[discrete_state])
else:
# Get random action
action = np.random.randint(0, env.action_space.n)
new_state, reward, done, _ = env.step(action)  # state = position and velocity
new_discrete_state = get_discrete_state(new_state)
if render:
env.render()
if not done:
max_future_q = np.max(q_tables[new_discrete_state]) # Maximum possible Q value in next step (for new state)
current_q = q_tables[discrete_state + (action,)]# Current Q value (for current state and performed action)
new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)# And here's our equation for a new Q value for current state and action
q_tables[discrete_state + (action,)] = new_q# Update Q table with new Q value
# Simulation ended (for any reason) - if goal position is achieved - update Q value with reward directly
elif new_state[0] >= env.goal_position:
q_tables[discrete_state + (action,)] = 0
print(episode)
discrete_state = new_discrete_state
# Decaying is being done every episode if episode number is within decaying range
if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
epsilon -= epsilon_decay_value
env.close()
I have found what the problem was.
In the line discrete_state = (state - env.observation_space.low)/DISCRETE_OS_SIZE I am dividing (state - env.observation_space.low) by DISCRETE_OS_SIZE, the number of buckets. I came to the conclusion that I am better off dividing by discrete_os_win_size, the width of each bucket, which solved my problem.
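For reference, a minimal sketch of the corrected helper (identical to the one above except for the divisor; astype(int) is used because np.int has been removed in recent NumPy versions):
def get_discrete_state(state):
    # divide by the per-dimension bucket width, not by the number of buckets
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    return tuple(discrete_state.astype(int))  # tuple index into the Q-table for this state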
Related
I am working on a grid optimization model where I import data from a dataset and train my RL model on a custom reinforcement learning environment. In this model I also want to extract my grid values and PV values step by step. While training, my custom reinforcement learning environment does give me values for grid and pv, but when I do prediction, it outputs an empty list. I wanted to ask why that is.
What's the problem?
For importing the dataset:
def get_data(start = '2017-01-01 00:00:00', end = '2017-01-01 23:55:00'):
# import standard load profiles
slp = pd.read_csv('df_p.csv', index_col=0, parse_dates=True)['0'] / 1000
slp = slp.resample('15min').mean() * 3
pv = pd.read_csv('Solar_Data-2011.csv', delimiter=';',
index_col=0, parse_dates=False)["Generation"] * 3
pv.index = slp.index
print("Load values:")
print(slp.values)
print("PV values:")
print(pv.values)
start = pd.to_datetime(start)
end = pd.to_datetime(end)
return slp[start:end], pv[start:end]
This is my custom environment:
class CostEnv(Env):
def __init__(self):
# Actions we can take increase in cost, lowering of cost
self.action_space = Discrete(2)
### Get input data, just choose one day for now
self.load, self.pv = get_data(start = '2017-01-01 00:00:00', end = '2017-01-01 23:55:00')
self.pv_price = 0.10
self.grid_price = 0.40
self.line_max = 15
self.grid_penalty = 100
self.battery_max = 18
self.battery_state = 10
self.pv_values = []
self.grid_values = []
###
# Set episode length
self.episode_length = len(self.load)
self.observation_space = Dict(
{
"load": Box(0, self.episode_length - 1, shape=(2,), dtype=int),
"pv": Box(0, self.episode_length - 1, shape=(2,), dtype=int),
}
)
def step(self, action):
# Apply action
# 0 -1 = -1 decrease in cost
# 1 -1 = 0 Increase in cost
# self.state += action -1
# Reduce episode length by 1 second
# self.episode_length -= 1 ###Move to the end of the action
### We calculate the reward based on the price for the electricity,
#lower price, "higher" reward
if action == 0:
# Take all electricity from grid
if self.load[len(self.load)-self.episode_length] > self.line_max:
reward = self.load[len(self.load)-self.episode_length] * self.grid_price * -1 - abs(self.load[len(self.load)-self.episode_length] - self.line_max) * self.grid_penalty
self.grid_values.append(self.load[len(self.load)-self.episode_length])
else:
reward = self.load[len(self.load)-self.episode_length] * self.grid_price * -1
self.grid_values.append(self.load[len(self.load)-self.episode_length])
elif action == 1:
# Take all electricity from pv
if self.pv[len(self.pv)-self.episode_length] >= self.load[len(self.load)-self.episode_length]:
if abs(self.pv[len(self.pv)-self.episode_length] - self.load[len(self.load)-self.episode_length]) > self.line_max:
reward = self.load[len(self.load)-self.episode_length] * self.pv_price * -1 - (self.pv[len(self.pv)-self.episode_length] - self.load[len(self.load)-self.episode_length]) * self.grid_penalty
self.pv_values.append(self.pv[len(self.pv)-self.episode_length])
else:
reward = self.load[len(self.load)-self.episode_length] * self.pv_price * -1
self.pv_values.append(self.pv[len(self.pv)-self.episode_length])
else:
if (self.load[len(self.load)-self.episode_length] - self.pv[len(self.pv)-self.episode_length]) > self.line_max:
reward = self.pv[len(self.pv)-self.episode_length] * self.pv_price * -1 - (self.load[len(self.load)-self.episode_length] - self.pv[len(self.pv)-self.episode_length]) * self.grid_price - (self.load[len(self.load)-self.episode_length] - self.pv[len(self.pv)-self.episode_length]) * self.grid_penalty
self.pv_values.append(self.pv[len(self.pv)-self.episode_length])
else:
reward = self.pv[len(self.pv)-self.episode_length] * self.pv_price * -1
self.pv_values.append(self.pv[len(self.pv)-self.episode_length])
### This may lead the agent to always choose action 1,
###because it will always supply the demand and will always be cheaper.
else:
reward = -300000
# Invalid action
#raise ValueError(f'Invalid action: {action}')
info = {}
### Observation
observation = {
"load": (0,self.load[len(self.load)-self.episode_length]),
"pv": (0,self.pv[len(self.pv)-self.episode_length]),
}
### Either here or before checking self.episode_length
self.episode_length -= 1
### Check if timeseries is over
if self.episode_length <= 0:
done = True
else:
done = False
# Return step information
# return self.state, reward, done, info
return observation, reward, done, info
def render(self):
# Implement viz
pass
def reset(self):
self.done=False
# Set episode length
self.episode_length = len(self.load)
observation = {
"load": (0, self.load[len(self.load)-self.episode_length]),
"pv": (0, self.pv[len(self.pv)-self.episode_length]),
}
return observation
Here is my model training:
log_path = os.path.join('Training', 'Logs')
model = A2C("MultiInputPolicy", env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=300000)
The values extracted by this model are as follows:
Env.grid_values (truncated): [0.4014, 0.342, 0.5357999999999999, 0.4698, 0.44999999999999996, 0.376, 0.521, 0.4293999999999999, 0.25140000000000007, 0.7412000000000001, ...]
env.pv_values (truncated): [0.0, 0.0, 0.0, 0.0, 0.0, 0.0607460715, 0.0678108435, 0.07642341180000001, ...]
Now, for prediction, I am using another dataset, which is loaded as follows:
def get_data(start = '2017-01-01 00:00:00', end = '2017-01-01 23:55:00'):
# import standard load profiles
slp = pd.read_csv('df_p.csv', index_col=0, parse_dates=True)['2'] / 1000
slp = slp.resample('15min').mean() * 3
pv = pd.read_csv('Solar_Data-2011.csv', delimiter=';',
index_col=0, parse_dates=False)["Generation"] * 3
pv.index = slp.index
print("Load values:")
print(slp.values)
print("PV values:")
print(pv.values)
start = pd.to_datetime(start)
end = pd.to_datetime(end)
return slp[start:end], pv[start:end]
For prediction, the code is this:
episodes = 20
for ep in range(episodes):
obs = env.reset()
done = False
while not done:
action = model.predict(obs)
obs, rewards, done, info = env.step(action)
env.close()
The lists are as follows:
env.pv_values
[]
env.grid_values
[]
Please tell me what I am doing wrong. Also, I load the model in a separate Jupyter notebook, and the environment is the same one that I use for training the model. The code is as follows:
model = A2C("MultiInputPolicy",env=env)
model = model.load("A2C_Multi_Input_Policy_Improved_1",env=env)
I am trying to implement policy iteration from scratch. I have a 2D grid world environment named GridWorld that returns the successor state and reward for a given action, and it also has a function that returns the transition probability. The policy-evaluation update I am following is restated just below for reference, followed by my code for policy iteration:
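For reference, the iterative policy-evaluation update from Sutton and Barto that one_policy_evaluation is meant to implement is stated here only as background (\pi(s) denotes the action the current policy picks in state s, and \gamma is the discount factor):

V(s) \leftarrow \sum_{s', r} p(s', r \mid s, \pi(s)) \left[ r + \gamma\, V(s') \right]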
import matplotlib
matplotlib.use('Agg')
import random
import numpy as np
import matplotlib.pyplot as plt
import gridworld
from tqdm import tqdm
class PolicyIteration:
def __init__(self, env, gamma):
self.env = env
self.num_states = self.env.num_states
self.num_actions = self.env.num_actions
self.max_num_steps = self.env.max_num_steps
self.gamma = gamma #discount factor
self.values = np.zeros(self.num_states) #Initialize `values` as zeros
self.policy = np.random.randint(0, self.num_actions, self.num_states)
def one_policy_evaluation(self):
"""
Runs one iteration of policy evaluation and updates the value function.
:return: the maximum change in value function
"""
delta = 0
for s in range(self.num_states):
v = self.values[s]
a = self.policy[s]
(s_new, r, _) = self.env.step(a)
p = self.env.p(s_new, s, a)
""" update V(s)"""
self.values[s] = np.sum(p * (r + self.gamma * self.values[s_new]))
delta = max(delta, abs(v - self.values[s]))
return delta
def run_policy_evaluation(self, tol = 1e-3):
"""
Runs policy evaluation until convergence.
:param tol: the tolerance level for convergence
:return: the number of iterations of policy evaluation until convergence
"""
delta = self.one_policy_evaluation()
delta_history = [delta]
while delta > tol:
delta = self.one_policy_evaluation()
delta_history.append(delta)
return len(delta_history)
def run_policy_improvement(self):
update_policy_count = 0
for s in range(self.num_states):
temp = self.policy[s]
v_list = np.zeros(self.num_actions)
for a in range(self.num_actions):
(s_new, r, _) = self.env.step(a)
p = self.env.p(s_new, s, a)
v_list[a] = np.sum(p * (r + self.gamma * self.values[s_new]))
self.policy[s] = np.argmax(v_list)
if temp != self.policy[s]:
update_policy_count += 1
return update_policy_count
def train(self, tol=1e-3, max_iters=100, plot=True):
eval_count = self.run_policy_evaluation(tol)
eval_count_history = [eval_count]
policy_change = self.run_policy_improvement()
policy_change_history = [policy_change]
epoch = 0
val_history= []
for i in tqdm(range(max_iters)):
epoch += 1
new_eval_count = self.run_policy_evaluation(tol)
new_policy_change = self.run_policy_improvement()
eval_count_history.append(new_eval_count)
policy_change_history.append(new_policy_change)
val_history.append(np.mean(self.values))
if new_policy_change == 0:
break
print(f'# epoch: {len(policy_change_history)}')
print(f'eval count = {eval_count_history}')
print(f'policy change = {policy_change_history}')
if plot is True:
plt.figure(dpi=200)
plt.plot(val_history)
plt.tight_layout()
plt.savefig('policy_iteration.png')
plt.show()
def main():
env = gridworld.GridWorld(hard_version=False)
agent = PolicyIteration(env, gamma=0.95)
agent.train()
if __name__ == '__main__':
main()
However, based on the figure generated, the sequence of values oscillates up and down and never converges. I followed the algorithm in the Sutton book step by step and can't find any issue with my code yet.
Any help is greatly appreciated!
I'm trying to create an RL model to train a football agent using the gfootball library.
I've followed the steps from their documentation, but I keep getting this annoying error and I don't know what to do. I've already searched on Google with no success.
This is the code that I wrote:
import gfootball.env as football_env
import numpy as np
# Hyperparameters
alpha = 0.1 # learning rate
gamma = 0.99 # discount factor
epsilon = 0.1 # exploration rate
# Create environment
env = football_env.create_environment(env_name='academy_empty_goal', representation='pixels')
# Initialize Q-table
q_table = np.zeros((env.observation_space.shape[0], env.action_space.n))
for episode in range(1, 10001):
done = False
obs, reward_1 = env.reset()
total_reward = 0
while not done:
# Choose action with epsilon-greedy strategy
if np.random.uniform(0, 1) < epsilon:
action = env.action_space.sample() # explore
else:
action = np.argmax(q_table[obs]) # exploit
next_obs, reward, done, _ = env.step(action)
total_reward += reward
# Update Q-value
q_table[obs][action] = (1 - alpha) * q_table[obs][action] + \
alpha * (reward + gamma * np.max(q_table[next_obs]))
obs = next_obs
if episode % 100 == 0:
print("Episode: {}, Total Reward: {}".format(episode, total_reward))
# Save the q_table
np.save("q_table", q_table)
# Load the q_table
q_table = np.load("q_table.npy")
# Test the agent
obs = env.reset()
done = False
while not done:
action = np.argmax(q_table[obs])
obs, reward, done, info = env.step(action)
env.render()
And this is the error
My agent's objective is to control the speed of a motor. Here, every state value is the motor's rpm, and the actions are defined as 0 (decrease rpm by 1), 1 (no change), and 2 (increase rpm by 1). I am using Q-learning.
class SpeedControlEnv(Env): #we can access gym env
def __init__(self): #initializing actions, observation, spaces,
#Actions that we can take up, down, no change in speed
self.action_space = Discrete(3)
# Observation space to hold the current speed so our agent can take necessary action
#self.observation_space = Box(low=np.array([0]), high=np.array([100])) #box is used for continuous state space
self.observation_space = Discrete(100) #discrete observation space
#set start speed
self.state = 40 + random.randint(-30,40) #this is the start state for my agent.
# set time for our agent to complete the task before my motor blows
self.control_length = 60 # this is in seconds : So my agent has 'n' seconds to bring it in normal state
def step(self, action): #what actions agent can take in each steps
#Take action (0,1,2)
# 0 -1 = -1 #decreaase speed by 1
#1 -1 = 0 #no change
#2- 1 = 1 #increase speed by 1
self.state += action -1
#with each action reduce the time my agent has by 1
self.control_length -= 1
# assign reward
if self.state >= 40 and self.state <= 45:
reward = 1
else:
reward = -1
# check if the episode is done
if self.control_length <= 0:
done = True
else:
done = False
#apply random noise
#self.state += random.randint(-3,3)
#set placeholder for information, Required by OpenAI
info = {}
#return step information
return self.state, reward, done , info
pass
def render(self): #visualization
pass
def reset(self): #reset after training or a episode
#Reset speed that is the state
self.state = 40 + random.randint(-30,40)
#Reset control time
self.control_length = 60
return self.state
#I defined my hyperparameters as follows
#Initialize all the hyperparameters
num_episodes = 50000 #agent plays step
#max_steps_per_episode = 60 #max steps agent can take in one episode
learning_rate = 0.1 #alpha
discount_rate = 0.99 #gamma
exploration_rate = 1 #epsilon
max_exploration_rate = 1 #max epsilon
min_exploration_rate = 0.01 #min epsilon
exploration_decay_rate = 0.01 #decaying rate of exploration
#My Q-learning code is as follows
for episode in range(num_episodes):
state = env.reset()
done = False
reward_current_episode = 0
for step in range(env.control_length):
exploration_rate_threshold = random.uniform(0,1)
if exploration_rate_threshold > exploration_rate:
action = np.argmax(q_table[state,:])
else:
action = env.action_space.sample()
new_state,reward,done,info = env.step(action)
#Update Q table
q_table[state,action] = q_table[state, action]*(1-learning_rate) + learning_rate*(reward + discount_rate*np.max(q_table[new_state,:]))
state = new_state
reward_current_episode += reward
if done == True:
break
exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate)*np.exp(-exploration_decay_rate*episode)
#append rewards from current episode to the list of rewards achieved from all episode
reward_from_all_episodes.append(reward_current_episode)
each_state.append(state)
#error
IndexError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_6020/777037272.py in <module>
14
15 #Update Q table
---> 16 q_table[state,action] = q_table[state, action]*(1-learning_rate) + learning_rate*(reward + discount_rate*np.max(q_table[new_state,:]))
17
18 state = new_state
IndexError: index 100 is out of bounds for axis 0 with size 100
It would be great if anyone could explain why I am getting this error. I am new to programming and machine learning.
It looks like you're trying to index a NumPy array. Arrays, and just about everything in Python and programming in general, are 0-indexed. That means their indices start at 0 instead of 1, so the maximum valid index in an array with 100 items is 99.
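As a minimal sketch of what is going on (the question does not show how q_table was built, so the shape below is an assumption based on observation_space = Discrete(100) and action_space = Discrete(3)): the rpm state can start as high as 80 and can then be incremented for up to 60 steps, so it can walk past 99, the last valid row index. Clipping the state before using it as an index is one possible way to keep the lookup valid; enlarging the table or widening the observation space would be alternatives.
import numpy as np

q_table = np.zeros((100, 3))   # assumed shape: 100 discrete states x 3 actions

state = 80                     # 40 + random.randint(-30, 40) can start this high
for _ in range(60):            # the episode allows up to 60 "increase rpm" actions
    state += 1                 # state reaches 140, well past the last row index 99

# q_table[state, :] would now raise IndexError (index out of bounds for axis 0 with size 100).
# One option is to clamp the state to the table's index range, e.g. inside step():
state = int(np.clip(state, 0, q_table.shape[0] - 1))
print(q_table[state, :])       # valid lookup at row 99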
While exploring actor-critic methods, I have been trying to speed up my program using multiprocessing. The code runs fine until the point where I start using processes: it starts, but then never finishes even one episode (it keeps running, showing looping behavior). I have been searching for possible errors, and as far as I can tell, the fault may be that each process spawns its own sub-process (?). That said, I really want to find out how to fix the multiprocessing part so that my program will run, so I would really appreciate any help.
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import time
import random
import multiprocessing as mp
from keras.layers import Reshape, BatchNormalization
from keras.layers.embeddings import Embedding
from collections import deque
# Configuration parameters for the whole setup
gamma = 1 # Discount factor for past rewards
max_steps_per_episode = 2000
env = gym.make("Taxi-v3").env # Create the environment
eps = 1e-6
num_inputs = 1
num_actions = 6
num_hidden = 64
simulated_epsilon = 0
# Actor Policy Network
inputs_1 = layers.Input(shape=(num_inputs,))
embed = layers.Embedding(500, 10, input_length=num_inputs)(inputs_1)
reshape = layers.Reshape((10 * num_inputs, ))(embed)
common = layers.Dense(num_hidden * 2, activation="relu")(reshape)
common = layers.Dense(num_hidden, activation="relu")(common)
action = layers.Dense(num_actions, activation="softmax")(common)
model_1 = keras.Model(inputs=inputs_1, outputs=action)
# Critic Reward Network
inputs_2 = layers.Input(shape=(num_inputs,))
embed_2 = layers.Embedding(500, 10, input_length=num_inputs)(inputs_2)
reshape_2 = layers.Reshape((10, ))(embed_2)
common_2 = layers.Dense(num_hidden * 2, activation="relu")(reshape_2)
common_2 = layers.Dense(num_hidden, activation="relu")(common_2)
critic = layers.Dense(1)(common_2)
model_2 = keras.Model(inputs=inputs_2, outputs=critic)
# Optimizer and Loss Function
optimizer = keras.optimizers.Adam(learning_rate=5e-4)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0
under_20 = 0
model_2.compile(optimizer = keras.optimizers.Adam(learning_rate=5e-4), loss=huber_loss)
def worker(number, env, actor, critic):
optimizer = keras.optimizers.Adam(learning_rate=5e-4)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0
under_20 = 0
while True: # Run until solved
state = env.reset()
episode_reward = 0
penalties = 0
drop = 0
print("Episode {} begins ({})".format(episode_count, number))
env.render()
start = time.time()
time_solve = 0
with tf.GradientTape() as tape_1, tf.GradientTape() as tape_2:
#with tf.GradientTape() as tape:
#while True:
for _ in range(1, max_steps_per_episode + 1):
#env.render() # Adding this line would show the attempts
# of the agent in a pop up window.
state = tf.convert_to_tensor(state)
state = tf.expand_dims(state, 0)
# Predict action probabilities and estimated future rewards
# from environment state
action_probs = actor(state)
critic_value = critic(state)
critic_value_history.append((state, critic_value[0, 0]))
# Choose action
action = np.random.choice(num_actions, p=np.squeeze(action_probs))
action_probs_history.append(tf.math.log(action_probs[0, action])) # action_probs stores log of probs of action
#if timestep == 1:
# print("{}: {}".format(state, action_probs))
# print("{}: {}".format(state, action))
# Apply the sampled action in our environment
state, reward, done, _ = env.step(action)
rewards_history.append(reward)
episode_reward += reward
time_solve += 1
if reward == -10:
penalties += 1
elif reward == 20:
drop += 1
if done:
break
# Update running reward to check condition for solving
running_reward = (running_reward * (episode_count) + episode_reward) / (episode_count + 1)
# Calculate expected value from rewards
# - At each timestep what was the total reward received after that timestep
# - Rewards in the past are discounted by multiplying them with gamma
# - These are the labels for our critic
returns = deque(maxlen=3500)
discounted_sum = 0
for r in rewards_history[::-1]:
discounted_sum = r + gamma * discounted_sum
returns.appendleft(discounted_sum)
# Normalize
#returns = np.array(returns)
returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
#returns = returns.tolist()
# Calculating loss values to update our network
history = zip(action_probs_history, critic_value_history, returns)
loss_value_actor = 0
loss_value_critic = 0
for log_prob, value, ret in history:
diff = ret - value[1]
loss_value_actor += -log_prob * diff
loss_value_critic += huber_loss(tf.expand_dims(value[1], 0), tf.expand_dims(ret, 0))
# Backpropagation
#loss_value_actor /= time_solve
#loss_value_critic /= time_solve
if episode_count % 2 == 1:
grads_1 = tape_1.gradient(loss_value_actor, model_1.trainable_variables)
optimizer.apply_gradients(zip(grads_1, model_1.trainable_variables))
grads_2 = tape_2.gradient(loss_value_critic, model_2.trainable_variables)
optimizer.apply_gradients(zip(grads_2, model_2.trainable_variables))
# Clear the loss and reward history
action_probs_history.clear()
critic_value_history.clear()
rewards_history.clear()
# Copy params
actor.set_weights(model_1.get_weights())
critic.set_weights(model_2.get_weights())
# Log details
end = time.time()
episode_count += 1
if episode_count % 1 == 0:
env.render()
template = "average reward: {:.2f}"
print(template.format(running_reward, episode_count))
print("episode reward: {}".format(episode_reward))
print("Steps taken: {}".format(time_solve))
print("Penalties incurred: {}".format(penalties))
print("Passengers dropped off: {}".format(drop))
print("Time taken: {}".format(end - start))
print()
if running_reward > -50: # Condition to consider the task solved
under_20 += 1
if under_20 > 5:
print("Solved at episode {} !".format(episode_count))
break
num_processes = 5
if __name__ == "__main__":
mp.freeze_support()
envs = []
processes = []
actors = []
critics = []
for i in range(num_processes):
envs.append(gym.make("Taxi-v3").env)
for i in range(num_processes):
t = mp.Process(target=worker, args=(i, envs[i], model_1, model_2))
t.start()
time.sleep(0.5)
processes.append(t)
for process in processes:
process.join()
for process in processes:
process.terminate()