I am trying to implement policy iteration from scratch. I have a 2D grid-world environment named GridWorld that returns the successor state and reward for a given action, and it also has a function that returns the transition probability. Below is my code for policy iteration:
import matplotlib
matplotlib.use('Agg')
import random
import numpy as np
import matplotlib.pyplot as plt
import gridworld
from tqdm import tqdm
class PolicyIteration:
def __init__(self, env, gamma):
self.env = env
self.num_states = self.env.num_states
self.num_actions = self.env.num_actions
self.max_num_steps = self.env.max_num_steps
self.gamma = gamma #discount factor
self.values = np.zeros(self.num_states) #Initialize `values` as zeros
self.policy = np.random.randint(0, self.num_actions, self.num_states)
def one_policy_evaluation(self):
"""
Runs one iteration of policy evaluation and updates the value function.
:return: the maximum change in value function
"""
delta = 0
for s in range(self.num_states):
v = self.values[s]
a = self.policy[s]
(s_new, r, _) = self.env.step(a)
p = self.env.p(s_new, s, a)
""" update V(s)"""
self.values[s] = np.sum(p * (r + self.gamma * self.values[s_new]))
delta = max(delta, abs(v - self.values[s]))
return delta
def run_policy_evaluation(self, tol = 1e-3):
"""
Runs policy evaluation until convergence.
:param tol: the tolerance level for convergence
:return: the number of iterations of policy evaluation until convergence
"""
delta = self.one_policy_evaluation()
delta_history = [delta]
while delta > tol:
delta = self.one_policy_evaluation()
delta_history.append(delta)
return len(delta_history)
def run_policy_improvement(self):
update_policy_count = 0
for s in range(self.num_states):
temp = self.policy[s]
v_list = np.zeros(self.num_actions)
for a in range(self.num_actions):
(s_new, r, _) = self.env.step(a)
p = self.env.p(s_new, s, a)
v_list[a] = np.sum(p * (r + self.gamma * self.values[s_new]))
self.policy[s] = np.argmax(v_list)
if temp != self.policy[s]:
update_policy_count += 1
return update_policy_count
def train(self, tol=1e-3, max_iters=100, plot=True):
eval_count = self.run_policy_evaluation(tol)
eval_count_history = [eval_count]
policy_change = self.run_policy_improvement()
policy_change_history = [policy_change]
epoch = 0
val_history= []
for i in tqdm(range(max_iters)):
epoch += 1
new_eval_count = self.run_policy_evaluation(tol)
new_policy_change = self.run_policy_improvement()
eval_count_history.append(new_eval_count)
policy_change_history.append(new_policy_change)
val_history.append(np.mean(self.values))
if new_policy_change == 0:
break
print(f'# epoch: {len(policy_change_history)}')
print(f'eval count = {eval_count_history}')
print(f'policy change = {policy_change_history}')
if plot is True:
plt.figure(dpi=200)
plt.plot(val_history)
plt.tight_layout()
plt.savefig('policy_iteration.png')
plt.show()
def main():
env = gridworld.GridWorld(hard_version=False)
agent = PolicyIteration(env, gamma=0.95)
agent.train()
if __name__ == '__main__':
main()
However, based on the figure generated, the sequence of values oscillates up and down and never converges. I followed the algorithm in the Sutton book step by step and can't find any issue in my code yet.
Any help is greatly appreciated!
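For reference, the iterative policy-evaluation step in Sutton and Barto takes an expectation over all successor states weighted by their transition probabilities, rather than using a single sampled transition. Below is a minimal sketch of one such sweep, assuming a hypothetical model-based interface: env.p(s_new, s, a) returning P(s'|s, a) as in the code above, plus a made-up env.r(s, a, s_new) for the expected reward. Neither name is guaranteed to match the actual GridWorld API.
def one_policy_evaluation_sweep(env, values, policy, gamma):
    # One in-place sweep of iterative policy evaluation (Sutton & Barto, ch. 4).
    # Hypothetical interface assumed for illustration only:
    #   env.p(s_new, s, a) -> P(s_new | s, a)
    #   env.r(s, a, s_new) -> expected reward for that transition (made-up name)
    delta = 0.0
    for s in range(env.num_states):
        v_old = values[s]
        a = policy[s]
        # Expectation over every successor state, not one sampled env.step(a).
        values[s] = sum(
            env.p(s_new, s, a) * (env.r(s, a, s_new) + gamma * values[s_new])
            for s_new in range(env.num_states)
        )
        delta = max(delta, abs(v_old - values[s]))
    return delta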
While exploring actor-critic methods, I have been trying to speed up my program using multiprocessing. The code runs fine until the point where I start using processes: it starts, but then never finishes even one episode (it keeps running, showing looping behavior). I have been searching for possible errors, and as far as I can tell the fault may be that each process spawns its own sub-process (?). That said, I really want to find out how to fix the multiprocessing part so that my program runs, and I would appreciate any help.
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import time
import random
import multiprocessing as mp
from keras.layers import Reshape, BatchNormalization
from keras.layers.embeddings import Embedding
from collections import deque
# Configuration parameters for the whole setup
gamma = 1 # Discount factor for past rewards
max_steps_per_episode = 2000
env = gym.make("Taxi-v3").env # Create the environment
eps = 1e-6
num_inputs = 1
num_actions = 6
num_hidden = 64
simulated_epsilon = 0
# Actor Policy Network
inputs_1 = layers.Input(shape=(num_inputs,))
embed = layers.Embedding(500, 10, input_length=num_inputs)(inputs_1)
reshape = layers.Reshape((10 * num_inputs, ))(embed)
common = layers.Dense(num_hidden * 2, activation="relu")(reshape)
common = layers.Dense(num_hidden, activation="relu")(common)
action = layers.Dense(num_actions, activation="softmax")(common)
model_1 = keras.Model(inputs=inputs_1, outputs=action)
# Critic Reward Network
inputs_2 = layers.Input(shape=(num_inputs,))
embed_2 = layers.Embedding(500, 10, input_length=num_inputs)(inputs_2)
reshape_2 = layers.Reshape((10, ))(embed_2)
common_2 = layers.Dense(num_hidden * 2, activation="relu")(reshape_2)
common_2 = layers.Dense(num_hidden, activation="relu")(common_2)
critic = layers.Dense(1)(common_2)
model_2 = keras.Model(inputs=inputs_2, outputs=critic)
# Optimizer and Loss Function
optimizer = keras.optimizers.Adam(learning_rate=5e-4)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0
under_20 = 0
model_2.compile(optimizer = keras.optimizers.Adam(learning_rate=5e-4), loss=huber_loss)
def worker(number, env, actor, critic):
optimizer = keras.optimizers.Adam(learning_rate=5e-4)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0
under_20 = 0
while True: # Run until solved
state = env.reset()
episode_reward = 0
penalties = 0
drop = 0
print("Episode {} begins ({})".format(episode_count, number))
env.render()
start = time.time()
time_solve = 0
with tf.GradientTape() as tape_1, tf.GradientTape() as tape_2:
#with tf.GradientTape() as tape:
#while True:
for _ in range(1, max_steps_per_episode + 1):
#env.render() # Adding this line would show the attempts
# of the agent in a pop up window.
state = tf.convert_to_tensor(state)
state = tf.expand_dims(state, 0)
# Predict action probabilities and estimated future rewards
# from environment state
action_probs = actor(state)
critic_value = critic(state)
critic_value_history.append((state, critic_value[0, 0]))
# Choose action
action = np.random.choice(num_actions, p=np.squeeze(action_probs))
action_probs_history.append(tf.math.log(action_probs[0, action])) # action_probs stores log of probs of action
#if timestep == 1:
# print("{}: {}".format(state, action_probs))
# print("{}: {}".format(state, action))
# Apply the sampled action in our environment
state, reward, done, _ = env.step(action)
rewards_history.append(reward)
episode_reward += reward
time_solve += 1
if reward == -10:
penalties += 1
elif reward == 20:
drop += 1
if done:
break
# Update running reward to check condition for solving
running_reward = (running_reward * (episode_count) + episode_reward) / (episode_count + 1)
# Calculate expected value from rewards
# - At each timestep what was the total reward received after that timestep
# - Rewards in the past are discounted by multiplying them with gamma
# - These are the labels for our critic
returns = deque(maxlen=3500)
discounted_sum = 0
for r in rewards_history[::-1]:
discounted_sum = r + gamma * discounted_sum
returns.appendleft(discounted_sum)
# Normalize
#returns = np.array(returns)
returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
#returns = returns.tolist()
# Calculating loss values to update our network
history = zip(action_probs_history, critic_value_history, returns)
loss_value_actor = 0
loss_value_critic = 0
for log_prob, value, ret in history:
diff = ret - value[1]
loss_value_actor += -log_prob * diff
loss_value_critic += huber_loss(tf.expand_dims(value[1], 0), tf.expand_dims(ret, 0))
# Backpropagation
#loss_value_actor /= time_solve
#loss_value_critic /= time_solve
if episode_count % 2 == 1:
grads_1 = tape_1.gradient(loss_value_actor, model_1.trainable_variables)
optimizer.apply_gradients(zip(grads_1, model_1.trainable_variables))
grads_2 = tape_2.gradient(loss_value_critic, model_2.trainable_variables)
optimizer.apply_gradients(zip(grads_2, model_2.trainable_variables))
# Clear the loss and reward history
action_probs_history.clear()
critic_value_history.clear()
rewards_history.clear()
# Copy params
actor.set_weights(model_1.get_weights())
critic.set_weights(model_2.get_weights())
# Log details
end = time.time()
episode_count += 1
if episode_count % 1 == 0:
env.render()
template = "average reward: {:.2f}"
print(template.format(running_reward, episode_count))
print("episode reward: {}".format(episode_reward))
print("Steps taken: {}".format(time_solve))
print("Penalties incurred: {}".format(penalties))
print("Passengers dropped off: {}".format(drop))
print("Time taken: {}".format(end - start))
print()
if running_reward > -50: # Condition to consider the task solved
under_20 += 1
if under_20 > 5:
print("Solved at episode {} !".format(episode_count))
break
num_processes = 5
if __name__ == "__main__":
mp.freeze_support()
envs = []
processes = []
actors = []
critics = []
for i in range(num_processes):
envs.append(gym.make("Taxi-v3").env)
for i in range(num_processes):
t = mp.Process(target=worker, args=(i, envs[i], model_1, model_2))
t.start()
time.sleep(0.5)
processes.append(t)
for process in processes:
process.join()
for process in processes:
process.terminate()
I used this gym library to try to get this model to learn, but I don't think it learns from experience. Something is wrong, but I can't figure out what.
I have played with DISCOUNT, LEARNING_RATE, and DISCRETE_OS_SIZE and still nothing. Do I have to create a neural network for this example, or can I just use the formula to derive the Q values?
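For reference, the tabular update rule that the loop in the code below already applies (no neural network involved) is:
new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
where current_q is the table entry Q(state, action) and max_future_q is the largest Q value available from the next state. This is only restating what the code does, not a change to it.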
import gym
import numpy as np
LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 25000
env = gym.make("MountainCar-v0")
DISCRETE_OS_SIZE = [20, 20]
discrete_os_win_size = (env.observation_space.high - env.observation_space.low)/DISCRETE_OS_SIZE
q_tables = np.random.uniform(low = -2, high = 0, size = (DISCRETE_OS_SIZE + [env.action_space.n]))
def get_discrete_state(state):
discrete_state = (state - env.observation_space.low)/DISCRETE_OS_SIZE
return tuple(discrete_state.astype(np.int)) # we use this tuple to look up the 3 Q values for the available actions in the q-table
# Exploration settings
epsilon = 1  # not a constant, going to be decayed
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES//2
epsilon_decay_value = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)
SHOW_EVERY = 1000
done = False
for episode in range(EPISODES):
discrete_state = get_discrete_state(env.reset())
done = False
if episode % SHOW_EVERY == 0:
render = True
print(episode)
else:
render = False
while not done:
if np.random.random() > epsilon:
# Get action from Q table
action = np.argmax(q_tables[discrete_state])
else:
# Get random action
action = np.random.randint(0, env.action_space.n)
new_state, reward, done, _ = env.step(action)  # state = position and velocity
new_discrete_state = get_discrete_state(new_state)
if render:
env.render()
if not done:
max_future_q = np.max(q_tables[new_discrete_state]) # Maximum possible Q value in next step (for new state)
current_q = q_tables[discrete_state + (action,)]# Current Q value (for current state and performed action)
new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)# And here's our equation for a new Q value for current state and action
q_tables[discrete_state + (action,)] = new_q# Update Q table with new Q value
# Simulation ended (for any reason) - if goal position is achieved - update Q value with reward directly
elif new_state[0] >= env.goal_position:
q_tables[discrete_state + (action,)] = 0
print(episode)
discrete_state = new_discrete_state
# Decaying is being done every episode if episode number is within decaying range
if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
epsilon -= epsilon_decay_value
env.close()
I have found what the problem was.
In the line discrete_state = (state - env.observation_space.low)/DISCRETE_OS_SIZE I was dividing (state - env.observation_space.low) by DISCRETE_OS_SIZE. I am better off dividing by discrete_os_win_size instead, which solved my problem.
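In code, the fix is just the divisor in the helper. A minimal sketch of the corrected function, using plain int instead of the deprecated np.int:
def get_discrete_state(state):
    # Divide by the width of one bucket, not by the number of buckets.
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    return tuple(discrete_state.astype(int))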
A question: multiprocessing is supposed to make code faster, but after using the following framework, getting the outputs takes the same amount of time as the normal code, or even longer.
import multiprocessing
def code() :
my code
if __name__ == '__main__' :
p = multiprocessing.Process(target = code)
p.start()
p.join()
Because my laptop has two processors, after running this code the program prompts me for the input data twice.
The real problem is the run time, which makes no sense to me: the run takes just as long as the normal code without parallelism.
import numpy as np
from scipy.integrate import odeint
from math import *
from scipy.integrate import quad
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")
#you need add the following 3 lines
from multiprocessing import Pool
from multiprocessing import Process
import multiprocessing
print("Model 4, Equation 11")
print("")
###################### STEP NUMBER #######################
N = int(input("PLEASE ENTER NUMBER OF STEP WALKS: ")) # Step walk by user
dec=int(input("NUMBER OF DECIMAL PLACES OF OUTPUTS (RECOMENDED 10-15)?"))
print("")
print("PLEASE WAIT, METROPOLIS HASTINGS IS RUNNING ... ")
print("")
def FIT():
##########################################################
od0o = np.zeros((N,))
od0o[0]=0.72
od0n = np.zeros((N,))
Mo = np.zeros((N,))
Mo[0]= 0
Mn = np.zeros((N,))
co = np.zeros((N,))
co[0]= 0.84
cn = np.zeros((N,))
bo = np.zeros((N,))
bo[0]= 0.02
bn = np.zeros((N,))
H0o = np.zeros((N,))
H0o[0]= 70
H0n = np.zeros((N,))
Orco = np.zeros((N,))
Orco[0]= 0.0003
Orcn = np.zeros((N,))
temp=1e10 # a big number
##########################################################
CovCMB=[[3.182,18.253,-1.429],
[18.253,11887.879,-193.808],
[-1.429,-193.808,4.556]] # CMB DATA
##########################################################
def OD_H(U,z,c,b,Orc):
od, H = U
Omegai = 3 * b * ((1- od - 2*(Orc)**0.5) + (1- od - 2*(Orc)**0.5)**2/(1-2*(Orc)**0.5)) #equation 10
div1=np.divide((c**2-od),(1+z),where=(1+z)!=0)
div2=np.divide(H ,(1+z),where=(1+z)!=0)
dMdt = (div1)*(6*od+6-9*(od/c**2)+ Omegai)*(1+c**2+od*(1-3/c**2))**(-1)
dHdt = (div2)*(6*od+6-9*(od/c**2)+ Omegai)*(1+c**2+od*(1-3/c**2))**(-1)
return [dMdt, dHdt]
def solution(H0,z1,z,od0,c,b,Orc):
U = odeint(OD_H,[od0,H0],[z1,z], args=(c,b,Orc))[-1]
od, H = U
return H
##########################################################
def DMCMB1(H0,z1,z,od0,c,b,Orc):
dm = 1090 * 1/solution(H0,z1,z,od0,c,b,Orc)
return dm
def R1(H0,z1,z,od0,c,b,Orc):
#r=sqrt(Om)*(70/299000)*rszstar(z,Om,Od)
r = sqrt(1-od0-2*(Orc)**0.5)*DMCMB1(H0,z1,z,od0,c,b,Orc)
return r
def lA1(H0,z1,z,od0,c,b,Orc):
la=((3.14*299000/H0)*DMCMB1(H0,z1,z,od0,c,b,Orc))/(153)
return la
def CMBMATRIX1(H0,z1,z,od0,c,b,Orc,M):
hmCMB1=[lA1(H0,z1,z,od0,c,b,Orc)-301.57, R1(H0,z1,z,od0,c,b,Orc)-1.7382+M, 0.0222-0.02262]
vmCMB1=[[lA1(H0,z1,z,od0,c,b,Orc)-301.57], [R1(H0,z1,z,od0,c,b,Orc)-1.7382], [0.0222-0.02262]]
fmCMB1=np.dot(hmCMB1,CovCMB)
smCMB1=np.dot(fmCMB1,vmCMB1)[0]
return smCMB1
######################################################
def TOTAL(H0, od0, c, b,Orc, M) :
total = CMBMATRIX1(H0,0,1090,od0,c,b,Orc,M)
return total
######################################################
################## MCMC - MH #########################
highest=0
pat='C:/Users/21/Desktop/MHT/Models/outputs'
file_path = os.path.join(pat,'Model3.txt')
file_path2 = os.path.join(pat,'Model3min.txt')
with open(file_path, 'w') as f: # DATA WILL BE SAVED IN THIS FILE, PLEASE BECAREFUL TO CHANGE THE NAME IN EACH RUN TO AVOIDE REWRITING.
with open(file_path2, 'w') as d:
for i in range (1,N):
num = 0
R = np.random.uniform(0,1)
while True:
num += 1
od0n[i] = od0o[i-1] + 0.001 * np.random.normal()
H0n[i] = H0o[i-1] + 0.01 * np.random.normal()
bn[i] = bo[i-1] + 0.001 * np.random.normal()
cn[i] = co[i-1] + 0.001 * np.random.normal()
Mn[i] = Mo[i-1] + 0.01 * np.random.normal()
Orcn[i] = Orco[i-1] + 0.00001 * np.random.normal()
L = np.exp(-0.5 * (TOTAL(H0n[i], od0n[i], cn[i], bn[i],Orcn[i], Mn[i]) - TOTAL(H0o[i-1], od0o[i-1], co[i-1], bo[i-1],Orco[i-1], Mo[i-1]))) # L = exp(-( x^2 )/2)
LL=min(1,max(L,0))
if LL>R:
od0o[i]= od0n[i]
H0o[i] = H0n[i]
bo[i] = bn[i]
co[i] = cn[i]
Mo[i] = Mn[i]
Orco[i] = Orcn[i]
chi = TOTAL(H0o[i], od0o[i], co[i], bo[i],Orco[i], Mo[i])
else:
od0o[i]= od0o[i-1]
H0o[i] = H0o[i-1]
bo[i] = bo[i-1]
co[i] = co[i-1]
Mo[i] = Mo[i-1]
Orco[i] = Orco[i-1]
chi = TOTAL(H0o[i], od0o[i], co[i], bo[i],Orco[i], Mo[i])
if (Mo[i]>0 and 0<bo[i]<0.09 and Orco[i]>0) or num>100: # constraining the value to stay in positive area
highest = max(num, highest)
break
f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\n".format(round(chi,dec),' ',round(H0o[i],dec),' ',round(od0o[i],dec),' ',
round(co[i],dec),' ',round(bo[i],dec),' ',
round(Orco[i],dec),' ',round(Mo[i],dec)))
if chi<temp:
temp=chi
aa = H0o[i]
bb = od0o[i]
cc = co[i]
dd = bo[i]
ee = Mo[i]
ff=Orco[i]
Om=1-2*sqrt(Orco[i])-od0o[i]
# minimum of chi and its parameters
d.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12},\t{13}\t{14}\n".format(round(temp,dec), "H =", round(aa,dec), "Orc=",
round(ff,dec), "OD =",round(bb,dec),"c =",
round(cc,dec),"b =", round(dd,dec),
"M =",round(ee,dec),"Om =",round(Om,dec)))
print(round(temp,dec), "H =", round(aa,dec), "Orc=",round(ff,dec), "OD =",round(bb,dec),"c =", round(cc,dec),"b =", round(dd,dec), "M =",round(ee,dec),"Om =",round(Om,dec))
#print(highest)
print("")
#test = input("Press the enter to exit...")
#print(test)
if __name__ == '__main__':
p = multiprocessing.Process(target=FIT)
p.start()
p.join()
I think you missed the main concept of multiprocessing. It does not make your code run faster by itself; it just lets you run something in another process to bypass the GIL (https://wiki.python.org/moin/GlobalInterpreterLock).
It can be used to parallelize the computation of some function over different input values, as in this example from the docs:
from multiprocessing import Pool
def f(x):
return x*x
if __name__ == '__main__':
p = Pool(5)
print(p.map(f, [1, 2, 3]))
This computes f in separate worker processes, with each process returning its own value; p.map collects the results (here [1, 4, 9]) in order.
You are creating only one process, and all of your other logic is sequential; that is why there is no change in performance: all of your code still runs sequentially.
There are two different scenarios where you can use multiprocessing:
Totally independent functionalities: if you have functionalities that are totally independent and there is no requirement to run them sequentially, you can execute them in parallel, so that none of them has to wait for the others.
A good analogy is reading the newspaper and having breakfast: there is no need to do them one after the other, so we can do them at the same time.
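A minimal sketch of this first scenario, using two made-up placeholder functions (read_newspaper and have_breakfast are just illustrations, not taken from the code above):
from multiprocessing import Process

def read_newspaper():
    print("reading the newspaper")   # one independent task

def have_breakfast():
    print("having breakfast")        # another independent task

if __name__ == "__main__":
    p1 = Process(target=read_newspaper)
    p2 = Process(target=have_breakfast)
    p1.start()   # both tasks run at the same time in separate processes
    p2.start()
    p1.join()
    p2.join()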
Executing the same functionality for different inputs: if you are executing one functionality repeatedly for different inputs, you can run multiple instances of it on several inputs at a time.
For an analogy, think of a single ticket counter, then think of multiple ticket counters: the same functionality, multiple instances.
Find these scenarios in your code, then try to parallelize those functionalities.
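For example, one way the second scenario could apply to the code in your question is to run several independent Metropolis-Hastings chains at once. This is only a sketch under the assumption that FIT is refactored into something like run_chain(seed), taking its parameters as arguments and returning a result instead of calling input() and writing files inside the function; run_chain below is a hypothetical stand-in, not your actual FIT:
from multiprocessing import Pool
import numpy as np

def run_chain(seed):
    # Hypothetical stand-in for a refactored FIT(seed) that returns its result.
    rng = np.random.default_rng(seed)
    return rng.normal()   # placeholder for the chain's best-fit value

if __name__ == "__main__":
    with Pool(4) as pool:                           # one worker per independent chain
        results = pool.map(run_chain, [0, 1, 2, 3])
    print(results)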
Hope it helped.