Use Q-learning to solve a knapsack problem - python

The problem is: sugar is 1 gram for 1 dollar, a cookie is 7 grams for 5 dollars, and ice is 12 grams for 10 dollars. I have 29 dollars; what should I buy to get the heaviest total?
I have found the code below on the Internet, but I don't know how to modify it to solve my problem, i.e. how to change the data to what I want.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import itertools

item = pd.DataFrame(data=[[1, 1],
                          [6, 2],
                          [18, 5],
                          [22, 6],
                          [28, 7]],
                    columns=['Value', 'Weight'])
actions = list(range(len(item)))
limit_W = 11
gamma = 0.9


class RLforKnapsack():
    def __init__(self, limit_W, actions):
        self.limit_W = limit_W    # maximal weight
        self.epsilon = 0.9        # e-greedy algorithm
        self.gamma = 0.9          # reward decay
        self.alpha = 0.8          # learning rate
        self.actions = actions
        self.q_table = pd.DataFrame(columns=actions)
        self.done = False

    def check_state(self, knapsack):
        # a state is the sorted list of item indices already in the knapsack
        if str(knapsack) not in self.q_table.index:
            # append new state to q table
            q_table_new = pd.Series([np.NAN]*len(self.actions),
                                    index=self.q_table.columns,
                                    name=str(knapsack))
            # 0-1 knapsack: only items not yet taken are valid actions
            for i in list(set(self.actions).difference(set(knapsack))):
                q_table_new[i] = 0
            self.q_table = self.q_table.append(q_table_new)

    def choose_action(self, knapsack):
        self.check_state(knapsack)
        state_action = self.q_table.loc[str(knapsack), :]
        # shuffle state_action in case there are two or more maxima
        state_action = state_action.reindex(
            np.random.permutation(state_action.index)
        )
        if np.random.uniform() < self.epsilon:
            # choose best action
            action = state_action.idxmax()  # the first maximum
        else:
            # choose random action among the items not yet taken
            action = np.random.choice(
                list(set(self.actions).difference(set(knapsack)))
            )
        return action

    def greedy_action(self, knapsack):
        # testing: always choose the best action
        state_action = self.q_table.loc[str(knapsack), :]
        state_action = state_action.reindex(
            np.random.permutation(state_action.index)
        )
        action = state_action.idxmax()
        return action

    def take_action(self, knapsack, action):
        # take the item
        knapsack_ = knapsack + [action]
        knapsack_.sort()
        self.check_state(knapsack_)
        return knapsack_

    def rewardWithPenalty(self, knapsack_, action):
        # constraint: penalize exceeding the weight limit
        knapsack_W = np.sum([item['Weight'][i] for i in knapsack_])
        if knapsack_W > self.limit_W:
            r = -10
            self.done = True
        else:
            r = item['Value'][action]
        return r

    def update_qvalue(self, knapsack, knapsack_, action):
        self.done = False
        reward = self.rewardWithPenalty(knapsack_, action)
        q_predict = self.q_table.loc[str(knapsack), action]
        if len(knapsack) != len(self.actions):
            q_target = reward + self.gamma * self.q_table.loc[
                str(knapsack_), :].max()
        else:
            q_target = reward  # no item can be added
        self.q_table.loc[str(knapsack), action] += self.alpha * (
            q_target - q_predict)
        print("rl----")
        print(self.q_table)
        print("--------")
        return self.q_table, self.done


t1 = time()
plt.close('all')
RL = RLforKnapsack(limit_W=11, actions=actions)
for episode in range(100):
    print("episode--")
    print(episode)
    knapsack = []
    for step in range(5):
        print("step--")
        print(step)
        action = RL.choose_action(knapsack)
        print("action---")
        print(action)
        knapsack_ = RL.take_action(knapsack, action)
        q_table_RL, done = RL.update_qvalue(knapsack, knapsack_, action)
        knapsack = knapsack_
        if done:
            break
    plt.scatter(episode, q_table_RL.iloc[0, 3], c='r')
    plt.scatter(episode, q_table_RL.iloc[0, 4], c='b')
t2 = time()
plt.title([t2-t1, 'RL'])
plt.show()

# %% Policy based on q table
knapsack = []
# %%
action = RL.greedy_action(knapsack)
knapsack_ = RL.take_action(knapsack, action)
knapsack = knapsack_
np.sum([item['Weight'][i] for i in knapsack_])
print(np.sum([item['Weight'][i] for i in knapsack_]))
# %%
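If it helps, here is one way the stated numbers could be plugged into that script; this is only a sketch of my own, not the original author's setup. Grams go into the 'Value' column (that is what the reward uses), dollars go into the 'Weight' column (that is what the budget constraint uses), and limit_W becomes 29. One caveat: the script implements a 0-1 knapsack (each row can be taken at most once), while the stated problem is unbounded (each product can be bought repeatedly), so the sketch duplicates every product as many times as the budget allows. The hard-coded RLforKnapsack(limit_W=11, ...), the for step in range(5) cap, and the iloc[0, 3] / iloc[0, 4] plotting indices would also need updating for the new table.

import pandas as pd

products = [(1, 1), (7, 5), (12, 10)]   # (grams, dollars) for sugar, cookie, ice
budget = 29

# Duplicate each product as many times as the budget allows, so the 0-1
# formulation can still pick a product more than once.
copies = []
for grams, dollars in products:
    copies += [(grams, dollars)] * (budget // dollars)

item = pd.DataFrame(copies, columns=['Value', 'Weight'])   # Value = grams, Weight = dollars
actions = list(range(len(item)))
limit_W = budget   # replaces limit_W = 11 in the script above

# Brute-force check of the optimum (the instance is small enough to enumerate):
best = 0
for n_cookie in range(budget // 5 + 1):
    for n_ice in range(budget // 10 + 1):
        cost = 5 * n_cookie + 10 * n_ice
        if cost <= budget:
            n_sugar = budget - cost               # sugar costs 1 dollar per gram
            best = max(best, 7 * n_cookie + 12 * n_ice + n_sugar)
print(best)   # 39: five cookies (25 dollars) plus 4 grams of sugar

With 36 duplicated rows the subset state space grows quickly, so treat this purely as a starting point; the brute-force loop at the end gives a ground truth (39 grams) to compare any learned policy against.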

Related

My Custom Reinforcement Learning Environment is unable to extract values that I want

I am working on a grid optimization model: I import data from a dataset and train my RL model on a custom reinforcement learning environment. In this model I also want to extract the grid values and PV values step by step. While training, my custom environment does give me values for grid and PV, but when I run prediction it outputs an empty list. Why is that? What is the problem?
For importing the dataset:
def get_data(start='2017-01-01 00:00:00', end='2017-01-01 23:55:00'):
    # import standard load profiles
    slp = pd.read_csv('df_p.csv', index_col=0, parse_dates=True)['0'] / 1000
    slp = slp.resample('15min').mean() * 3
    pv = pd.read_csv('Solar_Data-2011.csv', delimiter=';',
                     index_col=0, parse_dates=False)["Generation"] * 3
    pv.index = slp.index
    print("Load values:")
    print(slp.values)
    print("PV values:")
    print(pv.values)
    start = pd.to_datetime(start)
    end = pd.to_datetime(end)
    return slp[start:end], pv[start:end]
This is my custom environment:
class CostEnv(Env):
    def __init__(self):
        # Actions we can take: increase in cost, lowering of cost
        self.action_space = Discrete(2)
        ### Get input data, just choose one day for now
        self.load, self.pv = get_data(start='2017-01-01 00:00:00', end='2017-01-01 23:55:00')
        self.pv_price = 0.10
        self.grid_price = 0.40
        self.line_max = 15
        self.grid_penalty = 100
        self.battery_max = 18
        self.battery_state = 10
        self.pv_values = []
        self.grid_values = []
        ###
        # Set episode length
        self.episode_length = len(self.load)
        self.observation_space = Dict(
            {
                "load": Box(0, self.episode_length - 1, shape=(2,), dtype=int),
                "pv": Box(0, self.episode_length - 1, shape=(2,), dtype=int),
            }
        )

    def step(self, action):
        # Apply action
        # 0 -1 = -1 decrease in cost
        # 1 -1 = 0 Increase in cost
        # self.state += action -1
        # Reduce episode length by 1 second
        # self.episode_length -= 1  ### Move to the end of the action
        ### We calculate the reward based on the price for the electricity,
        ### lower price, "higher" reward
        if action == 0:
            # Take all electricity from grid
            if self.load[len(self.load)-self.episode_length] > self.line_max:
                reward = self.load[len(self.load)-self.episode_length] * self.grid_price * -1 - abs(self.load[len(self.load)-self.episode_length] - self.line_max) * self.grid_penalty
                self.grid_values.append(self.load[len(self.load)-self.episode_length])
            else:
                reward = self.load[len(self.load)-self.episode_length] * self.grid_price * -1
                self.grid_values.append(self.load[len(self.load)-self.episode_length])
        elif action == 1:
            # Take all electricity from pv
            if self.pv[len(self.pv)-self.episode_length] >= self.load[len(self.load)-self.episode_length]:
                if abs(self.pv[len(self.pv)-self.episode_length] - self.load[len(self.load)-self.episode_length]) > self.line_max:
                    reward = self.load[len(self.load)-self.episode_length] * self.pv_price * -1 - (self.pv[len(self.pv)-self.episode_length] - self.load[len(self.load)-self.episode_length]) * self.grid_penalty
                    self.pv_values.append(self.pv[len(self.pv)-self.episode_length])
                else:
                    reward = self.load[len(self.load)-self.episode_length] * self.pv_price * -1
                    self.pv_values.append(self.pv[len(self.pv)-self.episode_length])
            else:
                if (self.load[len(self.load)-self.episode_length] - self.pv[len(self.pv)-self.episode_length]) > self.line_max:
                    reward = self.pv[len(self.pv)-self.episode_length] * self.pv_price * -1 - (self.load[len(self.load)-self.episode_length] - self.pv[len(self.pv)-self.episode_length]) * self.grid_price - (self.load[len(self.load)-self.episode_length] - self.pv[len(self.pv)-self.episode_length]) * self.grid_penalty
                    self.pv_values.append(self.pv[len(self.pv)-self.episode_length])
                else:
                    reward = self.pv[len(self.pv)-self.episode_length] * self.pv_price * -1
                    self.pv_values.append(self.pv[len(self.pv)-self.episode_length])
            ### This may lead the agent to always choose action 1,
            ### because it will always supply the demand and will always be cheaper.
        else:
            reward = -300000
            # Invalid action
            # raise ValueError(f'Invalid action: {action}')

        info = {}

        ### Observation
        observation = {
            "load": (0, self.load[len(self.load)-self.episode_length]),
            "pv": (0, self.pv[len(self.pv)-self.episode_length]),
        }

        ### Either here or before checking self.episode_length
        self.episode_length -= 1

        ### Check if timeseries is over
        if self.episode_length <= 0:
            done = True
        else:
            done = False

        # Return step information
        # return self.state, reward, done, info
        return observation, reward, done, info

    def render(self):
        # Implement viz
        pass

    def reset(self):
        self.done = False
        # Set episode length
        self.episode_length = len(self.load)
        observation = {
            "load": (0, self.load[len(self.load)-self.episode_length]),
            "pv": (0, self.pv[len(self.pv)-self.episode_length]),
        }
        return observation
Here is my model training:
log_path = os.path.join('Training', 'Logs')
model = A2C("MultiInputPolicy", env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=300000)
The values extracted by this model are as follows:
Env.grid_values: [0.4014,
0.342,
0.5357999999999999,
0.4698,
0.44999999999999996,
0.376,
0.521,
0.4293999999999999,
0.25140000000000007,
0.7412000000000001,
env.pv_values:[0.0,
0.0,
0.0,
0.0,
0.0,
0.0607460715,
0.0678108435,
0.07642341180000001,
Now, for prediction, I am using another dataset, which is loaded as follows:
def get_data(start='2017-01-01 00:00:00', end='2017-01-01 23:55:00'):
    # import standard load profiles
    slp = pd.read_csv('df_p.csv', index_col=0, parse_dates=True)['2'] / 1000
    slp = slp.resample('15min').mean() * 3
    pv = pd.read_csv('Solar_Data-2011.csv', delimiter=';',
                     index_col=0, parse_dates=False)["Generation"] * 3
    pv.index = slp.index
    print("Load values:")
    print(slp.values)
    print("PV values:")
    print(pv.values)
    start = pd.to_datetime(start)
    end = pd.to_datetime(end)
    return slp[start:end], pv[start:end]
For prediction, the code is:
episodes = 20
for ep in range(episodes):
    obs = env.reset()
    done = False
    while not done:
        action = model.predict(obs)
        obs, rewards, done, info = env.step(action)
env.close()
The lists are as follows:
env.pv_values
[]
env.grid_values
[]
Please tell me what I am doing wrong. Also, I load the model in a separate Jupyter notebook; the environment is the same one I use for training the model. The code is as follows:
model = A2C("MultiInputPolicy",env=env)
model = model.load("A2C_Multi_Input_Policy_Improved_1",env=env)
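One thing worth checking, as an assumption on my part since the prediction notebook is not fully shown: with stable-baselines3, model.predict(obs) returns a tuple (action, state). In the loop above that whole tuple is passed to env.step(), so neither the action == 0 branch nor the action == 1 branch ever matches, the final else assigns the -300000 reward, and nothing gets appended to grid_values or pv_values. Also note that the lists live on the particular environment instance that is stepped; a freshly constructed environment in another notebook starts with empty lists until it has been stepped. A minimal sketch of the loop with the action unpacked:

episodes = 20
for ep in range(episodes):
    obs = env.reset()
    done = False
    while not done:
        action, _states = model.predict(obs, deterministic=True)   # unpack the (action, state) tuple
        obs, reward, done, info = env.step(int(action))
env.close()

print(env.grid_values)   # inspect the same env instance that was stepped
print(env.pv_values)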

Convergence issue encountered while implementing policy iteration from scratch

I am trying to implement policy iteration from scratch. I have a 2D grid world environment named GridWorld that returns the successor state and reward for a given action, and it also has a function that returns the transition probability. Below is my code for policy iteration:
import matplotlib
matplotlib.use('Agg')
import random
import numpy as np
import matplotlib.pyplot as plt
import gridworld
from tqdm import tqdm


class PolicyIteration:
    def __init__(self, env, gamma):
        self.env = env
        self.num_states = self.env.num_states
        self.num_actions = self.env.num_actions
        self.max_num_steps = self.env.max_num_steps
        self.gamma = gamma  # discount factor
        self.values = np.zeros(self.num_states)  # Initialize `values` as zeros
        self.policy = np.random.randint(0, self.num_actions, self.num_states)

    def one_policy_evaluation(self):
        """
        Runs one iteration of policy evaluation and updates the value function.
        :return: the maximum change in value function
        """
        delta = 0
        for s in range(self.num_states):
            v = self.values[s]
            a = self.policy[s]
            (s_new, r, _) = self.env.step(a)
            p = self.env.p(s_new, s, a)
            # update V(s)
            self.values[s] = np.sum(p * (r + self.gamma * self.values[s_new]))
            delta = max(delta, abs(v - self.values[s]))
        return delta

    def run_policy_evaluation(self, tol=1e-3):
        """
        Runs policy evaluation until convergence.
        :param tol: the tolerance level for convergence
        :return: the number of iterations of policy evaluation until convergence
        """
        delta = self.one_policy_evaluation()
        delta_history = [delta]
        while delta > tol:
            delta = self.one_policy_evaluation()
            delta_history.append(delta)
        return len(delta_history)

    def run_policy_improvement(self):
        update_policy_count = 0
        for s in range(self.num_states):
            temp = self.policy[s]
            v_list = np.zeros(self.num_actions)
            for a in range(self.num_actions):
                (s_new, r, _) = self.env.step(a)
                p = self.env.p(s_new, s, a)
                v_list[a] = np.sum(p * (r + self.gamma * self.values[s_new]))
            self.policy[s] = np.argmax(v_list)
            if temp != self.policy[s]:
                update_policy_count += 1
        return update_policy_count

    def train(self, tol=1e-3, max_iters=100, plot=True):
        eval_count = self.run_policy_evaluation(tol)
        eval_count_history = [eval_count]
        policy_change = self.run_policy_improvement()
        policy_change_history = [policy_change]
        epoch = 0
        val_history = []
        for i in tqdm(range(max_iters)):
            epoch += 1
            new_eval_count = self.run_policy_evaluation(tol)
            new_policy_change = self.run_policy_improvement()
            eval_count_history.append(new_eval_count)
            policy_change_history.append(new_policy_change)
            val_history.append(np.mean(self.values))
            if new_policy_change == 0:
                break
        print(f'# epoch: {len(policy_change_history)}')
        print(f'eval count = {eval_count_history}')
        print(f'policy change = {policy_change_history}')
        if plot is True:
            plt.figure(dpi=200)
            plt.plot(val_history)
            plt.tight_layout()
            plt.savefig('policy_iteration.png')
            plt.show()


def main():
    env = gridworld.GridWorld(hard_version=False)
    agent = PolicyIteration(env, gamma=0.95)
    agent.train()


if __name__ == '__main__':
    main()
However, based on the figure generated, the sequence of values oscillates up and down and never converges. I followed the algorithm in the Sutton book step by step and can't find any issue in my code yet.
Any help is greatly appreciated!
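I cannot be sure without seeing GridWorld itself, but one likely source of the oscillation is that both one_policy_evaluation() and run_policy_improvement() call self.env.step(a), which samples a single transition from the environment's internal current state instead of evaluating the expectation from the state s being swept. Policy evaluation in policy iteration is a full backup, V(s) <- sum over s' of p(s' | s, a) * [r(s, a) + gamma * V(s')] with a = pi(s). Below is a sketch of that sweep; env.r(s, a) is a hypothetical accessor for the expected reward, so adjust the two environment calls to whatever GridWorld actually exposes.

def one_policy_evaluation(self):
    delta = 0
    for s in range(self.num_states):
        v = self.values[s]
        a = self.policy[s]
        # Sum over every possible successor state instead of sampling one
        # transition with env.step(), which also advances the env's own state.
        self.values[s] = sum(
            self.env.p(s_new, s, a) *
            (self.env.r(s, a) + self.gamma * self.values[s_new])
            for s_new in range(self.num_states)
        )
        delta = max(delta, abs(v - self.values[s]))
    return delta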

Deep SARSA: agent does not work after training

I have a few questions.
First of all, I am currently working on Ubuntu 18.04 LTS with Python 3.5.6 and Keras 2.0.3.
My problem is that the agent uses the Deep SARSA algorithm, and if I switch it to another environment after training, it acts like an untrained model.
I will ask about this below and attach the code.
deep_sarsa_agent.py
import copy
import pylab
import random
import numpy as np
from environment1 import Env1
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dropout
from keras import backend as K
import time
EPISODES = 610
# this is DeepSARSA Agent for the GridWorld
# Utilize Neural Network as q function approximator
class DeepSARSAgent:
def __init__(self):
self.load_model = False
# actions which agent can do
self.action_space = [0, 1, 2, 3, 4]
# get size of state and action
self.action_size = len(self.action_space)
self.state_size = 39
self.discount_factor = 0.99
self.learning_rate = 0.001
self.epsilon = 1. # exploration
self.epsilon_decay = .9999
self.epsilon_min = 0.01
self.model = self.build_model()
if self.load_model:
self.epsilon = 0.99
self.model.load_weights('./save_model/deep_sarsa.h5')
# approximate Q function using Neural Network
# state is input and Q Value of each action is output of network
def build_model(self):
model = Sequential()
model.add(Dense(40, input_dim=self.state_size, activation='relu'))
model.add(Dense(40, activation='relu'))
model.add(Dense(self.action_size, activation='linear'))
model.summary()
model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
return model
# get action from model using epsilon-greedy policy
def get_action(self, state):
if np.random.rand() <= self.epsilon:
# The agent acts randomly
return random.randrange(self.action_size)
else:
# Predict the reward value based on the given state
state = np.float32(state)
q_values = self.model.predict(state)
return np.argmax(q_values[0])
def train_model(self, state, action, reward, next_state, next_action, done):
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
state = np.float32(state)
next_state = np.float32(next_state)
target = self.model.predict(state)[0]
# like Q Learning, get maximum Q value at s'
# But from target model
if done:
target[action] = reward
else:
target[action] = (reward + self.discount_factor *
self.model.predict(next_state)[0][next_action])
target = np.reshape(target, [1, 5])
# make minibatch which includes target q value and predicted q value
# and do the model fit!
self.model.fit(state, target, epochs=1, verbose=0)
if __name__ == "__main__":
env = Env1()
agent = DeepSARSAgent()
global_step = 0
local_step = 0
scores, episodes, local_steps = [], [], []
x = 0
for e in range(EPISODES):
done = False
score = 0
state = env.reset()
state = np.reshape(state, [1, 39])
while not done:
# fresh env
global_step += 1
local_step += 1
# get action for the current state and go one step in environment
action = agent.get_action(state)
next_state, reward, done = env.step(action)
next_state = np.reshape(next_state, [1, 39])
next_action = agent.get_action(next_state)
agent.train_model(state, action, reward, next_state, next_action,
done)
state = next_state
# every time step we do training
score += reward
state = copy.deepcopy(next_state)
if done:
scores.append(score)
episodes.append(e)
local_steps.append(local_step)
pylab.plot(episodes, scores, 'b', label='scores')
pylab.plot(episodes, local_steps, 'r', label = 'local_step')
pylab.savefig("./save_graph/env case 10/1.png")
print("episode:", e, " score:", score, "global_step",
global_step, " epsilon:", agent.epsilon)
local_step = 0
if local_step >= 50 and e >= 200:
done = True
local_step = 0
if e % 100 == 0:
agent.model.save_weights("./save_model/deep_sarsa.h5")
environment1.py
import time
import numpy as np
import tkinter as tk
from PIL import ImageTk, Image
PhotoImage = ImageTk.PhotoImage
UNIT = 50 # pixels
HEIGHT = 10 # grid height
WIDTH = 10 # grid width
np.random.seed(1)
class Env1(tk.Tk):
def __init__(self):
super(Env1, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.action_size = len(self.action_space)
self.title('DeepSARSA')
self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
self.shapes = self.load_images()
self.canvas = self._build_canvas()
self.counter = 0
self.local_step = 0
self.rewards = []
self.goal = []
# obstacle
self.set_reward([3, 0], -1)
self.set_reward([0, 1], -1)
self.set_reward([6, 2], -1)
self.set_reward([4, 3], -1)
self.set_reward([5, 5], -1)
self.set_reward([0, 4], -1)
self.set_reward([1, 9], -1)
self.set_reward([8, 1], -1)
self.set_reward([2, 0], -1)
# #goal
self.set_reward([9, 9], 1)
def _build_canvas(self):
canvas = tk.Canvas(self, bg='white',
height=HEIGHT * UNIT,
width=WIDTH * UNIT)
# create grids
for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80
x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
canvas.create_line(x0, y0, x1, y1)
for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80
x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
canvas.create_line(x0, y0, x1, y1)
self.rewards = []
self.goal = []
# add image to canvas
x, y = UNIT/2, UNIT/2
self.rectangle = canvas.create_image(x, y, image=self.shapes[0])
# pack all`
canvas.pack()
return canvas
def load_images(self):
rectangle = PhotoImage(
Image.open("./img/rectangle.png").resize((30, 30)))
triangle = PhotoImage(
Image.open("./img/triangle.png").resize((30, 30)))
circle = PhotoImage(
Image.open("./img/circle.png").resize((30, 30)))
return rectangle, triangle, circle
def reset_reward(self):
for reward in self.rewards:
self.canvas.delete(reward['figure'])
self.rewards.clear()
self.goal.clear()
self.set_reward([3, 0], -1)
self.set_reward([0, 1], -1)
self.set_reward([6, 2], -1)
self.set_reward([4, 3], -1)
self.set_reward([5, 5], -1)
self.set_reward([0, 4], -1)
self.set_reward([1, 9], -1)
self.set_reward([8, 1], -1)
self.set_reward([2, 0], -1)
# #goal
self.set_reward([9, 9], 1)
def set_reward(self, state, reward):
state = [int(state[0]), int(state[1])]
x = int(state[0])
y = int(state[1])
temp = {}
if reward > 0:
temp['reward'] = reward
temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
(UNIT * y) + UNIT / 2,
image=self.shapes[2])
self.goal.append(temp['figure'])
elif reward < 0:
temp['direction'] = -1
temp['reward'] = reward
temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
(UNIT * y) + UNIT / 2,
image=self.shapes[1])
temp['coords'] = self.canvas.coords(temp['figure'])
temp['state'] = state
self.rewards.append(temp)
# new methods
def check_if_reward(self, state):
check_list = dict()
check_list['if_goal'] = False
rewards = 0
for reward in self.rewards:
if reward['state'] == state:
rewards += reward['reward']
if reward['reward'] == 1:
check_list['if_goal'] = True
check_list['rewards'] = rewards
return check_list
def coords_to_state(self, coords):
x = int((coords[0] - UNIT / 2) / UNIT)
y = int((coords[1] - UNIT / 2) / UNIT)
return [x, y]
def reset(self):
self.update()
time.sleep(0.5)
x, y = self.canvas.coords(self.rectangle)
self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
# return observation
self.reset_reward()
return self.get_state()
def step(self, action):
self.counter += 1
self.render()
self.local_step += 1
#if self.counter % 2 == 1:
# self.rewards = self.move_rewards()
next_coords = self.move(self.rectangle, action)
check = self.check_if_reward(self.coords_to_state(next_coords))
done = check['if_goal']
reward = check['rewards']
self.canvas.tag_raise(self.rectangle)
s_ = self.get_state()
return s_, reward, done
def get_state(self):
location = self.coords_to_state(self.canvas.coords(self.rectangle))
agent_x = location[0]
agent_y = location[1]
states = list()
#locations.append(agent_x)
#locations.append(agent_y)
for reward in self.rewards:
reward_location = reward['state']
states.append(reward_location[0] - agent_x)
states.append(reward_location[1] - agent_y)
if reward['reward'] < 0:
states.append(-1)
states.append(reward['direction'])
else:
states.append(1)
return states
def move_rewards(self):
new_rewards = []
for temp in self.rewards:
if temp['reward'] == 1:
new_rewards.append(temp)
continue
temp['coords'] = self.move_const(temp)
temp['state'] = self.coords_to_state(temp['coords'])
new_rewards.append(temp)
return new_rewards
def move_const(self, target):
s = self.canvas.coords(target['figure'])
base_action = np.array([0, 0])
if s[0] == (WIDTH - 1) * UNIT + UNIT / 2:
target['direction'] = 1
elif s[0] == UNIT / 2:
target['direction'] = -1
if target['direction'] == -1:
base_action[0] += UNIT
elif target['direction'] == 1:
base_action[0] -= UNIT
if (target['figure'] is not self.rectangle
and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]):
base_action = np.array([0, 0])
self.canvas.move(target['figure'], base_action[0], base_action[1])
s_ = self.canvas.coords(target['figure'])
return s_
def move(self, target, action):
s = self.canvas.coords(target)
base_action = np.array([0, 0])
if action == 0: # up
if s[1] > UNIT:
base_action[1] -= UNIT
elif action == 1: # down
if s[1] < (HEIGHT - 1) * UNIT:
base_action[1] += UNIT
elif action == 2: # right
if s[0] < (WIDTH - 1) * UNIT:
base_action[0] += UNIT
elif action == 3: # left
if s[0] > UNIT:
base_action[0] -= UNIT
self.canvas.move(target, base_action[0], base_action[1])
s_ = self.canvas.coords(target)
return s_
def render(self):
time.sleep(0.07)
self.update()
Thinking the agent might be biased toward a single environment, I additionally trained it on nine other environments, but the same result occurred.
In addition, learning sometimes seems to fail if training goes past a certain interval.
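I cannot reproduce the setup, but two things stand out in the posted code that may explain why a trained agent looks untrained on a new environment. First, load_model is False and epsilon starts at 1.0 (and is only lowered to 0.99 when weights are loaded), so a fresh run against a different env either uses brand-new random weights or still acts almost entirely at random. Second, the 39-dimensional state is built from the relative positions of that specific obstacle layout, so a different layout changes the input distribution (and a different obstacle count would change the state size altogether). A rough evaluation sketch, assuming the weights saved to ./save_model/deep_sarsa.h5 are the ones to reuse and a hypothetical second environment module environment2.Env2 with the same number of obstacles:

import numpy as np
from deep_sarsa_agent import DeepSARSAgent
from environment2 import Env2   # hypothetical second environment

agent = DeepSARSAgent()
agent.model.load_weights('./save_model/deep_sarsa.h5')   # reuse the trained weights
agent.epsilon = agent.epsilon_min                        # act greedily instead of randomly

env = Env2()
state = np.reshape(env.reset(), [1, 39])
done = False
score = 0
while not done:
    action = agent.get_action(state)
    next_state, reward, done = env.step(action)
    state = np.reshape(next_state, [1, 39])
    score += reward
print("evaluation score:", score)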

Is there a way to successfully implement multiprocessing with Keras neural networks for reinforcement learning?

While exploring actor-critic methods, I have been trying to speed up my program using multiprocessing. The code runs fine until the point where I start using processes: the program starts, but then never completes even one episode (it keeps running, showing looping behavior). I have been searching for possible errors, and as far as I can tell, the fault is that each process calls its own sub-process(?). I really want to find out how to fix the multiprocessing part so that my program runs, so I would appreciate any help.
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import time
import random
import multiprocessing as mp
from keras.layers import Reshape, BatchNormalization
from keras.layers.embeddings import Embedding
from collections import deque
# Configuration parameters for the whole setup
gamma = 1 # Discount factor for past rewards
max_steps_per_episode = 2000
env = gym.make("Taxi-v3").env # Create the environment
eps = 1e-6
num_inputs = 1
num_actions = 6
num_hidden = 64
simulated_epsilon = 0
# Actor Policy Network
inputs_1 = layers.Input(shape=(num_inputs,))
embed = layers.Embedding(500, 10, input_length=num_inputs)(inputs_1)
reshape = layers.Reshape((10 * num_inputs, ))(embed)
common = layers.Dense(num_hidden * 2, activation="relu")(reshape)
common = layers.Dense(num_hidden, activation="relu")(common)
action = layers.Dense(num_actions, activation="softmax")(common)
model_1 = keras.Model(inputs=inputs_1, outputs=action)
# Critic Reward Network
inputs_2 = layers.Input(shape=(num_inputs,))
embed_2 = layers.Embedding(500, 10, input_length=num_inputs)(inputs_2)
reshape_2 = layers.Reshape((10, ))(embed_2)
common_2 = layers.Dense(num_hidden * 2, activation="relu")(reshape_2)
common_2 = layers.Dense(num_hidden, activation="relu")(common_2)
critic = layers.Dense(1)(common_2)
model_2 = keras.Model(inputs=inputs_2, outputs=critic)
# Optimizer and Loss Function
optimizer = keras.optimizers.Adam(learning_rate=5e-4)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0
under_20 = 0
model_2.compile(optimizer = keras.optimizers.Adam(learning_rate=5e-4), loss=huber_loss)
def worker(number, env, actor, critic):
optimizer = keras.optimizers.Adam(learning_rate=5e-4)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0
under_20 = 0
while True: # Run until solved
state = env.reset()
episode_reward = 0
penalties = 0
drop = 0
print("Episode {} begins ({})".format(episode_count, number))
env.render()
start = time.time()
time_solve = 0
with tf.GradientTape() as tape_1, tf.GradientTape() as tape_2:
#with tf.GradientTape() as tape:
#while True:
for _ in range(1, max_steps_per_episode + 1):
#env.render() # Adding this line would show the attempts
# of the agent in a pop up window.
state = tf.convert_to_tensor(state)
state = tf.expand_dims(state, 0)
# Predict action probabilities and estimated future rewards
# from environment state
action_probs = actor(state)
critic_value = critic(state)
critic_value_history.append((state, critic_value[0, 0]))
# Choose action
action = np.random.choice(num_actions, p=np.squeeze(action_probs))
action_probs_history.append(tf.math.log(action_probs[0, action])) # action_probs stores log of probs of action
#if timestep == 1:
# print("{}: {}".format(state, action_probs))
# print("{}: {}".format(state, action))
# Apply the sampled action in our environment
state, reward, done, _ = env.step(action)
rewards_history.append(reward)
episode_reward += reward
time_solve += 1
if reward == -10:
penalties += 1
elif reward == 20:
drop += 1
if done:
break
# Update running reward to check condition for solving
running_reward = (running_reward * (episode_count) + episode_reward) / (episode_count + 1)
# Calculate expected value from rewards
# - At each timestep what was the total reward received after that timestep
# - Rewards in the past are discounted by multiplying them with gamma
# - These are the labels for our critic
returns = deque(maxlen=3500)
discounted_sum = 0
for r in rewards_history[::-1]:
discounted_sum = r + gamma * discounted_sum
returns.appendleft(discounted_sum)
# Normalize
#returns = np.array(returns)
returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
#returns = returns.tolist()
# Calculating loss values to update our network
history = zip(action_probs_history, critic_value_history, returns)
loss_value_actor = 0
loss_value_critic = 0
for log_prob, value, ret in history:
diff = ret - value[1]
loss_value_actor += -log_prob * diff
loss_value_critic += huber_loss(tf.expand_dims(value[1], 0), tf.expand_dims(ret, 0))
# Backpropagation
#loss_value_actor /= time_solve
#loss_value_critic /= time_solve
if episode_count % 2 == 1:
grads_1 = tape_1.gradient(loss_value_actor, model_1.trainable_variables)
optimizer.apply_gradients(zip(grads_1, model_1.trainable_variables))
grads_2 = tape_2.gradient(loss_value_critic, model_2.trainable_variables)
optimizer.apply_gradients(zip(grads_2, model_2.trainable_variables))
# Clear the loss and reward history
action_probs_history.clear()
critic_value_history.clear()
rewards_history.clear()
# Copy params
actor.set_weights(model_1.get_weights())
critic.set_weights(model_2.get_weights())
# Log details
end = time.time()
episode_count += 1
if episode_count % 1 == 0:
env.render()
template = "average reward: {:.2f}"
print(template.format(running_reward, episode_count))
print("episode reward: {}".format(episode_reward))
print("Steps taken: {}".format(time_solve))
print("Penalties incurred: {}".format(penalties))
print("Passengers dropped off: {}".format(drop))
print("Time taken: {}".format(end - start))
print()
if running_reward > -50: # Condition to consider the task solved
under_20 += 1
if under_20 > 5:
print("Solved at episode {} !".format(episode_count))
break
num_processes = 5
if __name__ == "__main__":
mp.freeze_support()
envs = []
processes = []
actors = []
critics = []
for i in range(num_processes):
envs.append(gym.make("Taxi-v3").env)
for i in range(num_processes):
t = mp.Process(target=worker, args=(i, envs[i], model_1, model_2))
t.start()
time.sleep(0.5)
processes.append(t)
for process in processes:
process.join()
for process in processes:
process.terminate()
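I cannot say definitively why the workers loop forever, but one common culprit is handing live Keras/TensorFlow objects (model_1, model_2) and the module-level graph to child processes: TensorFlow state generally does not survive being forked or pickled into a subprocess. A pattern worth trying, sketched below under that assumption, is to build the environment and both networks inside each worker and start the processes with the 'spawn' method so nothing TensorFlow-related is inherited from the parent; build_net is an illustrative helper, not part of the original code.

def build_net(num_inputs, out_units, out_activation, num_hidden=64):
    # Built inside the child process so each worker owns its own TF graph.
    from tensorflow import keras
    from tensorflow.keras import layers
    inp = layers.Input(shape=(num_inputs,))
    x = layers.Embedding(500, 10, input_length=num_inputs)(inp)
    x = layers.Reshape((10 * num_inputs,))(x)
    x = layers.Dense(num_hidden * 2, activation="relu")(x)
    x = layers.Dense(num_hidden, activation="relu")(x)
    out = layers.Dense(out_units, activation=out_activation)(x)
    return keras.Model(inp, out)

def worker(number):
    import gym
    env = gym.make("Taxi-v3").env          # each process owns its environment
    actor = build_net(1, 6, "softmax")     # and builds its own models instead of
    critic = build_net(1, 1, None)         # receiving the global model_1 / model_2
    # ... run the original training loop here with these local objects ...

if __name__ == "__main__":
    import multiprocessing as mp
    mp.set_start_method("spawn", force=True)   # avoid inheriting TF state via fork
    procs = [mp.Process(target=worker, args=(i,)) for i in range(5)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

Note that with fully independent processes each worker trains its own copy of the weights; actually synchronizing weights across workers (as the actor.set_weights(model_1.get_weights()) lines intend) needs explicit communication, for example multiprocessing queues or a parameter-server process.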

Model doesn't seem to learn

I used this gym library to try to get this model to learn, but I don't think it learns from experience. Something is wrong, but I can't figure out what.
I have played with DISCOUNT, LEARNING_RATE, and DISCRETE_OS_SIZE and still nothing. Do I have to create a neural network for this example, or can I just use the update formula to derive the Q values?
import gym
import numpy as np

LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 25000

env = gym.make("MountainCar-v0")

DISCRETE_OS_SIZE = [20, 20]
discrete_os_win_size = (env.observation_space.high - env.observation_space.low)/DISCRETE_OS_SIZE

q_tables = np.random.uniform(low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))


def get_discrete_state(state):
    discrete_state = (state - env.observation_space.low)/DISCRETE_OS_SIZE
    return tuple(discrete_state.astype(np.int))  # we use this tuple to look up the 3 Q values for the available actions in the q-table


# Exploration settings
epsilon = 1  # not a constant, going to be decayed
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES//2
epsilon_decay_value = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)

SHOW_EVERY = 1000
done = False

for episode in range(EPISODES):
    discrete_state = get_discrete_state(env.reset())
    done = False
    if episode % SHOW_EVERY == 0:
        render = True
        print(episode)
    else:
        render = False
    while not done:
        if np.random.random() > epsilon:
            # Get action from Q table
            action = np.argmax(q_tables[discrete_state])
        else:
            # Get random action
            action = np.random.randint(0, env.action_space.n)
        new_state, reward, done, _ = env.step(action)  # state = position and velocity
        new_discrete_state = get_discrete_state(new_state)
        if render:
            env.render()
        if not done:
            max_future_q = np.max(q_tables[new_discrete_state])  # maximum possible Q value in next step (for new state)
            current_q = q_tables[discrete_state + (action,)]  # current Q value (for current state and performed action)
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)  # the update equation for the new Q value
            q_tables[discrete_state + (action,)] = new_q  # update Q table with new Q value
        # Simulation ended (for any reason) - if goal position is achieved, update Q value with reward directly
        elif new_state[0] >= env.goal_position:
            q_tables[discrete_state + (action,)] = 0
            print(episode)
        discrete_state = new_discrete_state
    # Decaying is being done every episode if episode number is within decaying range
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value

env.close()
I have found what the problem was.
In the line discrete_state = (state - env.observation_space.low)/DISCRETE_OS_SIZE I was dividing (state - env.observation_space.low) by DISCRETE_OS_SIZE, the number of buckets. Dividing by discrete_os_win_size, the width of each bucket, instead solved my problem.
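For anyone hitting the same issue, the corrected helper (same variable names as above) is just the following; as a side note, newer NumPy versions remove np.int, so plain int avoids a second error there.

def get_discrete_state(state):
    # divide by the width of each bucket, not by the number of buckets
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    return tuple(discrete_state.astype(int))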
