Deep SARSA, Agent does not work agent after training

Deep SARSA, Agent does not work agent after training - python

I have a few questions.
First of all, I am currently working on Ubuntu 18.04LTS, the python version is 3.5.6 and the keras version is 2.0.3.
What I have a problem with is that the algorithm uses the Deep SARSA algorithm, and if you change it to another env after learning, it acts like an unlearned model.
I will ask about these and attach the code as below.
deep_sarsa_agent.py
import copy
import pylab
import random
import numpy as np
from environment1 import Env1
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dropout
from keras import backend as K
import time
EPISODES = 610
# this is DeepSARSA Agent for the GridWorld
# Utilize Neural Network as q function approximator
class DeepSARSAgent:
def __init__(self):
self.load_model = False
# actions which agent can do
self.action_space = [0, 1, 2, 3, 4]
# get size of state and action
self.action_size = len(self.action_space)
self.state_size = 39
self.discount_factor = 0.99
self.learning_rate = 0.001
self.epsilon = 1. # exploration
self.epsilon_decay = .9999
self.epsilon_min = 0.01
self.model = self.build_model()
if self.load_model:
self.epsilon = 0.99
self.model.load_weights('./save_model/deep_sarsa.h5')
# approximate Q function using Neural Network
# state is input and Q Value of each action is output of network
def build_model(self):
model = Sequential()
model.add(Dense(40, input_dim=self.state_size, activation='relu'))
model.add(Dense(40, activation='relu'))
model.add(Dense(self.action_size, activation='linear'))
model.summary()
model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
return model
# get action from model using epsilon-greedy policy
def get_action(self, state):
if np.random.rand() <= self.epsilon:
# The agent acts randomly
return random.randrange(self.action_size)
else:
# Predict the reward value based on the given state
state = np.float32(state)
q_values = self.model.predict(state)
return np.argmax(q_values[0])
def train_model(self, state, action, reward, next_state, next_action, done):
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
state = np.float32(state)
next_state = np.float32(next_state)
target = self.model.predict(state)[0]
# like Q Learning, get maximum Q value at s'
# But from target model
if done:
target[action] = reward
else:
target[action] = (reward + self.discount_factor *
self.model.predict(next_state)[0][next_action])
target = np.reshape(target, [1, 5])
# make minibatch which includes target q value and predicted q value
# and do the model fit!
self.model.fit(state, target, epochs=1, verbose=0)
if __name__ == "__main__":
env = Env1()
agent = DeepSARSAgent()
global_step = 0
local_step = 0
scores, episodes, local_steps = [], [], []
x = 0
for e in range(EPISODES):
done = False
score = 0
state = env.reset()
state = np.reshape(state, [1, 39])
while not done:
# fresh env
global_step += 1
local_step += 1
# get action for the current state and go one step in environment
action = agent.get_action(state)
next_state, reward, done = env.step(action)
next_state = np.reshape(next_state, [1, 39])
next_action = agent.get_action(next_state)
agent.train_model(state, action, reward, next_state, next_action,
done)
state = next_state
# every time step we do training
score += reward
state = copy.deepcopy(next_state)
if done:
scores.append(score)
episodes.append(e)
local_steps.append(local_step)
pylab.plot(episodes, scores, 'b', label='scores')
pylab.plot(episodes, local_steps, 'r', label = 'local_step')
pylab.savefig("./save_graph/env case 10/1.png")
print("episode:", e, " score:", score, "global_step",
global_step, " epsilon:", agent.epsilon)
local_step = 0
if local_step >= 50 and e >= 200:
done = True
local_step = 0
if e % 100 == 0:
agent.model.save_weights("./save_model/deep_sarsa.h5")
environment1.py
import time
import numpy as np
import tkinter as tk
from PIL import ImageTk, Image
PhotoImage = ImageTk.PhotoImage
UNIT = 50 # pixels
HEIGHT = 10 # grid height
WIDTH = 10 # grid width
np.random.seed(1)
class Env1(tk.Tk):
def __init__(self):
super(Env1, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.action_size = len(self.action_space)
self.title('DeepSARSA')
self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
self.shapes = self.load_images()
self.canvas = self._build_canvas()
self.counter = 0
self.local_step = 0
self.rewards = []
self.goal = []
# obstacle
self.set_reward([3, 0], -1)
self.set_reward([0, 1], -1)
self.set_reward([6, 2], -1)
self.set_reward([4, 3], -1)
self.set_reward([5, 5], -1)
self.set_reward([0, 4], -1)
self.set_reward([1, 9], -1)
self.set_reward([8, 1], -1)
self.set_reward([2, 0], -1)
# #goal
self.set_reward([9, 9], 1)
def _build_canvas(self):
canvas = tk.Canvas(self, bg='white',
height=HEIGHT * UNIT,
width=WIDTH * UNIT)
# create grids
for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80
x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
canvas.create_line(x0, y0, x1, y1)
for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80
x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
canvas.create_line(x0, y0, x1, y1)
self.rewards = []
self.goal = []
# add image to canvas
x, y = UNIT/2, UNIT/2
self.rectangle = canvas.create_image(x, y, image=self.shapes[0])
# pack all`
canvas.pack()
return canvas
def load_images(self):
rectangle = PhotoImage(
Image.open("./img/rectangle.png").resize((30, 30)))
triangle = PhotoImage(
Image.open("./img/triangle.png").resize((30, 30)))
circle = PhotoImage(
Image.open("./img/circle.png").resize((30, 30)))
return rectangle, triangle, circle
def reset_reward(self):
for reward in self.rewards:
self.canvas.delete(reward['figure'])
self.rewards.clear()
self.goal.clear()
self.set_reward([3, 0], -1)
self.set_reward([0, 1], -1)
self.set_reward([6, 2], -1)
self.set_reward([4, 3], -1)
self.set_reward([5, 5], -1)
self.set_reward([0, 4], -1)
self.set_reward([1, 9], -1)
self.set_reward([8, 1], -1)
self.set_reward([2, 0], -1)
# #goal
self.set_reward([9, 9], 1)
def set_reward(self, state, reward):
state = [int(state[0]), int(state[1])]
x = int(state[0])
y = int(state[1])
temp = {}
if reward > 0:
temp['reward'] = reward
temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
(UNIT * y) + UNIT / 2,
image=self.shapes[2])
self.goal.append(temp['figure'])
elif reward < 0:
temp['direction'] = -1
temp['reward'] = reward
temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
(UNIT * y) + UNIT / 2,
image=self.shapes[1])
temp['coords'] = self.canvas.coords(temp['figure'])
temp['state'] = state
self.rewards.append(temp)
# new methods
def check_if_reward(self, state):
check_list = dict()
check_list['if_goal'] = False
rewards = 0
for reward in self.rewards:
if reward['state'] == state:
rewards += reward['reward']
if reward['reward'] == 1:
check_list['if_goal'] = True
check_list['rewards'] = rewards
return check_list
def coords_to_state(self, coords):
x = int((coords[0] - UNIT / 2) / UNIT)
y = int((coords[1] - UNIT / 2) / UNIT)
return [x, y]
def reset(self):
self.update()
time.sleep(0.5)
x, y = self.canvas.coords(self.rectangle)
self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
# return observation
self.reset_reward()
return self.get_state()
def step(self, action):
self.counter += 1
self.render()
self.local_step += 1
#if self.counter % 2 == 1:
# self.rewards = self.move_rewards()
next_coords = self.move(self.rectangle, action)
check = self.check_if_reward(self.coords_to_state(next_coords))
done = check['if_goal']
reward = check['rewards']
self.canvas.tag_raise(self.rectangle)
s_ = self.get_state()
return s_, reward, done
def get_state(self):
location = self.coords_to_state(self.canvas.coords(self.rectangle))
agent_x = location[0]
agent_y = location[1]
states = list()
#locations.append(agent_x)
#locations.append(agent_y)
for reward in self.rewards:
reward_location = reward['state']
states.append(reward_location[0] - agent_x)
states.append(reward_location[1] - agent_y)
if reward['reward'] < 0:
states.append(-1)
states.append(reward['direction'])
else:
states.append(1)
return states
def move_rewards(self):
new_rewards = []
for temp in self.rewards:
if temp['reward'] == 1:
new_rewards.append(temp)
continue
temp['coords'] = self.move_const(temp)
temp['state'] = self.coords_to_state(temp['coords'])
new_rewards.append(temp)
return new_rewards
def move_const(self, target):
s = self.canvas.coords(target['figure'])
base_action = np.array([0, 0])
if s[0] == (WIDTH - 1) * UNIT + UNIT / 2:
target['direction'] = 1
elif s[0] == UNIT / 2:
target['direction'] = -1
if target['direction'] == -1:
base_action[0] += UNIT
elif target['direction'] == 1:
base_action[0] -= UNIT
if (target['figure'] is not self.rectangle
and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]):
base_action = np.array([0, 0])
self.canvas.move(target['figure'], base_action[0], base_action[1])
s_ = self.canvas.coords(target['figure'])
return s_
def move(self, target, action):
s = self.canvas.coords(target)
base_action = np.array([0, 0])
if action == 0: # up
if s[1] > UNIT:
base_action[1] -= UNIT
elif action == 1: # down
if s[1] < (HEIGHT - 1) * UNIT:
base_action[1] += UNIT
elif action == 2: # right
if s[0] < (WIDTH - 1) * UNIT:
base_action[0] += UNIT
elif action == 3: # left
if s[0] > UNIT:
base_action[0] -= UNIT
self.canvas.move(target, base_action[0], base_action[1])
s_ = self.canvas.coords(target)
return s_
def render(self):
time.sleep(0.07)
self.update()
As a result of thinking about them, it seemed to be biased toward only one env, so I learned an additional nine other envs, but the same result occurred.
In addition, learning may not be possible if it exceeds a certain interval during learning.

Related

Neural Network: Different number of nodes and hidden layers, but exact same test & train accuracy

I just started studying NN. In the class, the teacher gave us a code to experiment with, in google colab. I tried changing the number of nodes in each hidden layer and the number of hidden layers, and print out test accuracy and train accuracy. I've tried many configurations but the accuracies did not change. Like, it will stay exactly at 0.7857142857142857 (this is the actual number) unless I reshuffle the samples.
The teacher said that accuracy can't be changed that easily. But I don't believe her. I think there is something wrong with the code because there are too many similar digits.
Here are the codes I think are necessary to post.
Model
class Model():
def __init__(self):
self.layers = []
self.L = 0
self.W = {}
self.b = {}
self.A = {}
self.Z = {}
self.dA = {}
self.dZ = {}
self.dW = {}
self.db = {}
self.cost = 0.
self.m = 0
self.lam = 0
self.cost_history = []
self.acc_history = []
self.alpha_history = []
self.alpha = 0.
self.iterations = 0
def add_layers(self, list_of_layers):
self.layers = list_of_layers
self.L = len(self.layers) - 1 # Number of layers excluding the input feature layer
def init_params(self):
for i in range(1, self.L + 1):
self.W[str(i)] = np.random.randn(self.layers[i], self.layers[i - 1]) * np.sqrt(2. / self.layers[i - 1])
self.b[str(i)] = np.zeros((self.layers[i], 1))
def forward_prop(self, X):
self.A['0'] = X
for i in range(1, self.L + 1):
self.Z[str(i)] = np.dot(self.W[str(i)], self.A[str(i - 1)]) + self.b[str(i)]
if i == self.L:
# Output layer, Sigmoid activation
self.A[str(i)] = sigmoid(self.Z[str(i)])
else:
# Hidden layer, Relu activataion
self.A[str(i)] = relu(self.Z[str(i)])
def compute_cost(self, Y):
self.cost = -1 * np.sum(np.multiply(Y, np.log(self.A[str(self.L)])) +
np.multiply(1 - Y, np.log(1 - self.A[str(self.L)]))) / self.m
if self.lam != 0:
reg = (self.lam / (2 * self.m))
for i in range(1, self.L + 1):
reg += np.sum(np.dot(self.W[str(i)], self.W[str(i)].T))
self.cost += reg
self.cost_history.append(self.cost)
def backward_prop(self, Y):
# We need dA[str(L)] to start the backward prop computation
self.dA[str(self.L)] = -1 * (np.divide(Y, self.A[str(self.L)]) - np.divide(1 - Y, 1 - self.A[str(self.L)]))
self.dZ[str(self.L)] = np.multiply(self.dA[str(self.L)], sigmoid_derivative(self.Z[str(self.L)]))
self.dW[str(self.L)] = np.dot(self.dZ[str(self.L)], self.A[str(self.L - 1)].T) / self.m + (self.lam/self.m) * self.W[str(self.L)]
self.db[str(self.L)] = np.sum(self.dZ[str(self.L)], axis = 1, keepdims = True) / self.m
self.dA[str(self.L - 1)] = np.dot(self.W[str(self.L)].T, self.dZ[str(self.L)])
for i in reversed(range(1, self.L)):
self.dZ[str(i)] = np.multiply(self.dA[str(i)], relu_derivative(self.Z[str(i)]))
self.dW[str(i)] = np.dot(self.dZ[str(i)], self.A[str(i - 1)].T) / self.m + (self.lam/self.m) * self.W[str(i)]
self.db[str(i)] = np.sum(self.dZ[str(i)], axis = 1, keepdims = True) / self.m
self.dA[str(i - 1)] = np.dot(self.W[str(i)].T, self.dZ[str(i)])
def update_params(self):
for i in range(1, self.L + 1):
self.W[str(i)] = self.W[str(i)] - self.alpha * self.dW[str(i)]
self.b[str(i)] = self.b[str(i)] - self.alpha * self.db[str(i)]
def train(self, X, Y, iterations = 10,
alpha = 0.001, decay = True, decay_iter = 5, decay_rate = 0.9, stop_decay_counter = 100,
verbose = True, lam = 0):
self.m = Y.shape[1]
self.alpha = alpha
self.iterations = iterations
self.lam = lam
# initialize parameters
self.init_params()
for i in range(iterations):
# forward prop
self.forward_prop(X)
# compute cost
self.compute_cost(Y)
# backward prop
self.backward_prop(Y)
# update params
self.update_params()
# evaluate
self.acc_history.append(self.evaluate(X, Y, in_training = True))
# save alpha
self.alpha_history.append(self.alpha)
# learning rate decay
if decay and stop_decay_counter > 0 and i % decay_iter == 0:
self.alpha = decay_rate * self.alpha
stop_decay_counter -= 1
# display cost per iteration
if verbose:
print('Cost after {} iterations: {}'.format(i, self.cost))
def predict(self, X, in_training = False):
if in_training == False:
self.forward_prop(X)
preds = self.A[str(self.L)] >= 0.5
preds = np.squeeze(preds)
return preds
def evaluate(self, X, Y, in_training = False):
examples = X.shape[1]
pred = self.predict(X, in_training = in_training)
pred = pred.reshape(1, examples)
diff = np.sum(abs(pred - Y))
acc = (examples - np.sum(diff)) / examples
return acc
Dataset
import pandas as pd
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data', sep = ',', header = None)
data.head()
X_train = data.iloc[:,:-1]
Y_train = data.iloc[:, -1]
X_train = np.array(X_train)
Y_train = np.array(Y_train)
Y_train = Y_train.reshape(Y_train.shape[0], 1)
mean = np.mean(X_train, axis = 0)
variance = np.var(X_train, axis = 0)
X_train = np.divide((X_train - mean), variance)
Y_train = Y_train - 1
# Changing label 1 to 0 and label 2 to 1
Split & Shuffle data
# Split the data into test and train sets
from sklearn.utils import shuffle
X_train, Y_train = shuffle(X_train, Y_train)
X_test = X_train[250:,:]
Y_test = Y_train[250:,:]
X_train_ = X_train[:250,:]
Y_train_ = Y_train[:250,:]
X_train_ = X_train_.reshape(3, 250)
Y_train_ = Y_train_.reshape(1, 250)
X_test = X_test.reshape(3, 56)
Y_test = Y_test.reshape(1, 56)
Creating a Model
m = Model()
m.add_layers([3, 16, 16, 1])
m.train(X_train_, Y_train_, iterations = 5000, alpha = 0.9
, decay_iter = 10, decay_rate = 0.98, stop_decay_counter = 100
, verbose = False, lam = 2)
Evaluate
print('Test set acc = ', m.evaluate(X_test, Y_test))
print('Train set acc = ', m.evaluate(X_train_, Y_train_))
What I did in the experiment.
Shuffle, train several models (different in number of nodes and hidden layers), and evaluate
# Model examples
m.add_layers([3, 16, 16, 1, 50, 3, 25, 7, 99, 1])
m.add_layers([3, 1, 55, 19, 2, 2, 1, 1, 2, 75, 80, 3, 12, 1])
Reshuffle, evaluate
Result: Every model has the exact same train and test accuracy unless the data is reshuffled.
The teacher told me that it's just my thought, and it's not true.
Could you please tell me what is wrong to get this result?

Use q-learning method to solve knapsack problem

The question is:Sugar 1 gram for 1 dollar,cookie 7 gram for 5 dollars and ice 12 gram for 10 dollars.Now i have 29 dollars,how to buy will be the heaviest?
I have found the code on the Internet, but I don’t know how to modify it to solve my question.i don't know how to change the data to what i want.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import itertools
item = pd.DataFrame(data=[[1, 1],
[6, 2],
[18, 5],
[22, 6],
[28, 7]],
columns=['Value', 'Weight'])
actions = list(range(len(item)))
limit_W = 11
gamma = 0.9
class RLforKnapsack():
def __init__(self, limit_W, actions):
self.limit_W = limit_W # maximal weight
self.epsilon = 0.9 # e-greedy algorithm
self.gamma = 0.9 # reward decay
self.alpha = 0.8 # learning_rate
self.actions = actions
self.q_table = pd.DataFrame(columns=actions)
self.done = False
def check_state(self, knapsack):
if str(knapsack) not in self.q_table.index:
# append new state to q table
q_table_new = pd.Series([np.NAN]*len(self.actions),
index=self.q_table.columns,
name=str(knapsack))
# 0-1 knapsack
for i in list(set(self.actions).difference(set(knapsack))):
q_table_new[i] = 0
self.q_table = self.q_table.append(q_table_new)
def choose_action(self, knapsack):
self.check_state(knapsack)
state_action = self.q_table.loc[str(knapsack), :]
# random state_action in case there are two or more maximum
state_action = state_action.reindex(
np.random.permutation(state_action.index)
)
if np.random.uniform() < self.epsilon:
# choose best action
action = state_action.idxmax() # the first maximun
else:
# choose random action
action = np.random.choice(
list(set(self.actions).difference(set(knapsack)))
)
return action
def greedy_action(self, knapsack):
# testing
# choose best action
state_action = self.q_table.loc[str(knapsack), :]
state_action = state_action.reindex(
np.random.permutation(state_action.index)
)
action = state_action.idxmax()
return action
def take_action(self, knapsack, action):
# take the item
knapsack_ = knapsack + [action]
knapsack_.sort()
self.check_state(knapsack_)
return knapsack_
def rewardWithPenalty(self, knapsack_, action):
# constraint
knapsack_W = np.sum([item['Weight'][i] for i in knapsack_])
if knapsack_W > self.limit_W:
r = -10
self.done = True
else:
r = item['Value'][action]
return r
def update_qvalue(self, knapsack, knapsack_, action):
self.done = False
reward = self.rewardWithPenalty(knapsack_, action)
q_predict = self.q_table.loc[str(knapsack), action]
if len(knapsack) != len(self.actions):
q_target = reward + self.gamma * self.q_table.loc[
str(knapsack_), :].max()
else:
q_target = reward # no item can be added
self.q_table.loc[str(knapsack), action] += self.alpha * (
q_target - q_predict)
print("rl----")
print(self.q_table)
print("--------")
return self.q_table, self.done
t1 = time()
plt.close('all')
RL = RLforKnapsack(limit_W=11, actions=actions)
for episode in range(100):
print("episode--")
print(episode)
knapsack = []
for step in range(5):
print("step--")
print(step)
action = RL.choose_action(knapsack)
print("action---")
print(action)
knapsack_ = RL.take_action(knapsack, action)
q_table_RL, done = RL.update_qvalue(knapsack, knapsack_, action)
knapsack = knapsack_
if done:
break
plt.scatter(episode, q_table_RL.iloc[0, 3], c='r')
plt.scatter(episode, q_table_RL.iloc[0, 4], c='b')
t2 = time()
plt.title([t2-t1, 'RL'])
plt.show()
# %% Policy based on q table
knapsack = []
# %%
action = RL.greedy_action(knapsack)
knapsack_ = RL.take_action(knapsack, action)
knapsack = knapsack_
np.sum([item['Weight'][i] for i in knapsack_])
print(np.sum([item['Weight'][i] for i in knapsack_]))
# %%

How can I save DDPG model?

I try to save the model using the saver method (I use the save function in the DDPG class to save), but when restoring the model, the result is far from the one I saved (I save the model when the episodic award is zero, the restor method in the code is commented out ) My code is below with all the features. I use Python 3.7, gym 0.16.0 and TensorFlow version 1.13.1
import tensorflow as tf
import numpy as np
import gym
epsiode_steps = 500
# learning rate for actor
lr_a = 0.001
# learning rate for critic
lr_c = 0.002
gamma = 0.9
alpha = 0.01
memory = 10000
batch_size = 32
render = True
class DDPG(object):
def __init__(self, no_of_actions, no_of_states, a_bound, ):
self.memory = np.zeros((memory, no_of_states * 2 + no_of_actions + 1), dtype=np.float32)
# initialize pointer to point to our experience buffer
self.pointer = 0
self.sess = tf.Session()
# initialize the variance for OU process for exploring policies
self.noise_variance = 3.0
self.no_of_actions, self.no_of_states, self.a_bound = no_of_actions, no_of_states, a_bound,
self.state = tf.placeholder(tf.float32, [None, no_of_states], 's')
self.next_state = tf.placeholder(tf.float32, [None, no_of_states], 's_')
self.reward = tf.placeholder(tf.float32, [None, 1], 'r')
with tf.variable_scope('Actor'):
self.a = self.build_actor_network(self.state, scope='eval', trainable=True)
a_ = self.build_actor_network(self.next_state, scope='target', trainable=False)
with tf.variable_scope('Critic'):
q = self.build_crtic_network(self.state, self.a, scope='eval', trainable=True)
q_ = self.build_crtic_network(self.next_state, a_, scope='target', trainable=False)
self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')
# update target value
self.soft_replace = [
[tf.assign(at, (1 - alpha) * at + alpha * ae), tf.assign(ct, (1 - alpha) * ct + alpha * ce)]
for at, ae, ct, ce in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]
q_target = self.reward + gamma * q_
# compute TD error i.e actual - predicted values
td_error = tf.losses.mean_squared_error(labels=(self.reward + gamma * q_), predictions=q)
# train the critic network with adam optimizer
self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, name="adam-ink", var_list=self.ce_params)
a_loss = - tf.reduce_mean(q)
# train the actor network with adam optimizer for minimizing the loss
self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)
tf.summary.FileWriter("logs2", self.sess.graph)
# initialize all variables
self.sess.run(tf.global_variables_initializer())
# saver
self.saver = tf.train.Saver()
# self.saver.restore(self.sess, "Pendulum/nn.ckpt")
def choose_action(self, s):
a = self.sess.run(self.a, {self.state: s[np.newaxis, :]})[0]
a = np.clip(np.random.normal(a, self.noise_variance), -2, 2)
return a
def learn(self):
# soft target replacement
self.sess.run(self.soft_replace)
indices = np.random.choice(memory, size=batch_size)
batch_transition = self.memory[indices, :]
batch_states = batch_transition[:, :self.no_of_states]
batch_actions = batch_transition[:, self.no_of_states: self.no_of_states + self.no_of_actions]
batch_rewards = batch_transition[:, -self.no_of_states - 1: -self.no_of_states]
batch_next_state = batch_transition[:, -self.no_of_states:]
self.sess.run(self.atrain, {self.state: batch_states})
self.sess.run(self.ctrain, {self.state: batch_states, self.a: batch_actions, self.reward: batch_rewards,
self.next_state: batch_next_state})
def store_transition(self, s, a, r, s_):
trans = np.hstack((s, a, [r], s_))
index = self.pointer % memory
self.memory[index, :] = trans
self.pointer += 1
if self.pointer > memory:
self.noise_variance *= 0.99995
self.learn()
def build_actor_network(self, s, scope, trainable):
# Actor DPG
with tf.variable_scope(scope):
l1 = tf.layers.dense(s, 30, activation=tf.nn.tanh, name='l1', trainable=trainable)
a = tf.layers.dense(l1, self.no_of_actions, activation=tf.nn.tanh, name='a', trainable=trainable)
return tf.multiply(a, self.a_bound, name="scaled_a")
def build_crtic_network(self, s, a, scope, trainable):
with tf.variable_scope(scope):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.no_of_states, n_l1], trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.no_of_actions, n_l1], trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
net = tf.nn.tanh(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
q = tf.layers.dense(net, 1, trainable=trainable)
return q
def save(self):
self.saver.save(self.sess, "Pendulum/nn.ckpt")
env = gym.make("Pendulum-v0")
env = env.unwrapped
env.seed(1)
no_of_states = env.observation_space.shape[0]
no_of_actions = env.action_space.shape[0]
a_bound = env.action_space.high
ddpg = DDPG(no_of_actions, no_of_states, a_bound)
total_reward = []
# set the number of episodes
no_of_episodes = 300
for i in range(no_of_episodes):
# initialize the environment
s = env.reset()
ep_reward = 0
for j in range(epsiode_steps):
env.render()
# select action by adding noise through OU process
a = ddpg.choose_action(s)
# peform the action and move to the next state s
s_, r, done, info = env.step(a)
# store the the transition to our experience buffer
# sample some minibatch of experience and train the network
ddpg.store_transition(s, a, r, s_)
# update current state as next state
s = s_
# add episodic rewards
ep_reward += r
if int(ep_reward) == 0 and i > 150:
ddpg.save()
print("save")
quit()
if j == epsiode_steps - 1:
total_reward.append(ep_reward)
print('Episode:', i, ' Reward: %i' % int(ep_reward))
break

I solved this problem completely by rewriting the code and adding the learning function in a separate session

Cartpole-v0 loss increasing using DQN

Hi I'm trying to train a DQN to solve gym's Cartpole problem.
For some reason the Loss looks like this (orange line). Can y'all take a look at my code and help with this? I've played around with the hyperparameters a decent bit so I don't think they're the issue here.
class DQN(nn.Module):
def __init__(self, input_dim, output_dim):
super(DQN, self).__init__()
self.linear1 = nn.Linear(input_dim, 16)
self.linear2 = nn.Linear(16, 32)
self.linear3 = nn.Linear(32, 32)
self.linear4 = nn.Linear(32, output_dim)
def forward(self, x):
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = F.relu(self.linear3(x))
return self.linear4(x)
final_epsilon = 0.05
initial_epsilon = 1
epsilon_decay = 5000
global steps_done
steps_done = 0
def select_action(state):
global steps_done
sample = random.random()
eps_threshold = final_epsilon + (initial_epsilon - final_epsilon) * \
math.exp(-1. * steps_done / epsilon_decay)
if sample > eps_threshold:
with torch.no_grad():
state = torch.Tensor(state)
steps_done += 1
q_calc = model(state)
node_activated = int(torch.argmax(q_calc))
return node_activated
else:
node_activated = random.randint(0,1)
steps_done += 1
return node_activated
class ReplayMemory(object): # Stores [state, reward, action, next_state, done]
def __init__(self, capacity):
self.capacity = capacity
self.memory = [[],[],[],[],[]]
def push(self, data):
"""Saves a transition."""
for idx, point in enumerate(data):
#print("Col {} appended {}".format(idx, point))
self.memory[idx].append(point)
def sample(self, batch_size):
rows = random.sample(range(0, len(self.memory[0])), batch_size)
experiences = [[],[],[],[],[]]
for row in rows:
for col in range(5):
experiences[col].append(self.memory[col][row])
return experiences
def __len__(self):
return len(self.memory[0])
input_dim, output_dim = 4, 2
model = DQN(input_dim, output_dim)
target_net = DQN(input_dim, output_dim)
target_net.load_state_dict(model.state_dict())
target_net.eval()
tau = 2
discount = 0.99
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
memory = ReplayMemory(65536)
BATCH_SIZE = 128
def optimize_model():
if len(memory) < BATCH_SIZE:
return 0
experiences = memory.sample(BATCH_SIZE)
state_batch = torch.Tensor(experiences[0])
action_batch = torch.LongTensor(experiences[1]).unsqueeze(1)
reward_batch = torch.Tensor(experiences[2])
next_state_batch = torch.Tensor(experiences[3])
done_batch = experiences[4]
pred_q = model(state_batch).gather(1, action_batch)
next_state_q_vals = torch.zeros(BATCH_SIZE)
for idx, next_state in enumerate(next_state_batch):
if done_batch[idx] == True:
next_state_q_vals[idx] = -1
else:
# .max in pytorch returns (values, idx), we only want vals
next_state_q_vals[idx] = (target_net(next_state_batch[idx]).max(0)[0]).detach()
better_pred = (reward_batch + next_state_q_vals).unsqueeze(1)
loss = F.smooth_l1_loss(pred_q, better_pred)
optimizer.zero_grad()
loss.backward()
for param in model.parameters():
param.grad.data.clamp_(-1, 1)
optimizer.step()
return loss
points = []
losspoints = []
#save_state = torch.load("models/DQN_target_11.pth")
#model.load_state_dict(save_state['state_dict'])
#optimizer.load_state_dict(save_state['optimizer'])
env = gym.make('CartPole-v0')
for i_episode in range(5000):
observation = env.reset()
episode_loss = 0
if episode % tau == 0:
target_net.load_state_dict(model.state_dict())
for t in range(1000):
#env.render()
state = observation
action = select_action(observation)
observation, reward, done, _ = env.step(action)
if done:
next_state = [0,0,0,0]
else:
next_state = observation
memory.push([state, action, reward, next_state, done])
episode_loss = episode_loss + float(optimize_model(i_episode))
if done:
points.append((i_episode, t+1))
print("Episode {} finished after {} timesteps".format(i_episode, t+1))
print("Avg Loss: ", episode_loss / (t+1))
losspoints.append((i_episode, episode_loss / (t+1)))
if (i_episode % 100 == 0):
eps = final_epsilon + (initial_epsilon - final_epsilon) * \
math.exp(-1. * steps_done / epsilon_decay)
print(eps)
if ((i_episode+1) % 5001 == 0):
save = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
torch.save(save, "models/DQN_target_" + str(i_episode // 5000) + ".pth")
break
env.close()
x = [coord[0] * 100 for coord in points]
y = [coord[1] for coord in points]
x2 = [coord[0] * 100 for coord in losspoints]
y2 = [coord[1] for coord in losspoints]
plt.plot(x, y)
plt.plot(x2, y2)
plt.show()
I basically followed the tutorial pytorch has, except using the state returned by the env rather than the pixels. I also changed the replay memory because I was having issues there. Other than that, I left everything else pretty much the same.
Edit:
I tried overfitting on a small batch and the Loss looks like this without updating the target net and this when updating it
Edit 2:
This is definitely an issue with the target net, I tried removing it and loss seemed to not increase exponentially

Your tau value is too small, small target network update cause DQN traning unstable. You can try to use 1000 (OpenAI Baseline's DQN example) or 10000 (Deepmind's Nature paper).
In Deepmind's 2015 Nature paper, it states that:
The second modification to online Q-learning aimed at further improving the stability of our method with neural networks is to use a separate network for generating the traget yj in the Q-learning update. More precisely, every C updates we clone the network Q to obtain a target network Q' and use Q' for generating the Q-learning targets yj for the following C updates to Q.
This modification makes the algorithm more stable compared to standard online Q-learning, where an update that increases Q(st,at) often also increases Q(st+1, a) for all a and hence also increases the target yj, possibly leading to oscillations or divergence of the policy. Generating the targets using the older set of parameters adds a delay between the time an update to Q is made and the time the update affects the targets yj, making divergence or oscillations much more unlikely.
Human-level control through deep reinforcement
learning, Mnih et al., 2015
I've run your code with settings of tau=2, tau=10, tau=100, tau=1000 and tau=10000. The update frequency of tau=100 solves the problem (reach maximum steps of 200).
tau=2
tau=10
tau=100
tau=1000
tau=10000
Below is the modified version of your code.
import random
import math
import matplotlib.pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
import gym
class DQN(nn.Module):
def __init__(self, input_dim, output_dim):
super(DQN, self).__init__()
self.linear1 = nn.Linear(input_dim, 16)
self.linear2 = nn.Linear(16, 32)
self.linear3 = nn.Linear(32, 32)
self.linear4 = nn.Linear(32, output_dim)
def forward(self, x):
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = F.relu(self.linear3(x))
return self.linear4(x)
final_epsilon = 0.05
initial_epsilon = 1
epsilon_decay = 5000
global steps_done
steps_done = 0
def select_action(state):
global steps_done
sample = random.random()
eps_threshold = final_epsilon + (initial_epsilon - final_epsilon) * \
math.exp(-1. * steps_done / epsilon_decay)
if sample > eps_threshold:
with torch.no_grad():
state = torch.Tensor(state)
steps_done += 1
q_calc = model(state)
node_activated = int(torch.argmax(q_calc))
return node_activated
else:
node_activated = random.randint(0,1)
steps_done += 1
return node_activated
class ReplayMemory(object): # Stores [state, reward, action, next_state, done]
def __init__(self, capacity):
self.capacity = capacity
self.memory = [[],[],[],[],[]]
def push(self, data):
"""Saves a transition."""
for idx, point in enumerate(data):
#print("Col {} appended {}".format(idx, point))
self.memory[idx].append(point)
def sample(self, batch_size):
rows = random.sample(range(0, len(self.memory[0])), batch_size)
experiences = [[],[],[],[],[]]
for row in rows:
for col in range(5):
experiences[col].append(self.memory[col][row])
return experiences
def __len__(self):
return len(self.memory[0])
input_dim, output_dim = 4, 2
model = DQN(input_dim, output_dim)
target_net = DQN(input_dim, output_dim)
target_net.load_state_dict(model.state_dict())
target_net.eval()
tau = 100
discount = 0.99
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
memory = ReplayMemory(65536)
BATCH_SIZE = 128
def optimize_model():
if len(memory) < BATCH_SIZE:
return 0
experiences = memory.sample(BATCH_SIZE)
state_batch = torch.Tensor(experiences[0])
action_batch = torch.LongTensor(experiences[1]).unsqueeze(1)
reward_batch = torch.Tensor(experiences[2])
next_state_batch = torch.Tensor(experiences[3])
done_batch = experiences[4]
pred_q = model(state_batch).gather(1, action_batch)
next_state_q_vals = torch.zeros(BATCH_SIZE)
for idx, next_state in enumerate(next_state_batch):
if done_batch[idx] == True:
next_state_q_vals[idx] = -1
else:
# .max in pytorch returns (values, idx), we only want vals
next_state_q_vals[idx] = (target_net(next_state_batch[idx]).max(0)[0]).detach()
better_pred = (reward_batch + next_state_q_vals).unsqueeze(1)
loss = F.smooth_l1_loss(pred_q, better_pred)
optimizer.zero_grad()
loss.backward()
for param in model.parameters():
param.grad.data.clamp_(-1, 1)
optimizer.step()
return loss
points = []
losspoints = []
#save_state = torch.load("models/DQN_target_11.pth")
#model.load_state_dict(save_state['state_dict'])
#optimizer.load_state_dict(save_state['optimizer'])
env = gym.make('CartPole-v0')
for i_episode in range(5000):
observation = env.reset()
episode_loss = 0
if i_episode % tau == 0:
target_net.load_state_dict(model.state_dict())
for t in range(1000):
#env.render()
state = observation
action = select_action(observation)
observation, reward, done, _ = env.step(action)
if done:
next_state = [0,0,0,0]
else:
next_state = observation
memory.push([state, action, reward, next_state, done])
episode_loss = episode_loss + float(optimize_model())
if done:
points.append((i_episode, t+1))
print("Episode {} finished after {} timesteps".format(i_episode, t+1))
print("Avg Loss: ", episode_loss / (t+1))
losspoints.append((i_episode, episode_loss / (t+1)))
if (i_episode % 100 == 0):
eps = final_epsilon + (initial_epsilon - final_epsilon) * \
math.exp(-1. * steps_done / epsilon_decay)
print(eps)
if ((i_episode+1) % 5001 == 0):
save = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
torch.save(save, "models/DQN_target_" + str(i_episode // 5000) + ".pth")
break
env.close()
x = [coord[0] * 100 for coord in points]
y = [coord[1] for coord in points]
x2 = [coord[0] * 100 for coord in losspoints]
y2 = [coord[1] for coord in losspoints]
plt.plot(x, y)
plt.plot(x2, y2)
plt.show()
Here's the result of your plotting code.
tau=100
tau=10000

Training speed on GPU become slower overtime

My model training speed becomes slower over time. Every epoch take longer time to train.
Here is the full source code with my preprocess sentiment tree bank data (put glove.840B.300d.txt into data/glove).
Install some python packages:
pip install meowlogtool
pip install tqdm
Command to run:
python sentiment.py --emblr 0 --rel_dim 0 --tag_dim 0 --optim adagrad --name basic --lr 0.05 --wd 1e-4 --at_hid_dim 0
Model source code for you to read
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable as Var
import utils
import Constants
from model import SentimentModule
from embedding_model import EmbeddingModel
class SimpleGRU(nn.Module):
"""
w[i] : (300, 1)
h[i] : (150, 1)
p[i] : (20, 1)
r[i] : (20, 1)
k[i] : (150, 1)
x[i] : (20 + 150 + 300 + 20 = 490, 1) (490, 1)
Uz, Ur, Uh : (150, 150) => 67500 => (450, 450)
Wz, Wr, Wh : (150, 20 + 150 + 300 + 20) (150, 490)
"""
def __init__(self, cuda, in_dim, hid_dim):
super(SimpleGRU, self).__init__()
self.cudaFlag = cuda
self.Uz = nn.Linear(hid_dim, hid_dim)
self.Ur = nn.Linear(hid_dim, hid_dim)
self.Uh = nn.Linear(hid_dim, hid_dim)
self.Wz = nn.Linear(in_dim, hid_dim)
self.Wr = nn.Linear(in_dim, hid_dim)
self.Wh = nn.Linear(in_dim, hid_dim)
if self.cudaFlag:
self.Uz = self.Uz.cuda()
self.Ur = self.Uz.cuda()
self.Uh = self.Uz.cuda()
self.Wz = self.Wz.cuda()
self.Wr = self.Wr.cuda()
self.Wh = self.Wh.cuda()
def forward(self, x, h_prev):
"""
Simple-GRU(compress_x[v], h[t-1]) :
z[t] := s(Wz *compress_x[t]+ Uz * h[t-1] + bz)
r[t] := s(Wr * compress_x[t] + Ur * h[t-1] + br)
h_temp[t] := g(Wh * compress_x[t] + Uh * h[t-1] + bh)
h[t] := r[t] .* h[t-1] + (1 - z[t]) .* h_temp[t]
return h[t]
:param x: compress_x[t]
:param h_prev: h[t-1]
:return:
"""
z = F.sigmoid(self.Wz(x) + self.Uz(h_prev))
r = F.sigmoid(self.Wr(x) + self.Ur(h_prev))
h_temp = F.tanh(self.Wh(x) + self.Uh(h_prev))
h = r*h_prev + (1-z)*h_temp
return h
class TreeSimpleGRU(nn.Module):
def __init__(self, cuda, word_dim, tag_dim, rel_dim, mem_dim, at_hid_dim, criterion, leaf_h = None):
super(TreeSimpleGRU, self).__init__()
self.cudaFlag = cuda
# self.gru_cell = nn.GRUCell(word_dim + tag_dim, mem_dim)
self.gru_cell = SimpleGRU(self.cudaFlag, word_dim+tag_dim, mem_dim)
self.gru_at = GRU_AT(self.cudaFlag, word_dim + tag_dim + rel_dim + mem_dim, at_hid_dim ,mem_dim)
self.mem_dim = mem_dim
self.in_dim = word_dim
self.tag_dim = tag_dim
self.rel_dim = rel_dim
self.leaf_h = leaf_h # init h for leaf node
if self.leaf_h == None:
self.leaf_h = Var(torch.rand(1, self.mem_dim))
torch.save(self.leaf_h, 'leaf_h.pth')
if self.cudaFlag:
self.leaf_h = self.leaf_h.cuda()
self.criterion = criterion
self.output_module = None
def getParameters(self):
"""
Get flatParameters
note that getParameters and parameters is not equal in this case
getParameters do not get parameters of output module
:return: 1d tensor
"""
params = []
for m in [self.gru_cell, self.gru_at]:
# we do not get param of output module
l = list(m.parameters())
params.extend(l)
one_dim = [p.view(p.numel()) for p in params]
params = F.torch.cat(one_dim)
return params
def set_output_module(self, output_module):
self.output_module = output_module
def forward(self, tree, w_emb, tag_emb, rel_emb, training = False):
loss = Var(torch.zeros(1)) # init zero loss
if self.cudaFlag:
loss = loss.cuda()
for idx in xrange(tree.num_children):
_, child_loss = self.forward(tree.children[idx], w_emb, tag_emb, rel_emb, training)
loss = loss + child_loss
if tree.num_children > 0:
child_rels, child_k = self.get_child_state(tree, rel_emb)
if self.tag_dim > 0:
tree.state = self.node_forward(w_emb[tree.idx - 1], tag_emb[tree.idx -1], child_rels, child_k)
else:
tree.state = self.node_forward(w_emb[tree.idx - 1], None, child_rels, child_k)
elif tree.num_children == 0:
if self.tag_dim > 0:
tree.state = self.leaf_forward(w_emb[tree.idx - 1], tag_emb[tree.idx -1])
else:
tree.state = self.leaf_forward(w_emb[tree.idx - 1], None)
if self.output_module != None:
output = self.output_module.forward(tree.state, training)
tree.output = output
if training and tree.gold_label != None:
target = Var(utils.map_label_to_target_sentiment(tree.gold_label))
if self.cudaFlag:
target = target.cuda()
loss = loss + self.criterion(output, target)
return tree.state, loss
def leaf_forward(self, word_emb, tag_emb):
"""
Forward function for leaf node
:param word_emb: word embedding of current node u
:param tag_emb: tag embedding of current node u
:return: k of current node u
"""
h = self.leaf_h
if self.cudaFlag:
h = h.cuda()
if self.tag_dim > 0:
x = F.torch.cat([word_emb, tag_emb], 1)
else:
x = word_emb
k = self.gru_cell(x, h)
return k
def node_forward(self, word_emb, tag_emb, child_rels, child_k):
"""
Foward function for inner node
:param word_emb: word embedding of current node u
:param tag_emb: tag embedding of current node u
:param child_rels (tensor): rels embedding of child node v
:param child_k (tensor): k of child node v
:return:
"""
n_child = child_k.size(0)
h = Var(torch.zeros(1, self.mem_dim))
if self.cudaFlag:
h = h.cuda()
for i in range(0, n_child):
k = child_k[i]
x_list = [word_emb, k]
if self.rel_dim >0:
rel = child_rels[i]
x_list.append(rel)
if self.tag_dim > 0:
x_list.append(tag_emb)
x = F.torch.cat(x_list, 1)
h = self.gru_at(x, h)
k = h
return k
def get_child_state(self, tree, rels_emb):
"""
Get child rels, get child k
:param tree: tree we need to get child
:param rels_emb (tensor):
:return:
"""
if tree.num_children == 0:
assert False # never get here
else:
child_k = Var(torch.Tensor(tree.num_children, 1, self.mem_dim))
if self.rel_dim>0:
child_rels = Var(torch.Tensor(tree.num_children, 1, self.rel_dim))
else:
child_rels = None
if self.cudaFlag:
child_k = child_k.cuda()
if self.rel_dim > 0:
child_rels = child_rels.cuda()
for idx in xrange(tree.num_children):
child_k[idx] = tree.children[idx].state
if self.rel_dim > 0:
child_rels[idx] = rels_emb[tree.children[idx].idx - 1]
return child_rels, child_k
class AT(nn.Module):
"""
AT(compress_x[v]) := sigmoid(Wa * tanh(Wb * compress_x[v] + bb) + ba)
"""
def __init__(self, cuda, in_dim, hid_dim):
super(AT, self).__init__()
self.cudaFlag = cuda
self.in_dim = in_dim
self.hid_dim = hid_dim
self.Wa = nn.Linear(hid_dim, 1)
self.Wb = nn.Linear(in_dim, hid_dim)
if self.cudaFlag:
self.Wa = self.Wa.cuda()
self.Wb = self.Wb.cuda()
def forward(self, x):
out = F.sigmoid(self.Wa(F.tanh(self.Wb(x))))
return out
class GRU_AT(nn.Module):
def __init__(self, cuda, in_dim, at_hid_dim ,mem_dim):
super(GRU_AT, self).__init__()
self.cudaFlag = cuda
self.in_dim = in_dim
self.mem_dim = mem_dim
self.at_hid_dim = at_hid_dim
if at_hid_dim > 0:
self.at = AT(cuda, in_dim, at_hid_dim)
self.gru_cell = SimpleGRU(self.cudaFlag, in_dim, mem_dim)
if self.cudaFlag:
if at_hid_dim > 0:
self.at = self.at.cuda()
self.gru_cell = self.gru_cell.cuda()
def forward(self, x, h_prev):
"""
:param x:
:param h_prev:
:return: a * m + (1 - a) * h[t-1]
"""
m = self.gru_cell(x, h_prev)
if self.at_hid_dim > 0:
a = self.at.forward(x)
h = torch.mm(a, m) + torch.mm((1-a), h_prev)
else:
h = m
return h
class TreeGRUSentiment(nn.Module):
def __init__(self, cuda, in_dim, tag_dim, rel_dim, mem_dim, at_hid_dim, num_classes, criterion):
super(TreeGRUSentiment, self).__init__()
self.cudaFlag = cuda
self.tree_module = TreeSimpleGRU(cuda, in_dim, tag_dim, rel_dim, mem_dim, at_hid_dim, criterion)
self.output_module = SentimentModule(cuda, mem_dim, num_classes, dropout=True)
self.tree_module.set_output_module(self.output_module)
def get_tree_parameters(self):
return self.tree_module.getParameters()
def forward(self, tree, sent_emb, tag_emb, rel_emb, training = False):
# sent_emb = F.torch.unsqueeze(self.word_embedding.forward(sent_inputs), 1)
# tag_emb = F.torch.unsqueeze(self.tag_emb.forward(tag_inputs), 1)
# rel_emb = F.torch.unsqueeze(self.rel_emb.forward(rel_inputs), 1)
# sent_emb, tag_emb, rel_emb = self.embedding_model(sent_inputs, tag_inputs, rel_inputs)
tree_state, loss = self.tree_module(tree, sent_emb, tag_emb, rel_emb, training)
output = tree.output
return output, loss

Why does neural network learning slow down as the error gets lower?
The reasons for the slowdown are not fully understood, but we have some basic ideas.
For classifiers, most training examples start out as incorrectly classified. Over time, more of them become correctly classified. Early in learning, you might have a nearly 100% error rate, so every example in the minibatch contributes to learning. Late in learning, you might have nearly a 0% error rate, so almost none of the examples in the minibatch contribute to learning. This problem can be resolved to some extent by using hard example mining or importance sampling. Both of these are just techniques for training on more difficult examples more often.
There are other more complicated reasons. One of them is that the condition number of the Hessian tends to worsen a lot as learning progresses, so that the optimal step size becomes smaller and smaller.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Deep SARSA, Agent does not work agent after training - python

Related

Neural Network: Different number of nodes and hidden layers, but exact same test & train accuracy

Use q-learning method to solve knapsack problem

How can I save DDPG model?

Cartpole-v0 loss increasing using DQN

Training speed on GPU become slower overtime

Categories

Resources