I try to save the model using the saver method (I use the save function in the DDPG class to save), but when restoring the model the result is far from the one I saved (I save the model when the episodic reward is zero; the restore call in the code is commented out). My code is below with all the functions. I use Python 3.7, gym 0.16.0 and TensorFlow 1.13.1.
import tensorflow as tf
import numpy as np
import gym
episode_steps = 500
# learning rate for actor
lr_a = 0.001
# learning rate for critic
lr_c = 0.002
gamma = 0.9
alpha = 0.01
memory = 10000
batch_size = 32
render = True
class DDPG(object):
def __init__(self, no_of_actions, no_of_states, a_bound, ):
self.memory = np.zeros((memory, no_of_states * 2 + no_of_actions + 1), dtype=np.float32)
# initialize pointer to point to our experience buffer
self.pointer = 0
self.sess = tf.Session()
# initialize the variance for OU process for exploring policies
self.noise_variance = 3.0
self.no_of_actions, self.no_of_states, self.a_bound = no_of_actions, no_of_states, a_bound,
self.state = tf.placeholder(tf.float32, [None, no_of_states], 's')
self.next_state = tf.placeholder(tf.float32, [None, no_of_states], 's_')
self.reward = tf.placeholder(tf.float32, [None, 1], 'r')
with tf.variable_scope('Actor'):
self.a = self.build_actor_network(self.state, scope='eval', trainable=True)
a_ = self.build_actor_network(self.next_state, scope='target', trainable=False)
with tf.variable_scope('Critic'):
q = self.build_critic_network(self.state, self.a, scope='eval', trainable=True)
q_ = self.build_critic_network(self.next_state, a_, scope='target', trainable=False)
self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')
# update target value
self.soft_replace = [
[tf.assign(at, (1 - alpha) * at + alpha * ae), tf.assign(ct, (1 - alpha) * ct + alpha * ce)]
for at, ae, ct, ce in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]
q_target = self.reward + gamma * q_
# compute TD error i.e actual - predicted values
td_error = tf.losses.mean_squared_error(labels=(self.reward + gamma * q_), predictions=q)
# train the critic network with adam optimizer
self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, name="adam-ink", var_list=self.ce_params)
a_loss = - tf.reduce_mean(q)
# train the actor network with adam optimizer for minimizing the loss
self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)
tf.summary.FileWriter("logs2", self.sess.graph)
# initialize all variables
self.sess.run(tf.global_variables_initializer())
# saver
self.saver = tf.train.Saver()
# self.saver.restore(self.sess, "Pendulum/nn.ckpt")
def choose_action(self, s):
a = self.sess.run(self.a, {self.state: s[np.newaxis, :]})[0]
a = np.clip(np.random.normal(a, self.noise_variance), -2, 2)
return a
def learn(self):
# soft target replacement
self.sess.run(self.soft_replace)
indices = np.random.choice(memory, size=batch_size)
batch_transition = self.memory[indices, :]
batch_states = batch_transition[:, :self.no_of_states]
batch_actions = batch_transition[:, self.no_of_states: self.no_of_states + self.no_of_actions]
batch_rewards = batch_transition[:, -self.no_of_states - 1: -self.no_of_states]
batch_next_state = batch_transition[:, -self.no_of_states:]
self.sess.run(self.atrain, {self.state: batch_states})
self.sess.run(self.ctrain, {self.state: batch_states, self.a: batch_actions, self.reward: batch_rewards,
self.next_state: batch_next_state})
def store_transition(self, s, a, r, s_):
trans = np.hstack((s, a, [r], s_))
index = self.pointer % memory
self.memory[index, :] = trans
self.pointer += 1
if self.pointer > memory:
self.noise_variance *= 0.99995
self.learn()
def build_actor_network(self, s, scope, trainable):
# Actor DPG
with tf.variable_scope(scope):
l1 = tf.layers.dense(s, 30, activation=tf.nn.tanh, name='l1', trainable=trainable)
a = tf.layers.dense(l1, self.no_of_actions, activation=tf.nn.tanh, name='a', trainable=trainable)
return tf.multiply(a, self.a_bound, name="scaled_a")
def build_critic_network(self, s, a, scope, trainable):
with tf.variable_scope(scope):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.no_of_states, n_l1], trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.no_of_actions, n_l1], trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
net = tf.nn.tanh(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
q = tf.layers.dense(net, 1, trainable=trainable)
return q
def save(self):
self.saver.save(self.sess, "Pendulum/nn.ckpt")
env = gym.make("Pendulum-v0")
env = env.unwrapped
env.seed(1)
no_of_states = env.observation_space.shape[0]
no_of_actions = env.action_space.shape[0]
a_bound = env.action_space.high
ddpg = DDPG(no_of_actions, no_of_states, a_bound)
total_reward = []
# set the number of episodes
no_of_episodes = 300
for i in range(no_of_episodes):
# initialize the environment
s = env.reset()
ep_reward = 0
for j in range(episode_steps):
env.render()
# select action by adding noise through OU process
a = ddpg.choose_action(s)
# perform the action and move to the next state s_
s_, r, done, info = env.step(a)
# store the transition to our experience buffer
# sample some minibatch of experience and train the network
ddpg.store_transition(s, a, r, s_)
# update current state as next state
s = s_
# add episodic rewards
ep_reward += r
if int(ep_reward) == 0 and i > 150:
ddpg.save()
print("save")
quit()
if j == episode_steps - 1:
total_reward.append(ep_reward)
print('Episode:', i, ' Reward: %i' % int(ep_reward))
break
I solved this problem completely by rewriting the code and moving the learning function into a separate session.
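For anyone hitting the same symptom, here is a minimal sketch of the usual tf.train.Saver save/restore pattern in TF 1.x (assuming the graph is rebuilt identically before restoring, and that the exploration noise added in choose_action is switched off when judging the restored policy):

import tensorflow as tf

# build the same DDPG graph as above first, then:
saver = tf.train.Saver()

# --- saving (training run) ---
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # ... training loop ...
    saver.save(sess, "Pendulum/nn.ckpt")

# --- restoring (evaluation run) ---
with tf.Session() as sess:
    saver.restore(sess, "Pendulum/nn.ckpt")
    # do NOT run tf.global_variables_initializer() after restore; it would
    # overwrite the restored weights. Also evaluate the actor without the
    # large exploration noise used during training, or the behaviour will look random.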
I've been doing a very simple binary cat/dog classification project with machine learning. I understand the problem of overfitting, but what's strange in my case is that the validation loss begins to rise from the very beginning. I've tried many different sets of hyperparameters, with L2 regularization, learning rate decay and stochastic gradient descent, and a large training set, but the issue remained. Here is the learning graph from one of the trials (the horizontal axis should be per 10 epochs):
The hyperparameters are: two hidden layers with 50 and 10 units, initial alpha = 0.05, alpha decay rate = 0.95 per 50 epochs, mini-batch size = 64, lambda = 0.05
Here are other sample learning graphs:
I developed my model on the basis of what's provided in Andrew Ng's Deep Learning Specialization, so I didn't expect many bugs. My full code, as required, is attached below:
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
from scipy import special
#Data Preprocessing (the same for dev set, which I omit here)
path = '/Users/bobby/Downloads/kagglecatsanddogs_3367a/PetImages'
train_set = []
img_size = 80
categories = ['dogs_train','cats_train']
epsilon = 1e-8
for category in categories:
path_animal = os.path.join(path, category)
for img in os.listdir(path_animal):
try:
img_array = cv2.imread(os.path.join(path_animal, img), cv2.IMREAD_GRAYSCALE)
new_img_array = cv2.resize(img_array, (img_size, img_size))
flattened_img_array = new_img_array.reshape(img_size*img_size)
train_set.append([flattened_img_array, categories.index(category)])
except:
continue
import random
random.shuffle(train_set)
X_train = []
Y_train = []
for sample in train_set:
X_train.append(sample[0])
Y_train.append(sample[1])
X_train = (np.array(X_train).T)/255
Y_train = np.array(Y_train).reshape((1, np.array(Y_train).shape[0]))
def create_mini_batches(X, Y, mini_batch_size):
m = X.shape[1]
mini_batches = []
num_mini_batches = m // mini_batch_size
permutation = list(np.random.permutation(m))
shuffled_X = X[:, permutation]
shuffled_Y = Y[:, permutation]
for i in range(num_mini_batches):
select_X = shuffled_X[:, mini_batch_size*i : mini_batch_size*(i+1)]
select_Y = shuffled_Y[:, mini_batch_size*i : mini_batch_size*(i+1)]
mini_batch = (select_X, select_Y)
mini_batches.append(mini_batch)
if m % mini_batch_size != 0:
last_X = shuffled_X[:, mini_batch_size*num_mini_batches:m]
last_Y = shuffled_Y[:, mini_batch_size*num_mini_batches:m]
last_mini_batch = (last_X, last_Y)
mini_batches.append(last_mini_batch)
return mini_batches
def initialize_parameters(layers_dims):
L = len(layers_dims) # number of layers (including input layer), in this case L=4.
parameters = {}
for l in range(1,L): # range(1,4).
parameters['W' + str(l)] = np.random.randn(layers_dims[l],layers_dims[l-1]) * np.sqrt(2/layers_dims[l-1])
parameters['b' + str(l)] = np.zeros((layers_dims[l],1))
return parameters
def sigmoid(Z):
A = special.expit(Z)
return A,Z
def relu(Z):
A = np.maximum(0.01*Z, Z)  # note: this is leaky ReLU (slope 0.01), not plain ReLU
return A,Z
def forward_propagation(X, parameters):
caches = [] #list containing Z for every node
A = X
L = int(len(parameters)/2)
for l in range(1,L):
A_prev = A
W = parameters['W'+str(l)]
b = parameters['b'+str(l)]
Z = np.dot(W, A_prev) + b
A, activation_cache = relu(Z) #activation_cache contains z[l].
linear_cache = (A_prev, W, b) #linear_cache contains A[l-1], W[l], b[l].
cache = (linear_cache, activation_cache)
caches.append(cache)
W = parameters['W'+str(L)]
b = parameters['b'+str(L)]
Z = np.dot(W, A) + b
AL, activation_cache = sigmoid(Z)
linear_cache = (A, W, b)
cache = (linear_cache, activation_cache)
caches.append(cache)
return AL, caches
def compute_cost(AL, Y, parameters, lambd):
m = Y.shape[1] # number of examples
L = int(len(parameters)/2) #[6400,100,20,1] L=3 (0,1,2)
reg_cost = 0
for l in range(L):
W = parameters['W' + str(l+1)]
reg_cost += np.sum(np.square(W))
J = (-1/m)*(np.sum(Y*np.log(AL+epsilon)+(1-Y)*np.log(1-AL+epsilon))) + (1/m) * (lambd/2) * reg_cost
J = np.squeeze(J)
return J
def linear_backward(dZ, linear_cache, lambd):
A_prev, W, b = linear_cache
m = A_prev.shape[1]
dW = (1/m) * np.dot(dZ,A_prev.T) + (lambd/m)*W
db = (1/m) * np.sum(dZ,axis=1,keepdims=True)
dA_prev = np.dot(W.T,dZ)
return dA_prev, dW, db
def relu_gradient(Z):
dZ = np.where(Z > 0, 1, 0.01)
return dZ
def sigmoid_gradient(Z):
dZ = special.expit(Z)*(1-special.expit(Z))
return dZ
def linear_activation_backward(dA, cache, lambd, A, Y, activation):
linear_cache, activation_cache = cache
if activation == 'relu':
dZ = dA * relu_gradient(activation_cache)
dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
elif activation == 'sigmoid':
dZ = A - Y
dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
return dA_prev, dW, db
def L_model_backward(AL, Y, caches, lambd):
grads = {}
L = len(caches)
m = AL.shape[1]
Y = Y.reshape(AL.shape)
cache_final_layer = caches[L-1]
grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(_, cache_final_layer, lambd, AL, Y, activation='sigmoid')
for l in reversed(range(L-1)):
current_cache = caches[l]
grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = linear_activation_backward(grads['dA' + str(l+1)], current_cache, lambd, _, _, activation='relu')
return grads
def update_parameters(parameters, grads, learning_rate):
L = len(parameters) // 2
for l in range(L):
parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
return parameters
def Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, layers_dims, learning_rate, num_epoch, mini_batch_size, lambd, k):
mini_batches = create_mini_batches(X_train, Y_train, mini_batch_size) #[(X{1},Y{1}),(X{2},Y{2}),...,(X{n},Y{n})]
costs_train = []
costs_dev = []
parameters = initialize_parameters(layers_dims)
AL_dev, caches_dev = forward_propagation(X_dev, parameters)
J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
costs_dev.append(J_dev)
for i in range(num_epoch):
for mini_batch in mini_batches:
(minibatch_X, minibatch_Y) = mini_batch
AL, caches = forward_propagation(minibatch_X, parameters)
J_train = compute_cost(AL, minibatch_Y, parameters, lambd)
grads = L_model_backward(AL, minibatch_Y, caches, lambd)
parameters = update_parameters(parameters, grads, learning_rate)
if i % 10 == 0:
costs_train.append(J_train)
AL_dev, caches_dev = forward_propagation(X_dev, parameters)
J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
costs_dev.append(J_dev)
if i % 100 == 0:
print ("Cost after epoch %i: %f" %(i, J_train))
learning_rate = learning_rate * (k**(i/50))
plt.plot(np.squeeze(costs_train),'r')
plt.plot(np.squeeze(costs_dev),'b')
plt.ylabel('cost')
plt.xlabel('epochs (per thirties)')
plt.show()
return parameters, costs_train, costs_dev
parameters_updated, costs_train, costs_dev = Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, [6400, 50, 10, 1], 0.05, 1000, 64, 0.05, 0.95)
I would really be grateful to anyone who is patient enough to read through my code. If the problem is still overfitting, could you offer some advice as to how to address this issue? I'm at a loss here because the validation loss goes up at a very early stage, so early stopping would cause underfitting by preventing the model from learning more deeply. Any advice would be appreciated.
When the validation loss starts to increase right from the beginning, like in the images you added, it means there is something wrong with the model.
It's not clear what it is, as you didn't show your model.
You could check the following links that will help you:
Basic Cats vs Dogs Detailed Example in Colab
Detailed explanation for Over-fitting in TF Tutorial
or add your full code
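If it does turn out to be plain overfitting, one concrete lever from the same course material is inverted dropout. Below is only a rough sketch of how it could slot into a numpy forward pass like the one in the question (keep_prob is a hypothetical hyperparameter; the mask has to be cached for backprop, and dropout must be disabled at evaluation time):

import numpy as np

def hidden_step_with_dropout(A_prev, W, b, keep_prob):
    """One hidden-layer step with inverted dropout (training time only)."""
    Z = np.dot(W, A_prev) + b
    A = np.maximum(0.01 * Z, Z)                 # same leaky ReLU as in the question
    D = np.random.rand(*A.shape) < keep_prob    # dropout mask
    A = (A * D) / keep_prob                     # scale so the expected activation is unchanged
    return A, Z, D                              # reuse D in backprop: dA = (dA * D) / keep_prob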
I am trying to interface CasADi and Tensorflow. CasADi is a toolbox that uses symbolic variables and does automatic differentiation. It is often used for dynamic/static optimization problems.
I found an example where GPflow is used (https://web.casadi.org/blog/tensorflow/). In this case, the GP model is first trained with data as follows
data = np.random.normal(loc=0.5,scale=1,size=(N,nd))
value = np.random.random((N,1))
model = gpflow.models.GPR(data, value, gpflow.kernels.Constant(nd) + gpflow.kernels.Linear(nd) + gpflow.kernels.White(nd) + gpflow.kernels.RBF(nd))
gpflow.train.ScipyOptimizer().minimize(model)
Then the prediction model is built by passing a tensor rather than the real values
X = tf.placeholder(shape=(1,nd),dtype=np.float64)
[mean,_] = model._build_predict(X)
This way CasADi can substitute real values by using a callback function that calls TensorFlow.
I want to use a tf.keras.Sequential() model instead of a GPflow model, since I want to implement a recurrent neural network. But for the Sequential model the method _build_predict(X) does not exist. I tried to use just predict, but I get the following error:
InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype double and shape [35039,1,8]
[[{{node Placeholder}}]]
Do you know what the equivalent is in this case?
Here the complete code using GPflow
from casadi import *
T = 10. # Time horizon
N = 20 # number of control intervals
# Declare model variables
x1 = MX.sym('x1')
x2 = MX.sym('x2')
x = vertcat(x1, x2)
u = MX.sym('u')
# Model equations
xdot = vertcat((1-x2**2)*x1 - x2 + u, x1)
# Formulate discrete time dynamics
if False:
# CVODES from the SUNDIALS suite
dae = {'x':x, 'p':u, 'ode':xdot}
opts = {'tf':T/N}
F = integrator('F', 'cvodes', dae, opts)
else:
# Fixed step Runge-Kutta 4 integrator
M = 4 # RK4 steps per interval
DT = T/N/M
f = Function('f', [x, u], [xdot])
X0 = MX.sym('X0', 2)
U = MX.sym('U')
X = X0
Q = 0
for j in range(M):
k1 = f(X, U)
k2 = f(X + DT/2 * k1, U)
k3 = f(X + DT/2 * k2, U)
k4 = f(X + DT * k3, U)
X=X+DT/6*(k1 +2*k2 +2*k3 +k4)
F = Function('F', [X0, U], [X],['x0','p'],['xf'])
# Start with an empty NLP
w=[]
w0 = []
lbw = []
ubw = []
g=[]
lbg = []
ubg = []
# "Lift" initial conditions
Xk = MX.sym('X0', 2)
w += [Xk]
lbw += [0, 1]
ubw += [0, 1]
w0 += [0, 1]
# Formulate the NLP
for k in range(N):
# New NLP variable for the control
Uk = MX.sym('U_' + str(k))
w += [Uk]
lbw += [-1]
ubw += [1]
w0 += [0]
# Integrate till the end of the interval
Fk = F(x0=Xk, p=Uk)
Xk_end = Fk['xf']
# New NLP variable for state at end of interval
Xk = MX.sym('X_' + str(k+1), 2)
w += [Xk]
lbw += [-0.25, -inf]
ubw += [ inf, inf]
w0 += [0, 0]
# Add equality constraint
g += [Xk_end-Xk]
lbg += [0, 0]
ubg += [0, 0]
nd = N+1
import gpflow
import time
from tensorflow_casadi import TensorFlowEvaluator
class GPR(TensorFlowEvaluator):
def __init__(self, model, session, opts={}):
X = tf.placeholder(shape=(1,nd),dtype=np.float64)
[mean,_] = model._build_predict(X)
mean = tf.reshape(mean,(1,1))
TensorFlowEvaluator.__init__(self,[X],[mean],session,opts)
self.counter = 0
self.time = 0
def eval(self,arg):
self.counter += 1
t0 = time.time()
ret = TensorFlowEvaluator.eval(self,arg)
self.time += time.time()-t0
return [ret]
# Create
np.random.seed(0)
data = np.random.normal(loc=0.5,scale=1,size=(N,nd))
value = np.random.random((N,1))
model = gpflow.models.GPR(data, value, gpflow.kernels.Constant(nd) + gpflow.kernels.Linear(nd) + gpflow.kernels.White(nd) + gpflow.kernels.RBF(nd))
gpflow.train.ScipyOptimizer().minimize(model)
import tensorflow as tf
with tf.Session() as session:
model.initialize()
GPR = GPR(model, session)
w = vertcat(*w)
# Create an NLP solver
prob = {'f': GPR(w[0::3]), 'x': w , 'g': vertcat(*g)}
options = {"ipopt": {"hessian_approximation": "limited-memory"}}
solver = nlpsol('solver', 'ipopt', prob,options);
# Solve the NLP
sol = solver(x0=w0, lbx=lbw, ubx=ubw, lbg=lbg, ubg=ubg)
print("Ncalls",GPR.counter)
print("Total time [s]",GPR.time)
w_opt = sol['x'].full().flatten()
# Plot the solution
x1_opt = w_opt[0::3]
x2_opt = w_opt[1::3]
u_opt = w_opt[2::3]
tgrid = [T/N*k for k in range(N+1)]
import matplotlib.pyplot as plt
plt.figure(1)
plt.clf()
plt.plot(tgrid, x1_opt, '--')
plt.plot(tgrid, x2_opt, '-')
plt.step(tgrid, vertcat(DM.nan(1), u_opt), '-.')
plt.xlabel('t')
plt.legend(['x1','x2','u'])
plt.grid()
plt.show()
and the class TensorFlowEvaluator
import casadi
import tensorflow as tf
class TensorFlowEvaluator(casadi.Callback):
def __init__(self,t_in,t_out,session, opts={}):
"""
t_in: list of inputs (tensorflow placeholders)
t_out: list of outputs (tensors dependent on those placeholders)
session: a tensorflow session
"""
casadi.Callback.__init__(self)
assert isinstance(t_in,list)
self.t_in = t_in
assert isinstance(t_out,list)
self.t_out = t_out
self.construct("TensorFlowEvaluator", opts)
self.session = session
self.refs = []
def get_n_in(self): return len(self.t_in)
def get_n_out(self): return len(self.t_out)
def get_sparsity_in(self,i):
return casadi.Sparsity.dense(*self.t_in[i].get_shape().as_list())
def get_sparsity_out(self,i):
return casadi.Sparsity.dense(*self.t_out[i].get_shape().as_list())
def eval(self,arg):
# Associate each tensorflow input with the numerical argument passed by CasADi
d = dict((v,arg[i].toarray()) for i,v in enumerate(self.t_in))
# Evaluate the tensorflow expressions
ret = self.session.run(self.t_out,feed_dict=d)
return ret
# Vanilla tensorflow offers just the reverse mode AD
def has_reverse(self,nadj): return nadj==1
def get_reverse(self,nadj,name,inames,onames,opts):
# Construct tensorflow placeholders for the reverse seeds
adj_seed = [tf.placeholder(shape=self.sparsity_out(i).shape,dtype=tf.float64) for i in range(self.n_out())]
# Construct the reverse tensorflow graph through 'gradients'
grad = tf.gradients(self.t_out, self.t_in,grad_ys=adj_seed)
# Create another TensorFlowEvaluator object
callback = TensorFlowEvaluator(self.t_in+adj_seed,grad,self.session)
# Make sure you keep a reference to it
self.refs.append(callback)
# Package it in the nominal_in+nominal_out+adj_seed form that CasADi expects
nominal_in = self.mx_in()
nominal_out = self.mx_out()
adj_seed = self.mx_out()
return casadi.Function(name,nominal_in+nominal_out+adj_seed,callback.call(nominal_in+adj_seed),inames,onames)
if __name__=="__main__":
from casadi import *
a = tf.placeholder(shape=(2,2),dtype=tf.float64)
b = tf.placeholder(shape=(2,1),dtype=tf.float64)
y = tf.matmul(tf.sin(a), b)
with tf.Session() as session:
f_tf = TensorFlowEvaluator([a,b], [y], session)
a = MX.sym("a",2,2)
b = MX.sym("a",2,1)
y = f_tf(a,b)
yref = mtimes(sin(a),b)
f = Function('f',[a,b],[y])
fref = Function('f',[a,b],[yref])
print(f(DM([[1,2],[3,4]]),DM([[1],[3]])))
print(fref(DM([[1,2],[3,4]]),DM([[1],[3]])))
f = Function('f',[a,b],[jacobian(y,a)])
fref = Function('f',[a,b],[jacobian(yref,a)])
print(f(DM([[1,2],[3,4]]),DM([[1],[3]])))
print(fref(DM([[1,2],[3,4]]),DM([[1],[3]])))
And here is my attempt:
# design network
model = tf.keras.Sequential()
LSTM = tf.keras.layers.LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2]))
model.add(LSTM) #, input_shape=(train_X.shape[1], train_X.shape[2]))
model.add(tf.keras.layers.Dense(1))
model.compile(loss='mae', optimizer='adam')
# fit network
history = model.fit(train_X, train_y, epochs=50, batch_size=72, validation_data=(test_X, test_y), verbose=0, shuffle=False)
with tf.Session() as session:
testXshape = test_X.shape
GPR = GPR(model, session,testXshape)
Thanks!
I've left the TensorFlowEvaluator the same and created the GPR class this way:
class ValFcn(TensorFlowEvaluator):
import tensorflow as tf
def __init__(self, NN, session, opts={}):
self.X = self.tf.placeholder(shape=(1,4), dtype=self.tf.float32)
self.output = NN(self.X)
TensorFlowEvaluator.__init__(self, [self.X], [self.output], session, opts)
def eval(self, arg):
ret = TensorFlowEvaluator.eval(self, arg)
return ret
I was working with float32 so I had to change it there and in the TensorFlowEvaluator.
I'm actually using this model as a cost function term for an OCP.
Hope it works!
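For completeness, a hypothetical usage sketch mirroring the GPR example above (assuming model is the trained Keras network and the rest of the CasADi problem is set up as in the original script):

import tensorflow as tf
from casadi import MX

with tf.Session() as session:
    V = ValFcn(model, session)      # wraps model(X) behind a CasADi callback
    x = MX.sym('x', 1, 4)           # must match the (1, 4) placeholder shape
    cost_term = V(x)                # symbolic expression usable in the NLP objective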
Hi, I'm trying to train a DQN to solve Gym's CartPole problem.
For some reason the loss looks like this (orange line). Can y'all take a look at my code and help with this? I've played around with the hyperparameters a decent bit, so I don't think they're the issue here.
class DQN(nn.Module):
def __init__(self, input_dim, output_dim):
super(DQN, self).__init__()
self.linear1 = nn.Linear(input_dim, 16)
self.linear2 = nn.Linear(16, 32)
self.linear3 = nn.Linear(32, 32)
self.linear4 = nn.Linear(32, output_dim)
def forward(self, x):
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = F.relu(self.linear3(x))
return self.linear4(x)
final_epsilon = 0.05
initial_epsilon = 1
epsilon_decay = 5000
global steps_done
steps_done = 0
def select_action(state):
global steps_done
sample = random.random()
eps_threshold = final_epsilon + (initial_epsilon - final_epsilon) * \
math.exp(-1. * steps_done / epsilon_decay)
if sample > eps_threshold:
with torch.no_grad():
state = torch.Tensor(state)
steps_done += 1
q_calc = model(state)
node_activated = int(torch.argmax(q_calc))
return node_activated
else:
node_activated = random.randint(0,1)
steps_done += 1
return node_activated
class ReplayMemory(object): # Stores [state, reward, action, next_state, done]
def __init__(self, capacity):
self.capacity = capacity
self.memory = [[],[],[],[],[]]
def push(self, data):
"""Saves a transition."""
for idx, point in enumerate(data):
#print("Col {} appended {}".format(idx, point))
self.memory[idx].append(point)
def sample(self, batch_size):
rows = random.sample(range(0, len(self.memory[0])), batch_size)
experiences = [[],[],[],[],[]]
for row in rows:
for col in range(5):
experiences[col].append(self.memory[col][row])
return experiences
def __len__(self):
return len(self.memory[0])
input_dim, output_dim = 4, 2
model = DQN(input_dim, output_dim)
target_net = DQN(input_dim, output_dim)
target_net.load_state_dict(model.state_dict())
target_net.eval()
tau = 2
discount = 0.99
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
memory = ReplayMemory(65536)
BATCH_SIZE = 128
def optimize_model():
if len(memory) < BATCH_SIZE:
return 0
experiences = memory.sample(BATCH_SIZE)
state_batch = torch.Tensor(experiences[0])
action_batch = torch.LongTensor(experiences[1]).unsqueeze(1)
reward_batch = torch.Tensor(experiences[2])
next_state_batch = torch.Tensor(experiences[3])
done_batch = experiences[4]
pred_q = model(state_batch).gather(1, action_batch)
next_state_q_vals = torch.zeros(BATCH_SIZE)
for idx, next_state in enumerate(next_state_batch):
if done_batch[idx] == True:
next_state_q_vals[idx] = -1
else:
# .max in pytorch returns (values, idx), we only want vals
next_state_q_vals[idx] = (target_net(next_state_batch[idx]).max(0)[0]).detach()
better_pred = (reward_batch + next_state_q_vals).unsqueeze(1)
loss = F.smooth_l1_loss(pred_q, better_pred)
optimizer.zero_grad()
loss.backward()
for param in model.parameters():
param.grad.data.clamp_(-1, 1)
optimizer.step()
return loss
points = []
losspoints = []
#save_state = torch.load("models/DQN_target_11.pth")
#model.load_state_dict(save_state['state_dict'])
#optimizer.load_state_dict(save_state['optimizer'])
env = gym.make('CartPole-v0')
for i_episode in range(5000):
observation = env.reset()
episode_loss = 0
if i_episode % tau == 0:
target_net.load_state_dict(model.state_dict())
for t in range(1000):
#env.render()
state = observation
action = select_action(observation)
observation, reward, done, _ = env.step(action)
if done:
next_state = [0,0,0,0]
else:
next_state = observation
memory.push([state, action, reward, next_state, done])
episode_loss = episode_loss + float(optimize_model())
if done:
points.append((i_episode, t+1))
print("Episode {} finished after {} timesteps".format(i_episode, t+1))
print("Avg Loss: ", episode_loss / (t+1))
losspoints.append((i_episode, episode_loss / (t+1)))
if (i_episode % 100 == 0):
eps = final_epsilon + (initial_epsilon - final_epsilon) * \
math.exp(-1. * steps_done / epsilon_decay)
print(eps)
if ((i_episode+1) % 5001 == 0):
save = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
torch.save(save, "models/DQN_target_" + str(i_episode // 5000) + ".pth")
break
env.close()
x = [coord[0] * 100 for coord in points]
y = [coord[1] for coord in points]
x2 = [coord[0] * 100 for coord in losspoints]
y2 = [coord[1] for coord in losspoints]
plt.plot(x, y)
plt.plot(x2, y2)
plt.show()
I basically followed the PyTorch tutorial, except using the state returned by the env rather than the pixels. I also changed the replay memory because I was having issues there. Other than that, I left everything else pretty much the same.
Edit:
I tried overfitting on a small batch, and the loss looks like this without updating the target net and this when updating it.
Edit 2:
This is definitely an issue with the target net; I tried removing it and the loss seemed not to increase exponentially.
Your tau value is too small; such a short target-network update interval makes DQN training unstable. You can try 1000 (OpenAI Baselines' DQN example) or 10000 (DeepMind's Nature paper).
DeepMind's 2015 Nature paper states:
The second modification to online Q-learning aimed at further improving the stability of our method with neural networks is to use a separate network for generating the targets y_j in the Q-learning update. More precisely, every C updates we clone the network Q to obtain a target network Q' and use Q' for generating the Q-learning targets y_j for the following C updates to Q.
This modification makes the algorithm more stable compared to standard online Q-learning, where an update that increases Q(s_t, a_t) often also increases Q(s_{t+1}, a) for all a and hence also increases the target y_j, possibly leading to oscillations or divergence of the policy. Generating the targets using the older set of parameters adds a delay between the time an update to Q is made and the time the update affects the targets y_j, making divergence or oscillations much more unlikely.
— Human-level control through deep reinforcement learning, Mnih et al., 2015
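Concretely, every C updates the target for a sampled transition is computed with the frozen copy: y_j = r_j + gamma * max_a' Q_target(s_{j+1}, a'), and only the online network Q is trained toward it; in the code below, target_net is what supplies that frozen max term used to build better_pred.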
I've run your code with tau=2, tau=10, tau=100, tau=1000 and tau=10000. The update frequency of tau=100 solves the problem (the agent reaches the maximum of 200 steps).
(Plots for tau=2, tau=10, tau=100, tau=1000 and tau=10000 omitted.)
Below is the modified version of your code.
import random
import math
import matplotlib.pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
import gym
class DQN(nn.Module):
def __init__(self, input_dim, output_dim):
super(DQN, self).__init__()
self.linear1 = nn.Linear(input_dim, 16)
self.linear2 = nn.Linear(16, 32)
self.linear3 = nn.Linear(32, 32)
self.linear4 = nn.Linear(32, output_dim)
def forward(self, x):
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = F.relu(self.linear3(x))
return self.linear4(x)
final_epsilon = 0.05
initial_epsilon = 1
epsilon_decay = 5000
global steps_done
steps_done = 0
def select_action(state):
global steps_done
sample = random.random()
eps_threshold = final_epsilon + (initial_epsilon - final_epsilon) * \
math.exp(-1. * steps_done / epsilon_decay)
if sample > eps_threshold:
with torch.no_grad():
state = torch.Tensor(state)
steps_done += 1
q_calc = model(state)
node_activated = int(torch.argmax(q_calc))
return node_activated
else:
node_activated = random.randint(0,1)
steps_done += 1
return node_activated
class ReplayMemory(object): # Stores [state, reward, action, next_state, done]
def __init__(self, capacity):
self.capacity = capacity
self.memory = [[],[],[],[],[]]
def push(self, data):
"""Saves a transition."""
for idx, point in enumerate(data):
#print("Col {} appended {}".format(idx, point))
self.memory[idx].append(point)
def sample(self, batch_size):
rows = random.sample(range(0, len(self.memory[0])), batch_size)
experiences = [[],[],[],[],[]]
for row in rows:
for col in range(5):
experiences[col].append(self.memory[col][row])
return experiences
def __len__(self):
return len(self.memory[0])
input_dim, output_dim = 4, 2
model = DQN(input_dim, output_dim)
target_net = DQN(input_dim, output_dim)
target_net.load_state_dict(model.state_dict())
target_net.eval()
tau = 100
discount = 0.99
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
memory = ReplayMemory(65536)
BATCH_SIZE = 128
def optimize_model():
if len(memory) < BATCH_SIZE:
return 0
experiences = memory.sample(BATCH_SIZE)
state_batch = torch.Tensor(experiences[0])
action_batch = torch.LongTensor(experiences[1]).unsqueeze(1)
reward_batch = torch.Tensor(experiences[2])
next_state_batch = torch.Tensor(experiences[3])
done_batch = experiences[4]
pred_q = model(state_batch).gather(1, action_batch)
next_state_q_vals = torch.zeros(BATCH_SIZE)
for idx, next_state in enumerate(next_state_batch):
if done_batch[idx] == True:
next_state_q_vals[idx] = -1
else:
# .max in pytorch returns (values, idx), we only want vals
next_state_q_vals[idx] = (target_net(next_state_batch[idx]).max(0)[0]).detach()
better_pred = (reward_batch + next_state_q_vals).unsqueeze(1)
loss = F.smooth_l1_loss(pred_q, better_pred)
optimizer.zero_grad()
loss.backward()
for param in model.parameters():
param.grad.data.clamp_(-1, 1)
optimizer.step()
return loss
points = []
losspoints = []
#save_state = torch.load("models/DQN_target_11.pth")
#model.load_state_dict(save_state['state_dict'])
#optimizer.load_state_dict(save_state['optimizer'])
env = gym.make('CartPole-v0')
for i_episode in range(5000):
observation = env.reset()
episode_loss = 0
if i_episode % tau == 0:
target_net.load_state_dict(model.state_dict())
for t in range(1000):
#env.render()
state = observation
action = select_action(observation)
observation, reward, done, _ = env.step(action)
if done:
next_state = [0,0,0,0]
else:
next_state = observation
memory.push([state, action, reward, next_state, done])
episode_loss = episode_loss + float(optimize_model())
if done:
points.append((i_episode, t+1))
print("Episode {} finished after {} timesteps".format(i_episode, t+1))
print("Avg Loss: ", episode_loss / (t+1))
losspoints.append((i_episode, episode_loss / (t+1)))
if (i_episode % 100 == 0):
eps = final_epsilon + (initial_epsilon - final_epsilon) * \
math.exp(-1. * steps_done / epsilon_decay)
print(eps)
if ((i_episode+1) % 5001 == 0):
save = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
torch.save(save, "models/DQN_target_" + str(i_episode // 5000) + ".pth")
break
env.close()
x = [coord[0] * 100 for coord in points]
y = [coord[1] for coord in points]
x2 = [coord[0] * 100 for coord in losspoints]
y2 = [coord[1] for coord in losspoints]
plt.plot(x, y)
plt.plot(x2, y2)
plt.show()
Here's the result of your plotting code (plots for tau=100 and tau=10000 omitted).
I've posted the following to the PyTorch discussion board too. I'm trying to keep the per-parameter learning rates for the already existing parameters when adding more neurons to a network (to existing layers, not new layers). I've written the following class, which allows me to add neurons to hidden layers during training:
import torch
import torch.nn as nn
class DQN(nn.Module):
def __init__(self, num_inputs, hidden, num_actions, non_linearity):
super(DQN, self).__init__()
self.num_inputs = num_inputs
self.hidden = hidden
self.num_actions = num_actions
self.non_linearity = non_linearity
self.layers = nn.ModuleList()
self.layers.append(nn.Linear(num_inputs, self.hidden[0]))
previous = self.hidden[0]
for hidden_layer_size in self.hidden[1:]:
self.layers.append(nn.Linear(previous, hidden_layer_size))
previous = hidden_layer_size
self.layers.append(nn.Linear(previous, num_actions))
def forward(self, x):
for i in range(len(self.layers) - 1):
x = self.non_linearity(self.layers[i](x))
return self.layers[-1](x)
def increase_capacity(self, increment):
for i in range(len(self.hidden)):
self.hidden[i] += increment[i]
bias = self.layers[0].bias.data
weight = self.layers[0].weight.data
self.layers[0] = nn.Linear(self.num_inputs, self.hidden[0])
if increment[0]>0:
self.layers[0].weight.data[0:-increment[0],:] = weight
self.layers[0].bias.data[0:-increment[0]] = bias
else:
self.layers[0].weight.data[0:,:] = weight
self.layers[0].bias.data = bias
for i in range(1, len(self.layers) - 1):
bias = self.layers[i].bias.data
weight = self.layers[i].weight.data
self.layers[i] = nn.Linear(self.hidden[i-1], self.hidden[i])
if increment[i] > 0:
if increment[i-1] >0:
self.layers[i].bias.data[0:-increment[i]] = bias
self.layers[i].weight.data[0:-increment[i],0:-increment[i-1]] = weight
else:
self.layers[i].bias.data[0:-increment[i]] = bias
self.layers[i].weight.data[0:-increment[i],0:] = weight
else:
if increment[i-1] >0:
self.layers[i].bias.data = bias
self.layers[i].weight.data[0:,0:-increment[i-1]] = weight
else:
self.layers[i].bias.data = bias
self.layers[i].weight.data[0:,0:] = weight
bias = self.layers[-1].bias.data
weight = self.layers[-1].weight.data
self.layers[-1] = nn.Linear(self.hidden[-1], self.num_actions)
if increment[-1] >0:
self.layers[-1].bias.data = bias
self.layers[-1].weight.data[:,0:-increment[-1]] = weight
else:
self.layers[-1].bias.data = bias
self.layers[-1].weight.data[:,0:] = weight
def act(self, state, epsilon, mask):
if np.random.rand() > epsilon:
state = torch.tensor([state], dtype=torch.float32, device=device)
mask = torch.tensor([mask], dtype=torch.float32, device=device)
q_values = self.forward(state) + mask
action = q_values.max(1)[1].view(1, 1).item()
else:
action = np.random.randint(self.num_actions)
return action
Now I've written a little sanity check (whether it leads to sanity is questionable at this point): a network with 2 hidden layers of 1 neuron each should fail to learn the XOR function, whereas a network where 4 neurons have been added to each layer should succeed. If I initialise a new optimiser this indeed works. The optimiser I use is Adam, which keeps track of learning rates per parameter. I'd like to keep Adam's per-parameter learning rates for the weights and biases that already existed before I add additional neurons. The following is my failed attempt at doing so:
import random
import copy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
# Credits to Alvations
def generate_zero():
return random.uniform(0, 49) / 100
def generate_one():
return random.uniform(50, 100) / 100
def generate_xor_XY(num_data_points):
Xs, Ys = [], []
for _ in range(num_data_points):
# xor(0, 0) -> 0
Xs.append([generate_zero(), generate_zero()]); Ys.append([0])
# xor(1, 0) -> 1
Xs.append([generate_one(), generate_zero()]); Ys.append([1])
# xor(0, 1) -> 1
Xs.append([generate_zero(), generate_one()]); Ys.append([1])
# xor(1, 1) -> 0
Xs.append([generate_one(), generate_one()]); Ys.append([0])
return Xs, Ys
# Initialisation
network = DQN(2,[1,1],1,F.relu)
# optimizer = optim.Adam(network.parameters(), amsgrad=False)
optimizer = optim.Adam(network.parameters(), amsgrad=True)
criterion = nn.MSELoss()
# Train 50000 steps to show 1 neuron cannot solve x-or task
for i in range(50000):
optimizer.zero_grad()
Xs, Ys = generate_xor_XY(1)
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys, dtype=torch.float)
prediction = network(Xs)
loss = criterion(prediction, Ys)
loss.backward()
optimizer.step()
print(network(torch.tensor([[1,0],[0,1],[1,1],[0,0]], dtype=torch.float)))
print(loss)
# Add 4 neurons to each hidden layer
capacity = [4,4]
network.increase_capacity(capacity)
# Uncomment the following line and comment the lines following it for normal initialisation.
# optimizer = optim.Adam(network.parameters(), amsgrad=True)
nw_param = [p for p in network.parameters()]
new_param_group = []
layer_idx = 0
for idx, group in enumerate(optimizer.param_groups):
for idx_p, p in enumerate(group['params']):
# Save previous information
prev_grad = p.grad
old_p = copy.deepcopy(p)
old_state = copy.copy(optimizer.state[p])
old_step = old_state['step']
old_exp_avg = old_state['exp_avg']
old_exp_avg_sq = old_state['exp_avg_sq']
old_max_exp_avg_sq = old_state['max_exp_avg_sq']
# Remove old parameter from state
optimizer.state.pop(p)
# Weights
if p.dim()>1:
p = nn.Parameter(nw_param[layer_idx])
p.grad = torch.zeros_like(p)
new_exp_avg = torch.torch.zeros_like(p)
new_exp_avg_sq = torch.torch.zeros_like(p)
new_max_exp_avg_sq = torch.torch.zeros_like(p)
p.grad[0:prev_grad.size(0),0:prev_grad.size(1)] = prev_grad
optimizer.state[p]['step'] = old_step
optimizer.state[p]['exp_avg'] = new_exp_avg
optimizer.state[p]['exp_avg'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_exp_avg
optimizer.state[p]['exp_avg_sq'] = new_exp_avg_sq
optimizer.state[p]['exp_avg_sq'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'] = new_max_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_max_exp_avg_sq
new_param_group.append(p)
# Biases
else:
p = nn.Parameter(nw_param[layer_idx])
p.grad = torch.zeros_like(p)
new_exp_avg = torch.zeros_like(p)
new_exp_avg_sq = torch.zeros_like(p)
new_max_exp_avg_sq = torch.zeros_like(p)
p.grad[0:prev_grad.size(0)] = prev_grad
optimizer.state[p]['step'] = old_step
optimizer.state[p]['exp_avg'] = new_exp_avg
optimizer.state[p]['exp_avg'][0:prev_grad.size(0)] = old_exp_avg
optimizer.state[p]['exp_avg_sq'] = new_exp_avg_sq
optimizer.state[p]['exp_avg_sq'][0:prev_grad.size(0)] = old_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'] = new_max_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'][0:prev_grad.size(0)] = old_max_exp_avg_sq
new_param_group.append(p)
layer_idx += 1
optimizer.param_groups[0]['params'] = new_param_group
print(network)
# Train 50000 steps to show by adding neurons the task can be solved
for i in range(50000):
optimizer.zero_grad()
Xs, Ys = generate_xor_XY(1)
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys, dtype=torch.float)
prediction = network(Xs)
loss = criterion(prediction, Ys)
loss.backward()
optimizer.step()
print(network(torch.tensor([[1,0],[0,1],[1,1],[0,0]], dtype=torch.float)))
print(loss)
I’m trying to get the same optimizer state, but with additional parameters for the added neurons. This seems like a convoluted way of doing it (and it doesn’t work:p). Does anyone know of an (easier) way to do this or see where I’m going wrong?
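Not a fix for the state surgery above, but a possibly simpler route (a hypothetical sketch, untested against your increase_capacity logic): keep the existing nn.Linear blocks untouched, put the extra neurons in a separate block that is concatenated in forward, and register only the new block with optimizer.add_param_group. That way Adam never loses the per-parameter state of the original weights; the sketch only grows a layer's outputs, so widening the next layer's inputs would still need the same treatment.

import torch
import torch.nn as nn
import torch.optim as optim

class GrowableLinear(nn.Module):
    """Linear layer whose output width can grow without touching the old weights."""
    def __init__(self, in_features, out_features):
        super(GrowableLinear, self).__init__()
        self.base = nn.Linear(in_features, out_features)
        self.extra = None

    def grow(self, extra_out):
        # the new neurons live in their own block; return their parameters
        self.extra = nn.Linear(self.base.in_features, extra_out)
        return list(self.extra.parameters())

    def forward(self, x):
        out = self.base(x)
        if self.extra is not None:
            out = torch.cat([out, self.extra(x)], dim=-1)
        return out

layer = GrowableLinear(2, 1)
optimizer = optim.Adam(layer.parameters(), amsgrad=True)
# ... train for a while, Adam accumulates state for layer.base ...
new_params = layer.grow(4)                          # add 4 output neurons
optimizer.add_param_group({'params': new_params})   # old Adam state stays intact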
I am trying to train a TurtleBot simulation using a DQN. The TurtleBot is supposed to find a target in a maze. It is fairly simple and it is converging. My problem is that after a couple of runs the training gets extremely slow: it is fast at the beginning but becomes very slow after 50-ish runs. I have checked the problem: my CPU is not even 50% used, but my memory is eaten up, with about 98% of it occupied. Somewhere in my code I am leaking memory, and I think it is in the initialization of my DQN agent. Can you please guide me on what the problem is and how I can fix it?
Thanks a lot.
Here is the training code, which is based on a DQN with a priority buffer:
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import gym_gazebo
import tensorflow as tf
import numpy as np
import time
import random
from random import *
import cv2
from gym import wrappers
from skimage import transform
import datetime
import liveplot
from dqn_agent_withTarget import DQNAgent
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
#--------------------------------------------------------------------------------------------------------------------------------------
def render():
render_skip = 0 #Skip first X episodes.
render_interval = 50 #Show render Every Y episodes.
render_episodes = 10 #Show Z episodes every rendering.
if (x%render_interval == 0) and (x != 0) and (x > render_skip):
env.render()
elif ((x-render_episodes)%render_interval == 0) and (x != 0) and (x > render_skip) and (render_episodes < x):
env.render(close=True)
#--------------------------------------------------------------------------------------------------------------------------------------
if __name__ == '__main__':
#------------------------------------------------------------------------
env = gym.make('GazeboCircuit2TurtlebotLidar-v0')
outdir = '/tmp/gazebo_gym_experiments'
env = gym.wrappers.Monitor(env, outdir, force=True)
plotter = liveplot.LivePlot(outdir)
last_time_steps = np.ndarray(0)
start_time = time.time()
total_episodes = 1000
max_steps = 200
highest_reward = 0
gamma = 0.95
num_actions = 3
action_space = [0,1,2]
tf.reset_default_graph() # Reset training graph
myinit = tf.global_variables_initializer()# Initialize training network
#tf.logging.set_verbosity(tf.logging.INFO)
tf.logging.set_verbosity(tf.logging.ERROR)
#------------------------------------------------------------------------
agent = DQNAgent(action_space,"GazeboCircuit2TurtlebotLidar-v0")
agent.exploration = 1
cv2.namedWindow("window", 1)
x_val = np.random.rand(4096,256).astype(np.float32)
agent.W_fc1.load(x_val, session=agent.sess)
for e in range(total_episodes):
# reset
linecount = 0
terminal= False
win = 0
frame = 0
loss = 0.0
Q_max = 0.0
steps = 0
reward_t= 0.0
env.reset()
cumulated_rewards = 0
agent.exploration *= 0.9
if agent.exploration<0.1:
agent.exploration=0.1
_, reward, terminal, info = env.step(0)
linecount += 1
print( "Time %s, %s" %(linecount,datetime.datetime.now()))
img_tmp = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
linecount += 1
print( "Time %s, %s" %(linecount,datetime.datetime.now()))
state_t_1 = tf.image.convert_image_dtype(img_tmp, dtype=tf.float32)
state_t_1 = tf.reshape(state_t_1,(-1,32,32,4))
while (not terminal):
steps += 1
state_t = state_t_1
# execute action in environment
action_t = agent.select_action(state_t, agent.exploration)
_, reward_t, terminal, info = env.step(action_t)
#print("step: ", steps, "action: ",action_t ,"reward: ", reward_t)
print(action_t , end="")
img_tmp = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
state_t_1 = tf.image.convert_image_dtype(img_tmp, dtype=tf.float32)
state_t_1 = tf.reshape(state_t_1,(-1,32,32,4))
# store experience
agent.store_experience(state_t, action_t, reward_t, state_t_1, terminal)
# experience replay
agent.experience_replay()
#print(agent.sess.run(agent.W_fc1))
# for log
frame += 1
loss += agent.current_loss
Q_max += np.max(agent.Q_values(state_t))
cumulated_rewards += reward_t
print(" ")
print("episodes:",e," steps:",steps," loss:",'{0:.2f}'.format(loss/(steps+1)), " terminal:",terminal, " exploration_factor:",agent.exploration , " reward:", '{0:.2f}'.format(cumulated_rewards))
plotter.plot(env)
#print("EPOCH: {:03d}/{:03d} | WIN: {:03d} | LOSS: {:.4f} | Q_MAX: {:.4f}".format(
# e, total_episodes - 1, cumulated_rewards, loss / frame, Q_max / frame))
env._flush(force=True)
# save model
weights=agent.sess.run(agent.W_fc1)
print(weights)
weights_tmp = cv2.resize(weights, (256,256), interpolation=cv2.INTER_NEAREST)
weights_image = tf.image.convert_image_dtype(weights_tmp, dtype=tf.float32)
cv2.imshow("window",agent.sess.run(weights_image))
cv2.waitKey(1)
# save model
agent.save_model()
env.close()
And here is the DQN agent code (I think the problem is in the initializer of the DQN agent):
from collections import deque
import os
import numpy as np
import tensorflow as tf
class DQNAgent:
"""
Multi Layer Perceptron with Experience Replay
"""
def __init__(self, enable_actions, environment_name):
# parameters
self.name = os.path.splitext(os.path.basename(__file__))[0]
self.environment_name = environment_name
self.enable_actions = enable_actions
self.n_actions = len(self.enable_actions)
self.minibatch_size = 64
self.replay_memory_size = 1000
self.learning_rate = 0.001
self.discount_factor = 0.9
self.exploration = 1.0
self.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
self.model_name = "{}.ckpt".format(self.environment_name)
# replay memory
self.D = deque(maxlen=self.replay_memory_size)
# model
self.init_model()
# variables
self.current_loss = 0.0
def init_model(self):
#policy##################################################################################
# input layer (32 x 32 x 4)
self.x = tf.placeholder(tf.float32, [None, 32, 32,4])
# convolution layer
self.W_cv1 = tf.Variable(tf.truncated_normal([5, 5, 4, 4], stddev=0.01))#4filters
self.b_cv1 = tf.Variable(tf.zeros([4]))
self.c_cv1 = tf.nn.conv2d(self.x, self.W_cv1, strides=[1, 1, 1, 1], padding='SAME')
self.h_cv1 = tf.nn.relu(self.c_cv1 + self.b_cv1)
# flatten (4096)
self.x_flat = tf.reshape(self.h_cv1, [-1,4096])
# fully connected layer [1,256]
self.W_fc1 = tf.Variable(tf.truncated_normal([4096, 256], stddev=0.01))
self.b_fc1 = tf.Variable(tf.zeros([256]))
self.h_fc1 = tf.nn.relu(tf.matmul(self.x_flat, self.W_fc1) + self.b_fc1)
# fully connected layer [1,32]
self.W_fc2 = tf.Variable(tf.truncated_normal([256,32], stddev=0.01))
self.b_fc2 = tf.Variable(tf.zeros([32]))
self.h_fc2 = tf.nn.relu(tf.matmul(self.h_fc1, self.W_fc2) + self.b_fc2)
# output layer (n_actions)
self.W_out = tf.Variable(tf.truncated_normal([32, self.n_actions], stddev=0.01))
self.b_out = tf.Variable(tf.zeros([self.n_actions]))
self.y = tf.matmul(self.h_fc2, self.W_out) + self.b_out
# loss function
self.y_ = tf.placeholder(tf.float32, [None, self.n_actions])
self.loss = tf.reduce_mean(tf.square(self.y_ - self.y))
# train operation
optimizer = tf.train.AdamOptimizer(self.learning_rate) #changed from RMS to Adam
self.training = optimizer.minimize(self.loss)
#target######################################################################################
# input layer (32 x 32 x 4)
self.x_t = tf.placeholder(tf.float32, [None, 32, 32,4])
# convolution layer
self.W_cv1_t = tf.Variable(tf.truncated_normal([5, 5, 4, 4], stddev=0.01))#4filters
self.b_cv1_t = tf.Variable(tf.zeros([4]))
self.c_cv1_t = tf.nn.conv2d(self.x, self.W_cv1, strides=[1, 1, 1, 1], padding='SAME')
self.h_cv1_t = tf.nn.relu(self.c_cv1 + self.b_cv1)
# flatten (4096)
self.x_flat_t = tf.reshape(self.h_cv1, [-1,4096])
# fully connected layer [1,256]
self.W_fc1_t = tf.Variable(tf.truncated_normal([4096, 256], stddev=0.01))
self.b_fc1_t = tf.Variable(tf.zeros([256]))
self.h_fc1_t = tf.nn.relu(tf.matmul(self.x_flat, self.W_fc1) + self.b_fc1)
# fully connected layer [1,32]
self.W_fc2_t = tf.Variable(tf.truncated_normal([256,32], stddev=0.01))
self.b_fc2_t = tf.Variable(tf.zeros([32]))
self.h_fc2_t = tf.nn.relu(tf.matmul(self.h_fc1, self.W_fc2) + self.b_fc2)
# output layer (n_actions)
self.W_out_t = tf.Variable(tf.truncated_normal([32, self.n_actions], stddev=0.01))
self.b_out_t = tf.Variable(tf.zeros([self.n_actions]))
self.y_t = tf.matmul(self.h_fc2, self.W_out) + self.b_out
# loss function
self.y__t = tf.placeholder(tf.float32, [None, self.n_actions])
self.loss_t = tf.reduce_mean(tf.square(self.y_ - self.y))
# train operation
optimizer_t = tf.train.AdamOptimizer(self.learning_rate) #changed from RMS to Adam
self.training_t = optimizer.minimize(self.loss)
#general################################################################################
# saver
self.saver = tf.train.Saver()
# session
self.sess = tf.Session()
self.sess.run(tf.global_variables_initializer())
def Q_values(self, state):
# Q(state, action) of all actions
#print("QQQ VALUES______________________________________________",self.sess.run(state))
x_tmp = self.sess.run(state)
return self.sess.run(self.y, feed_dict={self.x: x_tmp})#[0]
def select_action(self, state, epsilon):
if np.random.rand() <= epsilon:
# random
return np.random.choice(self.enable_actions)
else:
# max_action Q(state, action)
#print("G" , end="")
return self.enable_actions[np.argmax(self.Q_values(state))]
def store_experience(self, state, action, reward, state_1, terminal):
self.D.append((state, action, reward, state_1, terminal))
def experience_replay(self):
state_minibatch = []
y_minibatch = []
# sample random minibatch
minibatch_size = min(len(self.D), self.minibatch_size)
minibatch_indexes = np.random.randint(0, len(self.D), minibatch_size)
for j in minibatch_indexes:
state_j, action_j, reward_j, state_j_1, terminal = self.D[j]
action_j_index = self.enable_actions.index(action_j)
y_j = self.Q_values(state_j)[0]
if terminal:
y_j[action_j_index] = reward_j
else:
# reward_j + gamma * max_action' Q(state', action')
y_j[action_j_index] = reward_j + self.discount_factor * np.max(self.Q_values(state_j_1)) # NOQA
x_tmp = self.sess.run(state_j)
y_j=np.reshape(y_j,(1,3))
state_minibatch.append(x_tmp[0])
y_minibatch.append(y_j[0])
# training
self.sess.run(self.training, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})
# for log
self.current_loss = self.sess.run(self.loss, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})
def load_model(self, model_path=None):
if model_path:
# load from model_path
self.saver.restore(self.sess, model_path)
else:
# load from checkpoint
checkpoint = tf.train.get_checkpoint_state(self.model_dir)
if checkpoint and checkpoint.model_checkpoint_path:
self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
def save_model(self):
self.saver.save(self.sess, os.path.join(self.model_dir, self.model_name))
Thanks for your help.
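For anyone debugging the same symptom, one thing worth checking (only a guess from reading the code, not something I can confirm): tf.image.convert_image_dtype and tf.reshape are called inside the step loop, so new ops are added to the TensorFlow graph on every step, and Q_values then runs sess.run on those ever-growing tensors. A minimal sketch of the usual way around this is to preprocess with numpy/cv2 only and feed plain arrays into the existing placeholder:

import numpy as np
import cv2

def preprocess(info):
    """Turn the (H, W, 4) observation into a (1, 32, 32, 4) float32 array with numpy only,
    so no new TensorFlow ops are created inside the step loop."""
    img = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
    img = img.astype(np.float32) / 255.0   # same scaling convert_image_dtype applies to uint8 input
    return img.reshape(-1, 32, 32, 4)

# Q_values / experience_replay would then feed the array directly, e.g.
#   self.sess.run(self.y, feed_dict={self.x: state})
# instead of calling self.sess.run(state) first.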