I'm trying to use a one_vs_one composition of decision trees for multiclass classification. The problem is that when I pass different object weights to the classifier, the result stays the same.
Do I misunderstand something with weights, or do they just work incorrectly?
Thanks for your replies!
Here is my code:
from math import exp, log

class AdaLearner(object):
def __init__(self, in_base_type, in_multi_type):
self.base_type = in_base_type
self.multi_type = in_multi_type
def train(self, in_features, in_labels):
model = AdaBoost(self.base_type, self.multi_type)
model.learn(in_features, in_labels)
return model
class AdaBoost(object):
CLASSIFIERS_NUM = 100
def __init__(self, in_base_type, in_multi_type):
self.base_type = in_base_type
self.multi_type = in_multi_type
self.classifiers = []
self.weights = []
def learn(self, in_features, in_labels):
labels_number = len(set(in_labels))
self.weights = self.get_initial_weights(in_labels)
for iteration in xrange(AdaBoost.CLASSIFIERS_NUM):
classifier = self.multi_type(self.base_type())
self.classifiers.append(classifier.train(in_features,
in_labels,
weights=self.weights))
answers = []
for obj in in_features:
answers.append(self.classifiers[-1].apply(obj))
err = self.compute_weighted_error(in_labels, answers)
print err
if abs(err - 0.) < 1e-6:
break
alpha = 0.5 * log((1 - err)/err)
self.update_weights(in_labels, answers, alpha)
self.normalize_weights()
def apply(self, in_features):
answers = {}
for classifier in self.classifiers:
answer = classifier.apply(in_features)
if answer in answers:
answers[answer] += 1
else:
answers[answer] = 1
ranked_answers = sorted(answers.iteritems(),
key=lambda (k,v): (v,k),
reverse=True)
return ranked_answers[0][0]
def compute_weighted_error(self, in_labels, in_answers):
error = 0.
w_sum = sum(self.weights)
for ind in xrange(len(in_labels)):
error += (in_answers[ind] != in_labels[ind]) * self.weights[ind] / w_sum
return error
def update_weights(self, in_labels, in_answers, in_alpha):
for ind in xrange(len(in_labels)):
self.weights[ind] *= exp(in_alpha * (in_answers[ind] != in_labels[ind]))
def normalize_weights(self):
w_sum = sum(self.weights)
for ind in xrange(len(self.weights)):
self.weights[ind] /= w_sum
def get_initial_weights(self, in_labels):
weight = 1 / float(len(in_labels))
result = []
for i in xrange(len(in_labels)):
result.append(weight)
return result
As you can see, it is just a simple AdaBoost (I instantiated it with in_base_type = tree_learner, in_multi_type = one_against_one) and it worked the same way no matter how many base classifiers were engaged. It just acted as one multiclass decision tree.
Then I made a hack: on each iteration I chose a random sample of objects with probability proportional to their weights and trained the classifiers on that subset without any weights. And that worked as it was supposed to.
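For reference, that kind of weighted resampling (drawing a bootstrap sample with probability proportional to the current boosting weights) is a common workaround when the base learner ignores sample weights. A minimal sketch with numpy, with an illustrative function name:

import numpy as np

def weighted_resample(in_features, in_labels, weights):
    # Draw a bootstrap sample: each object is picked with probability
    # proportional to its current boosting weight.
    probs = np.asarray(weights, dtype=float)
    probs /= probs.sum()
    idx = np.random.choice(len(in_labels), size=len(in_labels), replace=True, p=probs)
    return [in_features[i] for i in idx], [in_labels[i] for i in idx]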
The default tree criterion, namely information gain, does not take the weights into account. If you know of a formula which would do it, I'll implement it.
In the meantime, using neg_z1_loss will handle the weights correctly. By the way, there was a slight bug in that implementation, so you will need to use the most recent GitHub master.
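For reference, the usual way to make entropy-based criteria weight-aware is to replace sample counts with sums of sample weights. A rough sketch of a weighted information gain (my own illustration, not the library's implementation; left_mask is a boolean array describing one side of a candidate split):

import numpy as np

def weighted_entropy(labels, weights):
    # H_w(S) = -sum_c (W_c / W) * log2(W_c / W), with W_c the total weight of class c
    labels = np.asarray(labels)
    weights = np.asarray(weights, dtype=float)
    total = weights.sum()
    entropy = 0.0
    for c in np.unique(labels):
        p = weights[labels == c].sum() / total
        if p > 0:
            entropy -= p * np.log2(p)
    return entropy

def weighted_information_gain(labels, weights, left_mask):
    # Gain = H_w(parent) - sum over the two children of (W_child / W_parent) * H_w(child)
    labels = np.asarray(labels)
    weights = np.asarray(weights, dtype=float)
    gain = weighted_entropy(labels, weights)
    for mask in (left_mask, ~left_mask):
        w_child = weights[mask].sum()
        if w_child > 0:
            gain -= (w_child / weights.sum()) * weighted_entropy(labels[mask], weights[mask])
    return gain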
I am developing a policy gradient NN with PyTorch (version 1.10.1) and I am getting the following runtime error:
RuntimeError: one of the variables needed for gradient computation has been modified by an in-place operation: [torch.FloatTensor [1, 15]] is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I have read some similar discussions where people suggest avoiding in-place operations like a += 1. Some discussions suggest downgrading PyTorch, and others suggest using clone() instead of modifying a tensor in place. I have tried them all, but I still get this error.
The error does not show up every time in the update() function; sometimes it works well. Why does this happen?
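For context, the version number in the error message is an internal counter that every tensor carries: any in-place operation increments it, and autograd compares it with the value recorded when the tensor was saved for the backward pass. A tiny standalone illustration (the _version attribute is internal and used here only for demonstration):

import torch

t = torch.zeros(1, 15)
print(t._version)   # 0
t += 1              # in-place: increments this tensor's version counter
print(t._version)   # 1
t = t + 1           # out-of-place: 't' now refers to a fresh tensor
print(t._version)   # 0 again

If a tensor that some pending backward pass still needs gets its counter bumped in between, you get exactly this RuntimeError.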
My related code is as follows. There are some odd variable names such as previous_R; I used them to avoid in-place operations such as a += 1.
NN class:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

class NN(nn.Module):
"""
Feel free to change the architecture for different tasks!
"""
def __init__(self, env):
super(NN, self).__init__()
# 15 in this case
self.state_size = 15
# 31 (1 and -1 for each task-server and void) (m*n*2 + 1)
self.action_size = 31
self.linear1 = nn.Linear(self.state_size, 128)
self.linear2 = nn.Linear(128, 256)
self.linear3 = nn.Linear(256, self.action_size)
def forward(self, state):
output1 = F.relu(self.linear1(state))
output2 = F.relu(self.linear2(output1.clone()))
output3 = self.linear3(output2.clone())
# Note the conversion to Pytorch distribution.
distribution = Categorical(F.softmax(output3.clone(), dim=-1))
return distribution
Reinforcement Learning part related code:
class Agent():
def __init__(self, env, lr, gamma):
self.env = env
self.NN = NN(env)
self.lr = lr
self.optim_NN = optim.Adam(self.NN.parameters(), lr = self.lr)
self.gamma = gamma
def update(self, log_probs,returns):
with torch.autograd.set_detect_anomaly(True):
print("updating")
baselines = self.compute_baselines(returns.clone())
loss = self.compute_loss(log_probs.clone(), returns, baselines)
self.optim_NN.zero_grad()
loss.backward()
self.optim_NN.step()
def compute_returns(self,rewards):
R = 0
returns = []
for r in rewards[::-1]:
pre_R = R
R = r + self.gamma*pre_R
returns.insert(0,R)
returns = torch.tensor(returns)
return returns
def compute_baselines(self,returns):
baselines = []
baselines.append(returns[0])
for v in returns:
t = len(baselines)
b = (baselines[t-1]*t + v)/(t+1)
baselines.append(b)
return baselines
def compute_loss(self, log_probs,returns, baselines):
with torch.autograd.set_detect_anomaly(True):
loss = 0
for i in range(0,len(returns)):
l = log_probs[i].clone()
r = returns[i].clone()
b = baselines[i].clone()
pre_loss = loss
loss =pre_loss + (-l*(r-b))
# losses.append(loss)
# losses.append(-log_probs[i].clone()*(returns[i].clone()-baselines[i].clone()))
policy_loss = loss
return policy_loss
I am trying to code my own DQN in Python, using PyTorch, and I am trying it out on the CartPole environment.
Although the Q-loss converged, the model performed poorly.
A replay buffer with a size of 2000 was used, and double networks were used as well: I updated the target network every time the q_net network had been updated 100 times. (ps: q_net is the network used to decide which action to choose.)
I tried different network architectures and different combinations of hyper-parameters, but the model still performed poorly; it just kept swaying and never kept itself balanced.
I appreciate your help, sincerely!
Here is the figure of the Q-loss result: [Q-loss figure, plus a closer look]
Here is the code for action taking.
def take_action(self,state):
if self.learn_count<=10:
self.eposilon = 0.1
else:
self.eposilon = 0.9
decision = np.random.choice([0,1],p = [1-self.eposilon,self.eposilon])
if decision == 1:
#final_decision = self.q_net(state.cuda().detach()).argmax()
final_decision = self.q_net(torch.Tensor(state).to(self.device))
final_decision = torch.max(final_decision, 0)[1].data.numpy()
else:
final_decision = np.random.choice([i for i in range(self.action_space)])
return final_decision.item()
Here is the code for training the network.
def update_nn(self):
if self.learn_count%100 ==0:
self.synchronous_NN()
for i in range(self.num_epoches):
self.learn_count = self.learn_count+1
self.training_nn()
return None
def training_nn(self):
index = random.sample(range(self.replay_buffer.shape[0]),self.minibatch_size)
chosen_sample = self.replay_buffer[index,:]
last_state = copy.deepcopy(chosen_sample[np.isnan(chosen_sample[:,-1:]).squeeze(),:])
not_last_state = copy.deepcopy(chosen_sample[~np.isnan(chosen_sample[:,-1:]).squeeze(),:])
input_not_last_state = torch.FloatTensor(not_last_state[:,:4]).to(self.device)
action_index = torch.LongTensor(not_last_state[:,4].reshape(-1,1)).to(self.device)
action_value = self.q_net(input_not_last_state).gather(1,action_index)
max_action_value = not_last_state[:,5]+self.gamma*self.fixed_q_net(input_not_last_state).detach().max(1).values.numpy()
last_state = np.nan_to_num(last_state)
input_last_state = torch.FloatTensor(last_state[:,:4]).to(self.device)
last_action_index = torch.LongTensor(last_state[:,4].reshape(-1,1)).to(self.device)
last_action_value = self.q_net(input_last_state).gather(1,last_action_index)
last_max_action_value = last_state[:,5]
X = torch.cat([action_value,last_action_value])
y = torch.FloatTensor(np.hstack([max_action_value,last_max_action_value]).reshape(-1,1)).detach()
loss = self.loss(X, y)
self.optimizer.zero_grad() # reset the gradient to zero
loss.backward()
self.optimizer.step() # execute back propagation for one step
self.loss_curve.append(loss)
return None
Here is the playing part:
def start_to_play(self):
agent = Agent()
agent.initial_Q_network(2)
agent.initial_replay_buffer()
self.env = gym.make('CartPole-v0')
self.env = self.env.unwrapped
for i in range(self.episode):
if i%50 ==1:
agent.save_model(i)
step = 0
state = self.env.reset()
ep_r = 0
while(True):
action = agent.take_action(state)
observation, reward, done, info = self.env.step(action)
self.env.render()
#next_state = self.capture_state()
next_state = observation
x, x_dot, theta, theta_dot = observation
r1 = (self.env.x_threshold - abs(x)) / self.env.x_threshold - 0.8
r2 = (self.env.theta_threshold_radians - abs(theta)) / self.env.theta_threshold_radians - 0.5
reward = r1 + r2
ep_r = reward+ep_r
if done:
reward = reward-20
state1_np = np.array(state)
state2_np = np.array([np.nan,np.nan,np.nan,np.nan])
agent.add_replay_buffer(np.hstack([state1_np,np.array([action,reward]),state2_np]))
if agent.replay_buffer.shape[0]>300:
agent.update_nn()
print(i,step,round(ep_r, 2))
break
else:
state1_np = np.array(state)
state2_np = np.array(next_state)
agent.add_replay_buffer(np.hstack([state1_np,np.array([action,reward]),state2_np]))
if agent.replay_buffer.shape[0]>300:
agent.update_nn()
state = next_state
step = step+1
self.plot_curve(agent)
return None
Thanks for your time!!
Loss in reinforcement learning is not very important. In fact, you can see agents with very good performance but bad results in terms of neural network loss.
I suggest you look into the hyperparameters of other implementations, since CartPole has been solved countless times with multiple algorithms.
The first thing that might be wrong is the experience replay buffer, which is too small. But then again, look into other DQN implementations for CartPole and try to use the same hyperparameters they are using.
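For a concrete point of comparison, values in the ballpark of commonly published CartPole DQN examples look roughly like the following; these numbers are assumptions drawn from typical implementations, not a verified recipe:

# Assumed, typical CartPole DQN settings (for comparison only)
HYPERPARAMS = {
    "replay_buffer_size": 50000,   # much larger than 2000
    "batch_size": 64,
    "gamma": 0.99,
    "learning_rate": 1e-3,
    "epsilon_start": 1.0,          # decay epsilon gradually instead of a hard switch
    "epsilon_end": 0.05,
    "epsilon_decay_steps": 10000,
    "target_update_every": 500,    # gradient steps between target-network syncs
}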
I am trying to run a NEAT algorithm using this Python implementation. This is the original file from the library that is relevant to my question:
from neat.graphs import feed_forward_layers
class FeedForwardNetwork(object):
def __init__(self, inputs, outputs, node_evals):
self.input_nodes = inputs
self.output_nodes = outputs
self.node_evals = node_evals
self.values = dict((key, 0.0) for key in inputs + outputs)
def activate(self, inputs):
if len(self.input_nodes) != len(inputs):
raise RuntimeError("Expected {0:n} inputs, got {1:n}".format(len(self.input_nodes), len(inputs)))
for k, v in zip(self.input_nodes, inputs):
self.values[k] = v
for node, act_func, agg_func, bias, response, links in self.node_evals:
node_inputs = []
for i, w in links:
node_inputs.append(self.values[i] * w)
s = agg_func(node_inputs)
self.values[node] = act_func(bias + response * s)
return [self.values[i] for i in self.output_nodes]
    @staticmethod
def create(genome, config):
""" Receives a genome and returns its phenotype (a FeedForwardNetwork). """
# Gather expressed connections.
connections = [cg.key for cg in genome.connections.values() if cg.enabled]
layers = feed_forward_layers(config.genome_config.input_keys, config.genome_config.output_keys, connections)
node_evals = []
for layer in layers:
for node in layer:
inputs = []
node_expr = [] # currently unused
for conn_key in connections:
inode, onode = conn_key
if onode == node:
cg = genome.connections[conn_key]
inputs.append((inode, cg.weight))
node_expr.append("v[{}] * {:.7e}".format(inode, cg.weight))
ng = genome.nodes[node]
aggregation_function = config.genome_config.aggregation_function_defs.get(ng.aggregation)
activation_function = config.genome_config.activation_defs.get(ng.activation)
node_evals.append((node, activation_function, aggregation_function, ng.bias, ng.response, inputs))
return FeedForwardNetwork(config.genome_config.input_keys, config.genome_config.output_keys, node_evals)
Since I evaluate the performance of my neural networks on a large dataset, I wanted to speed up the activate method using numba's JIT. In order not to fall back into numba's object mode, I had to update the implementation of the activate method (and hence also the fields of the FeedForwardNetwork class) to use only datatypes supported by numba. This is what I came up with (create is the same as before):
from neat.graphs import feed_forward_layers
from neat.six_util import itervalues
import numba
from numba import jit, njit
from numba.typed import List, Dict
import numpy as np
import math
@jit(nopython=True)
def activate(input_nodes, output_nodes, node_evals_node, node_evals_bias, node_evals_resp, node_evals_ins_nodes, node_evals_ins_conns, values, inputs):
for i in range(input_nodes.size):
values[input_nodes[i]] = inputs[i]
for node in range(len(node_evals_node)):
s = 0
for pred in range(len(node_evals_ins_nodes[node])):
s += values[node_evals_ins_nodes[node][pred]] * node_evals_ins_conns[node][pred]
values[node_evals_node[node]] = math.tanh(node_evals_bias[node] + node_evals_resp[node] * s)
return [values[output_nodes[i]] for i in range(output_nodes.size)]
class FeedForwardNetwork(object):
def __init__(self, inputs, outputs, node_evals):
self.input_nodes = np.array(inputs)
self.output_nodes = np.array(outputs)
# NODE_EVALS decomposition
self.node_evals_node = np.reshape(np.array(node_evals)[:, 0:1], (len(node_evals),)).astype(np.int64)
self.node_evals_bias = np.reshape(np.array(node_evals)[:, 3:4], (len(node_evals),)).astype(np.float64)
self.node_evals_resp = np.reshape(np.array(node_evals)[:, 4:5], (len(node_evals),)).astype(np.float64)
temp = np.array(node_evals)[:, 5:6]
self.node_evals_ins_nodes = List()
self.node_evals_ins_conns = List()
for node in range(temp.size):
l = List()
m = List()
for predecessor in range(len(temp[node])):
l.append(temp[0][node][predecessor][0])
m.append(temp[0][node][predecessor][1])
self.node_evals_ins_nodes.append(l)
self.node_evals_ins_conns.append(m)
self.values = Dict()
# Set types of dict
self.values[0] = float(1)
self.values.pop(0)
This is the code in which I call the create and activate methods:
def eval_single_genome(genome, config, thread_id, result):
net = neat.nn.FeedForwardNetwork.create(genome, config)
error_sum = 0
for i, row in PRICES.iterrows():
prediction = feed_forward.activate(net.input_nodes, net.output_nodes, net.node_evals_node, net.node_evals_bias, net.node_evals_resp, net.node_evals_ins_nodes, net.node_evals_ins_conns, net.values, np.array([0]))
error_sum += (prediction - PRICES.iloc[i]['open']) ** 2
result[thread_id] = error_sum
The code compiles and runs without errors or warnings, which (as far as I understand) indicates that numba should be able to optimize my implementation. But adding or removing the @jit(nopython=True) decorator doesn't change the runtime at all.
Did I overlook something? Or is there just nothing that numba can improve in my case?
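For reference, a generic way to confirm that a function was really compiled in nopython mode, and to keep compilation time out of the measurement, is sketched below; this is a general numba pattern, not something specific to the code above:

import time
import numpy as np
from numba import njit

@njit                        # njit is jit(nopython=True); it raises instead of silently falling back
def summed(xs):
    total = 0.0
    for x in xs:
        total += x
    return total

data = np.arange(1000000, dtype=np.float64)
summed(data)                 # the first call triggers compilation
print(summed.signatures)     # a non-empty list means a compiled version exists
start = time.perf_counter()
summed(data)                 # time a warm call only
print(time.perf_counter() - start)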
Here is my loss function; when I use it, it produces this error.
I have tested using nn.L1Loss() instead of my loss function, and with that the network is OK.
What should I do? Thanks for your help!
import torch
import torch.nn as nn

class LossV1(nn.Module):
def __init__(self,weight=1,pos_weight=1,scale_factor=2.5):
super(LossV1,self).__init__()
self.weight = weight
self.pos_weight = pos_weight
self.scale_factor = scale_factor
def forward(self,pred,truth):
objmask = torch.tensor(truth[:,:6,:,:],dtype=torch.float32,requires_grad=False)
        # Boxes without an object: the confidence loss is scaled by 0.4
objmask[objmask<0.65] = 0.4
        # Auxiliary boxes: coefficient 0.8
objmask[(objmask>0.649)*(objmask<0.949)] = 0.8
objLoss = torch.sum(objmask*self.myBCEWithLogitsLoss(pred[:,:6,:,:],truth[:,:6,:,:]))
        # Boxes without an object: only the confidence loss is computed
objmask[objmask<0.41] = 0
personLoss = torch.sum(objmask*self.myBCEWithLogitsLoss(pred[:,6:12,:,:],truth[:,6:12,:,:]))
carLoss = torch.sum(objmask*self.myBCEWithLogitsLoss(pred[:,12:18,:,:],truth[:,12:18,:,:]))
wLoss = torch.sum(objmask*self.myL2Loss(pred[:,18:24,:,:],truth[:,18:24,:,:]))
hLoss = torch.sum(objmask*self.myL2Loss(pred[:,24:,:,:],truth[:,24:,:,:]))
return objLoss+personLoss+carLoss+wLoss+hLoss
def myBCEWithLogitsLoss(self,x,y):
        # pos_weight > 1 increases recall, pos_weight < 1 improves precision
return -self.weight*(self.pos_weight*y*torch.log(torch.sigmoid(x))+(1-y)*torch.log(1-torch.sigmoid(x)))
def myL2Loss(self,x,y):
return torch.pow(self.scale_factor*torch.sigmoid(x/self.scale_factor) - y,2)
I just removed the objmask, computed it in my data-generation function instead, and passed it to the loss function together with the truth label, and now the network works.
What I can't understand is: I have already set requires_grad=False, so why does PyTorch still involve this tensor in the gradient computation?
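For what it's worth, requires_grad=False only means that no gradient is computed for that tensor itself; it does not stop the tensor from being saved for the backward pass of an operation it takes part in. A minimal standalone sketch of the effect (not this network):

import torch

pred = torch.rand(2, 6, 4, 4, requires_grad=True)
truth = torch.rand(2, 6, 4, 4)

objmask = truth.clone()            # requires_grad is False
loss = (objmask * pred).sum()      # objmask is saved to compute d(loss)/d(pred)
objmask[objmask < 0.65] = 0.4      # in-place edit bumps objmask's version counter
# loss.backward()                  # would now raise the "modified by an inplace operation" RuntimeError

Finishing every in-place edit of the mask before it is first multiplied with the predictions (or cloning it before each edit) avoids the error, which is effectively what the workaround above does.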
I'm trying to solve the 'BipedalWalker-v2' problem from OpenAI Gym, using Python and TensorFlow. To solve it I'm implementing an episodic policy gradient algorithm. Because the 'BipedalWalker-v2' actions are continuous, my policy is approximated by a multivariate Gaussian distribution whose mean is approximated by a fully connected neural network. My network has the following layers: [input: 24, hidden: 5, hidden: 5, output: 4]. My problem is that when I train the agent, the training process gets slower and slower until it almost freezes. My guess is that I'm misusing sess.run and not feeding the batches in an efficient way, but that is just a guess. My questions are: Is my guess correct? If so, how can I improve it? And if it is something else, what is it? I'm not looking for a literal solution; I just want some pointers on how to improve the training.
Thanks in advance,
My computer is an Inspiron 15 7000 Gaming: NVIDIA GeForce GTX 1050, 8 GB RAM, CPU: i5.
My CODE:
Libraries:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import matplotlib.pyplot as plt
Agent class:
class agent_episodic_continuous_action():
def __init__(self, lr, s_size,a_size,batch_size,dist_type):
self.stuck = False
self.gamma = 0.99
self.dist_type = dist_type
self.is_brain_present = False
self.s_size = s_size
self.batch_size=batch_size
self.state_in= tf.placeholder(shape=[None,s_size],dtype=tf.float32)
self.a_size=a_size
self.reward_holder = tf.placeholder(shape=[None],dtype=tf.float32)
self.cov = tf.eye(a_size)
self.reduction = 0.01
if a_size > 1:
self.action_holder = tf.placeholder(shape=[None,a_size],dtype=tf.float32)
else:
self.action_holder = tf.placeholder(shape=[None],dtype=tf.float32)
self.gradient_holders = []
self.optimizer = tf.train.AdamOptimizer(learning_rate=lr)
def save_model(self,path,sess):
self.saver.save(sess, path)
def load_model(self,path,sess):
self.saver.restore(sess, path)
def create_brain(self,hidd_layer,hidd_layer_act_fn,output_act_fn):
self.is_brain_present = True
hidden_output=slim.stack(self.state_in,slim.fully_connected,hidd_layer,activation_fn=hidd_layer_act_fn)
self.output = slim.fully_connected(hidden_output,self.a_size,activation_fn=output_act_fn,biases_initializer=None)
def create_pi_dist(self):
if self.dist_type == "normal":
# amplify= tf.pow(slim.fully_connected(self.output,1,activation_fn=None,biases_initializer=None),2)
mean= self.output
#cov =tf.eye(self.a_size,batch_shape=[self.batch_size])*amplify
normal = tf.contrib.distributions.MultivariateNormalFullCovariance(
loc=mean,
covariance_matrix=self.cov*self.reduction)
self.dist = normal
def create_loss(self):
self.loss = -tf.reduce_mean(tf.log(self.dist.prob(self.action_holder))*self.reward_holder)
def get_gradients_holder(self):
for idx,var in enumerate(self.tvars):
placeholder = tf.placeholder(tf.float32,name=str(idx)+'_holder')
self.gradient_holders.append(placeholder)
def sample_action(self,sess,state):
sample_action= sess.run(self.dist.sample(),feed_dict={self.state_in:state})
return sample_action
def calculate_loss_gradient(self):
self.gradients = tf.gradients(self.loss,self.tvars)
def update_weights(self):
self.update_batch = self.optimizer.apply_gradients(zip(self.gradients,self.tvars))
return self.update_batch
def memorize_data(self,episode,first):
if first:
self.episode_history = episode
self.stuck = False
else:
self.episode_history = np.vstack((self.episode_history,episode))
def shuffle_memories(self):
np.random.shuffle(self.episode_history)
def create_graph_connections(self):
if self.is_brain_present:
self.create_pi_dist()
self.create_loss()
self.tvars = tf.trainable_variables()
self.calculate_loss_gradient()
self.saver = tf.train.Saver()
self.update_weights()
else:
print("initialize brain first")
self.init = tf.global_variables_initializer()
def memory_batch_generator(self):
total=self.episode_history.shape[0]
amount_of_batches= int(total/self.batch_size)
for i in range(amount_of_batches+1):
if i < amount_of_batches:
top=(i+1)*self.batch_size
bottom =i*self.batch_size
yield (self.episode_history[bottom:top,0:self.s_size],self.episode_history[bottom:top,self.s_size:self.s_size+self.a_size],self.episode_history[bottom:top,self.s_size+self.a_size:self.s_size+self.a_size+1],self.episode_history[bottom:top,self.s_size+self.a_size+1:])
else:
yield (self.episode_history[top:,0:self.s_size],self.episode_history[top:,self.s_size:self.s_size+self.a_size],self.episode_history[top:,self.s_size+self.a_size:self.s_size+self.a_size+1],self.episode_history[top:,self.s_size+self.a_size+1:])
def train_with_current_memories(self,sess):
self.sess = sess
for step_sample_batch in self.memory_batch_generator():
sess.run(self.update_weights(), feed_dict={self.state_in:step_sample_batch[0],self.action_holder:step_sample_batch[1],self.reward_holder:step_sample_batch[2].reshape([step_sample_batch[2].shape[0]])})
def get_returns(self):
self.episode_history[:,self.s_size+self.a_size:self.s_size+self.a_size+1] = self.discount_rewards(self.episode_history[:,self.s_size+self.a_size:self.s_size+self.a_size+1])
def discount_rewards(self,r):
""" take 1D float array of rewards and compute discounted reward """
discounted_r = np.zeros_like(r)
running_add = 0
for t in reversed(range(0, r.size)):
running_add = running_add * self.gamma + r[t]
discounted_r[t] = running_add
return discounted_r
def prob_action(self,sess,action,state):
prob = sess.run(self.dist.prob(action),feed_dict={self.state_in:state})
return prob
def check_movement(self):
ep_back = 5
jump = 3
threshold = 3
if len(self.episode_history) > ep_back*2:
difference = sum(abs(self.episode_history[-ep_back:-1,:]-self.episode_history[-ep_back-jump:-1-jump,:]).flatten())
print(difference)
if difference < threshold:
self.stuck = True
def print_last_n_returns(self,n):
if len(self.episode_history[:,self.s_size+self.a_size:self.s_size+self.a_size+1])>n:
n_returns = sum(self.episode_history[-n:,self.s_size+self.a_size:self.s_size+self.a_size+1])/float(n)
print(n_returns)
return n_returns
Training loops:
tf.reset_default_graph()
agent_2= agent_episodic_continuous_action(1e-2,s_size=24,a_size=4,batch_size=30,dist_type="normal")
agent_2.create_brain([5,5],tf.nn.relu,None)
agent_2.create_graph_connections()
env = gym.make('BipedalWalker-v2')
with tf.Session() as sess:
sess.run(agent_2.init)
for i in range(200):
s = env.reset()
d = False
a=agent_2.sample_action(sess,[s])[0]
print(a)
if None in a:
print("None in a! inside for")
print(s)
s1,r,d,_ = env.step(a)
episode = np.hstack((s,a,r,s1))
agent_2.memorize_data(episode=episode,first=True)
count = 0
while not d:
count = count + 1
s = s1
a=agent_2.sample_action(sess,[s])[0]
s1,r,d,_ = env.step(a)
episode = np.hstack((s,a,r,s1))
# env.render()
agent_2.memorize_data(episode=episode,first=False)
# print(s1)
if count % 5 == 0 :
agent_2.check_movement()
if agent_2.stuck:
d = True
agent_2.get_returns()
agent_2.print_last_n_returns(20)
agent_2.shuffle_memories()
agent_2.train_with_current_memories(sess)
env.close()
For each batch of 30 samples I execute Agent.update_weights():
def update_weights(self):
self.update_batch = self.optimizer.apply_gradients(zip(self.gradients,self.tvars))
When I execute:
def train_with_current_memories(self,sess):
self.sess = sess
for step_sample_batch in self.memory_batch_generator():
sess.run(self.update_weights(), feed_dict={self.state_in:step_sample_batch[0],self.action_holder:step_sample_batch[1],self.reward_holder:step_sample_batch[2].reshape([step_sample_batch[2].shape[0]])})
Or maybe this sluggishness is expected behavior?
The code was slowing down because the graph was getting bigger at each iteration: I was creating new graph elements inside the iteration loop.
During each iteration the following function was being called:
def update_weights(self):
self.update_batch = self.optimizer.apply_gradients(zip(self.gradients,self.tvars))
return self.update_batch
This function was adding a new element to the graph on every call.
The best way to avoid "graph leaking" is to add the line
sess.graph.finalize()
as soon as you create your session. This way, if the graph is leaking, TensorFlow will raise an exception.
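A minimal standalone illustration of both points, i.e. building the training op once outside the loop and finalizing the graph so that any later graph construction raises; the variable names here are illustrative, not taken from the agent code:

import numpy as np
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, 4])
w = tf.Variable(tf.zeros([4, 1]))
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)   # built once

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.graph.finalize()   # any attempt to add graph nodes from now on raises
    for _ in range(100):
        # calling optimizer.minimize() or apply_gradients() here would add new
        # nodes on every iteration and make the graph (and sess.run) slower
        sess.run(train_op, feed_dict={x: np.random.randn(30, 4).astype(np.float32)})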