Can keeping channels more than '3' in images crash CNNs? - python

I have an encoder model which was working fine with single channel 1024,1024 images, I'm trying to patch the original images (mega pixel images) to 256, 256, 64 images. I've changed my encoder input to match the images input that the model will get. The model call function is working fine, loss is getting calculated fine, but I'm getting the following error with tape.gradient:
2023-01-29 17:11:01.868555: F tensorflow/stream_executor/cuda/] Check failed: cudnnSetTensorNdDescriptor(handle_.get(), elem_type, nd,, == CUDNN_STATUS_SUCCESS (3 vs. 0)batch_descriptor: {count: 10 feature_map_count: 64 spatial: 0 0 value_min: 0.000000 value_max: 0.000000 layout: BatchYXDepth} C:\ProgramData\Anaconda3\lib\site-packages\joblib\externals\loky\backend\ UserWarning: resource_tracker: There appear to be 2 leaked folder objects to clean up at shutdown warnings.warn('resource_tracker: There appear to be %d ' C:\ProgramData\Anaconda3\lib\site-packages\joblib\externals\loky\backend\ UserWarning: resource_tracker: C:\Users\kjhan\AppData\Local\Temp\joblib_memmapping_folder_12248_772bbeeeccff43089fa0e6d75271eebd_97f2f7c6edd04b468a4360bf96b91b84: FileNotFoundError(2, 'The system cannot find the path specified') warnings.warn('resource_tracker: %s: %r' % (name, e)) C:\ProgramData\Anaconda3\lib\site-packages\joblib\externals\loky\backend\ UserWarning: resource_tracker: C:\Users\kjhan\AppData\Local\Temp\joblib_memmapping_folder_12248_29db2f1e8ff54416b9a78c6f69dcff23_40a85063390f46d38d15c1877f99acc8: FileNotFoundError(2, 'The system cannot find the path specified') warnings.warn('resource_tracker: %s: %r' % (name, e)) [I 17:11:10.131 NotebookApp] KernelRestarter: restarting kernel (1/5), keep random ports kernel 286d1cc6-8ddd-46f9-baf7-5e1b05a2d033 restarted
My code is as below
class encoder(tf.keras.layers.Layer):
def __init__(self,size:tuple):
super(encoder, self).__init__()
#encoder Module
self.input_cnn = keras.layers.InputLayer(input_shape=(size[0],size[1],size[2]))
# Ex0panding features for computation
self.conv_1 = keras.layers.Conv2D(input_shape=(size[0],size[1],size[2]),filters=16,kernel_size=(3,3),padding='same',activation='relu')
# 1/4 size reduction
self.conv_2 = keras.layers.MaxPool2D(pool_size=(2,2),strides=(2,2))
self.conv_3 = keras.layers.Conv2D(filters = 16,kernel_size=(4,4),strides=(2,2),padding='same',activation='relu')
self.conv_4 = keras.layers.Conv2D(filters = 32,kernel_size=(4,4),strides=(4,4),padding='same',activation='relu')
self.conv_5 = keras.layers.BatchNormalization()
# 1/2 size reduction
self.conv_6 = keras.layers.MaxPool2D(pool_size=(2,2),strides=(2,2))
# 3x3 Filter third application
self.conv_7 = keras.layers.Conv2D(filters = 64,kernel_size=(8,8),strides=(8,8),padding='same',activation='relu')
# 1/4 size reduction
self.conv_8 = keras.layers.MaxPool2D(pool_size=(2,2),strides=(2,2))
# 3x3 Filter third application
self.conv_9 = keras.layers.BatchNormalization()
self.conv_10 = keras.layers.Conv2D(filters = 1 ,kernel_size=(3,3),strides=(1,1),padding='same',activation='relu')
def call(self,inputs,training = True):
x = self.input_cnn(inputs)
x = self.conv_1(x)
x = self.conv_2(x)
x = self.conv_3(x)
x = self.conv_4(x)
if training == True:
x = self.conv_5(x,training = True)
x = self.conv_5(x,training = False)
x = self.conv_6(x)
x = self.conv_7(x)
x = self.conv_8(x)
if training == True:
x = self.conv_9(x,training = True)
x = self.conv_9(x,training = False)
x = self.conv_10(x)
return x
size 0 is 256
size 1 is 256
size 2 is 64
Train_step from main model:
def __init__(self, size: tuple, optimizer = keras.optimizers.Adam(learning_rate=1e-3),loss_fn = keras.losses.BinaryCrossentropy(from_logits=False),metric = tf.keras.metrics.Accuracy()):
super(BCDClassifier, self).__init__()
#Input for catagorical data
self.input_cat = keras.layers.InputLayer(input_shape = (2,))
#Encoder Layer for each view
self.encode = encoder(size)
#flatten encoded output
self.flatten = keras.layers.Flatten()
#Concatenate Layer
self.concat = keras.layers.Concatenate(axis = 1)
#Classifier layer
self.classify = classifier(32)
#deffine model parameters
self.optimizer = optimizer
self.loss_fn = loss_fn
self.loss_tracker = keras.metrics.Mean(name="loss")
self.acc_tracker = metric
self.f1_tracker = tfa.metrics.F1Score(num_classes=2, threshold=0.5, average = 'micro')
self.sk_metric_acc = accuracy_score
self.sk_metric_f1 = f1_score
self.acc_history = []
self.loss_history = []
self.f1_history = []
# Forward pass of model - order does matter.
def call(self, cat_batch, view_batch, images_batch, training = True):
x1 = self.encode(images_batch,training)
x2 = self.input_cat(cat_batch)
x1 = self.flatten(x1)
x12 = self.concat([x1,x2])
x12 = self.classify(x12)
return x12
def train_step(self,cat_batch, views_batch, images_batch, target_batch, training = True):
with tf.GradientTape() as tape:
logits = self(cat_batch, views_batch, images_batch,training)
loss_value = self.loss_fn(target_batch, logits)
grads = tape.gradient(loss_value, self.trainable_weights)
self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
pred = []
target = []
threshold = 0.5
for val in logits.numpy():
if isinstance(val,np.ndarray):
for v_1 in val:
if isinstance(v_1,np.ndarray):
for v_2 in v_1:
if v_2 > threshold:
if v_1 > threshold:
if val > threshold:
for val in target_batch:
if isinstance(val,np.ndarray):
for v_1 in val:
if isinstance(v_1,np.ndarray):
for v_2 in v_1:
acc = self.sk_metric_acc(target,pred)
f1 = self.sk_metric_f1(target,pred)
return {"Loss": self.loss_tracker.result(), "Accuracy": acc, 'F1-score':f1}


for loop sending wrong data to list

Below is the code, i am running a for loop to train on different training sizes. The first loop works correctly, where when training begins, the training and validation accuracy are sent to a list, then a frame then finally a csv. But on the subsequent loops, a data generator is sent to the list. Can anyone see where the issue is, because I cant find it.
Also if you have a better way of doing this (data compiling for analysis), I'm all ears.
The first block is the code snippet, the second block is the full code. The for loop starts about halfway down.
for i in range(1,6):
training_loader, validation_loader, training_ones, training_zeros, validation_ones, validation_zeros = switcher().sets(case)
train_accuracy = []
val_accuracy = []
start_time = time.time()
for epoch in tqdm(range(1, epochs + 1), total=epochs):
train_acc = test(training_loader)
val_acc = test(validation_loader)
accuracy = pd.DataFrame()
accuracy['train_acc'] = train_accuracy
accuracy['val_acc'] = val_accuracy
accuracy.to_csv(f'C:\\Users\\Anthony Sirico\\Documents\\GitHub\\PyGeo_Circuit_exp\\PyGeo_Circuit_exp\\imbalance_exp\\csv files\\accuracy_{i}.csv')
import sys
sys.path.insert(0, 'C:\\Users\\user\\Desktop\\imbalance_exp\\imbalance_exp\\imbalance_exp')
import torch
from torch_geometric.loader import DataLoader
import imb_dataset as imb
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GraphConv
from torch_geometric.nn import global_mean_pool
import as neptune
import pandas as pd
from sklearn.metrics import confusion_matrix, matthews_corrcoef
import seaborn as sns
from import File
from tqdm import tqdm
import time
known = imb.ImbalanceDataset(root='imb_50v2', set='known', split=0.5)
unknown = imb.ImbalanceDataset(root='imb_50v2', set='unknown', split=0.5)
all_data = imb.ImbalanceDataset(root='imb_50v2', set='All', split=None)
known = known.shuffle()
lr = 0.001
training_perc = 0.9
N = len(known)
mini_batch_size = 32
epochs = 600
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
case = 2
class switcher:
def sets(self, case):
default = known
return getattr(self, 'case_' + str(case), lambda: default)()
def case_1(self):
training_set = known[:int(training_perc*len(known))]
validation_set = known[int(training_perc*len(known)):]
training_loader = DataLoader(training_set, batch_size=mini_batch_size, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=mini_batch_size, shuffle=False)
training_ones = []
training_zeros = []
validation_ones = []
validation_zeros = []
for i in range(len(training_set)):
if training_set[i].y == 1:
for i in range(len(validation_set)):
if validation_set[i].y == 1:
return training_loader, validation_loader, training_ones, training_zeros, validation_ones, validation_zeros
def case_2(self):
one_index = round(len(known) * 0.25)
known_ones = known[:one_index].copy()
known_zeros = known[one_index:].copy()
training_ones = known_ones[:int(training_perc*len(known_ones))]
training_zeros = known_zeros[:len(training_ones)]
training_set =[training_ones, training_zeros])
validation_ones = known_ones[int(training_perc*len(known_ones)):]
validation_zeros = known_zeros[len(training_ones):]
validation_set =[validation_ones, validation_zeros])
training_loader = DataLoader(training_set, batch_size=mini_batch_size, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=mini_batch_size, shuffle=False)
training_ones = []
training_zeros = []
validation_ones = []
validation_zeros = []
for i in range(len(training_set)):
if training_set[i].y == 1:
for i in range(len(validation_set)):
if validation_set[i].y == 1:
return training_loader, validation_loader, training_ones, training_zeros, validation_ones, validation_zeros
class GCN(torch.nn.Module):
def __init__(self, hidden_channels):
super(GCN, self).__init__()
self.conv1 = GraphConv(known.num_node_features, hidden_channels)
self.conv2 = GraphConv(hidden_channels, hidden_channels)
self.conv3 = GraphConv(hidden_channels, hidden_channels)
self.lin = Linear(hidden_channels, known.num_classes)
def forward(self, x, edge_index, batch):
# 1. Obtain node embeddings
x = self.conv1(x, edge_index)
x = x.relu()
x = self.conv2(x, edge_index)
x = x.relu()
x = self.conv3(x, edge_index)
# 2. Readout layer
x = global_mean_pool(x, batch) # [batch_size, hidden_channels]
# 3. Apply a final classifier
x = F.dropout(x, p=0.5,
x = self.lin(x)
return x
model = GCN(hidden_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()
def train():
total_loss = 0
for data in training_loader: # Iterate in batches over the training dataset.
data =
out = model(data.x, data.edge_index, data.batch) # Perform a single forward pass.
loss = criterion(out, data.y) # Compute the loss solely based on the training nodes.
loss.backward() # Derive gradients.
optimizer.step() # Update parameters based on gradients.
optimizer.zero_grad() # Clear gradients.
def test(loader):
correct = 0
for data in loader: # Iterate in batches over the training/test dataset.
data =
out = model(data.x, data.edge_index, data.batch)
pred = out.argmax(dim=1) # Use the class with highest probability.
correct += int((pred == data.y).sum()) # Check against ground-truth labels.
return correct / len(loader.dataset) # Derive ratio of correct predictions.
output_frame = pd.DataFrame(columns=['epoch', 'lr', 'known', 'unknown', 'train_ones', 'train_zeros', 'val_ones', 'val_zeros', 'tn_all', 'fp_all', 'fn_all', 'tp_all', 'tn_known', 'fp_known', 'fn_known', 'tp_known', 'precision_all', 'recall_all', 'f1_all', 'accuracy_all', 'mcc_all', 'precision_known', 'recall_known', 'f1_known', 'accuracy_known', 'mcc_known', 'time_elapsed'])
for i in range(1,6):
training_loader, validation_loader, training_ones, training_zeros, validation_ones, validation_zeros = switcher().sets(case)
train_accuracy = []
val_accuracy = []
start_time = time.time()
for epoch in tqdm(range(1, epochs + 1), total=epochs):
train_acc = test(training_loader)
val_acc = test(validation_loader)
accuracy = pd.DataFrame()
accuracy['train_acc'] = train_accuracy
accuracy['val_acc'] = val_accuracy
accuracy.to_csv(f'C:\\Users\\Anthony Sirico\\Documents\\GitHub\\PyGeo_Circuit_exp\\PyGeo_Circuit_exp\\imbalance_exp\\csv files\\accuracy_{i}.csv')
unknown_loader = DataLoader(unknown, batch_size=1, shuffle=False)
predictions = []
all_correct = 0
known_correct = 0
for test in unknown_loader:
test =
out = model(test.x, test.edge_index, test.batch)
pred = out.argmax(dim=1)
all_correct += int((pred == test.y_all).sum())
known_correct += int((pred == test.y_known).sum())
pred_df = pd.DataFrame()
pred_df['y_all_true'] = [i.item() for i in]
pred_df['y_known_true'] = [i.item() for i in]
pred_df['y_pred'] = [i.item() for i in predictions]
pred_df.to_csv(f'C:\\Users\\Anthony Sirico\\Documents\\GitHub\\PyGeo_Circuit_exp\\PyGeo_Circuit_exp\\imbalance_exp\\csv files\\pred_df_{i}.csv')
cf_matrix_all = confusion_matrix(pred_df['y_all_true'], pred_df['y_pred'])
ax = sns.heatmap(cf_matrix_all, annot=True, fmt='g', cmap='Blues')
ax.title.set_text('Confusion Matrix based on all data')
tn_all, fp_all, fn_all, tp_all = cf_matrix_all.ravel()
end_time = time.time()
time_elapsed = end_time - start_time
precision_all = tp_all / (tp_all + fp_all)
recall_all = tp_all / (tp_all + fn_all)
f1_all = 2 * (precision_all * recall_all) / (precision_all + recall_all)
accuracy_all = (tp_all + tn_all) / (tp_all + tn_all + fp_all + fn_all)
mcc_all = matthews_corrcoef(pred_df['y_all_true'], pred_df['y_pred'])
cf_matrix_known = confusion_matrix(pred_df['y_known_true'], pred_df['y_pred'])
ax = sns.heatmap(cf_matrix_known, annot=True, fmt='g', cmap='Blues')
ax.title.set_text('Confusion Matrix based on known data')
tn_known, fp_known, fn_known, tp_known = cf_matrix_known.ravel()
precision_known = tp_known / (tp_known + fp_known)
recall_known = tp_known / (tp_known + fn_known)
f1_known = 2 * (precision_known * recall_known) / (precision_known + recall_known)
accuracy_known = (tp_known + tn_known) / (tp_known + tn_known + fp_known + fn_known)
mcc_known = matthews_corrcoef(pred_df['y_known_true'], pred_df['y_pred'])
#'epoch', 'lr', 'known', 'unknown', 'train_ones', 'train_zeros', 'val_ones', 'val_zeros', 'tn_all', 'fp_all', 'fn_all', 'tp_all', 'tn_known', 'fp_known', 'fn_known', 'tp_known
output_frame.loc[i] = [epochs, lr, len(known), len(unknown), len(training_ones), len(training_zeros), len(validation_ones), len(validation_zeros), tn_all, fp_all, fn_all, tp_all, tn_known, fp_known, fn_known, tp_known, precision_all, recall_all, f1_all, accuracy_all, mcc_all, precision_known, recall_known, f1_known, accuracy_known, mcc_known, time_elapsed]
output_frame.to_csv('C:\\Users\\Anthony Sirico\\Documents\\GitHub\\PyGeo_Circuit_exp\\PyGeo_Circuit_exp\\imbalance_exp\\csv files\\final_output.csv')
training_perc -= 0.2

ValueError: operands could not be broadcast together with shapes (1,1,1500) (1,512)

For my project I am trying to compare between LSTM and GRU for video captioning. I started with an open source LSTM code. And after making changes for the GRU the model trains perfectly. But I am getting the mentioned value error when I am trying to test. So I am attaching the code of testing below
###Testing Block###
# class to perform inference on all test files and save as test_output.txt
class Video2Text(object):
''' Initialize the parameters for the model '''
def __init__(self):
self.latent_dim = 512
self.num_encoder_tokens = 4096
self.num_decoder_tokens = 1500
self.time_steps_encoder = 80
self.time_steps_decoder = None
self.preload = True
self.preload_data_path = 'preload_data'
self.max_probability = -1
# processed data
self.encoder_input_data = []
self.decoder_input_data = []
self.decoder_target_data = []
self.tokenizer = None
# models
self.encoder_model = None
self.decoder_model = None
self.inf_encoder_model = None
self.inf_decoder_model = None
self.save_model_path = 'model_final'
self.test_path = 'testing_data'
def load_inference_models(self):
# load tokenizer
with open(os.path.join(self.save_model_path, 'tokenizer' + str(self.num_decoder_tokens)), 'rb') as file:
self.tokenizer = joblib.load(file)
# inference encoder model
self.inf_encoder_model = load_model(os.path.join(self.save_model_path, 'encoder_model.h5'))
# inference decoder model
decoder_inputs = Input(shape=(None, self.num_decoder_tokens))
decoder_dense = Dense(self.num_decoder_tokens, activation='softmax')
decoder_gru = GRU(self.latent_dim, return_sequences=True, return_state=True)
decoder_state_input_h = Input(shape=(self.latent_dim,))
decoder_state_input_c = Input(shape=(self.latent_dim,))
#decoder_states_inputs = [decoder_state_input_h,decoder_state_input_c]
decoder_outputs, state_h = decoder_gru(decoder_inputs, initial_state=decoder_state_input_h)
decoder_states = [state_h]
decoder_outputs = decoder_dense(decoder_outputs)
self.inf_decoder_model = Model(
[decoder_inputs] + decoder_state_input,
[decoder_outputs] + decoder_states)
self.inf_decoder_model.load_weights(os.path.join(self.save_model_path, 'decoder_model_weights.h5'))
def decode_sequence2bs(self, input_seq):
states_value = self.inf_encoder_model.predict(input_seq)
target_seq = np.zeros((1, self.num_decoder_tokens))
target_seq[0, self.tokenizer.word_index['bos']] = 1
caption = self.greedy_search(input_seq)
return caption
def greedy_search(self, f):
:param f: the loaded numpy array after creating videos to frames and extracting features
:return: the final sentence which has been predicted greedily
inv_map = self.index_to_word()
states_value = self.inf_encoder_model.predict(f.reshape(-1, 80, 4096))
target_seq = np.zeros((1, 1500))
final_sentence = ''
target_seq[0, self.tokenizer.word_index['bos']] = 1
for i in range(15):
output_tokens, h = self.inf_decoder_model.predict([target_seq] + states_value)
states_value = [h]
output_tokens = output_tokens.reshape(self.num_decoder_tokens)
y_hat = np.argmax(output_tokens)
if y_hat == 0:
if inv_map[y_hat] is None:
if inv_map[y_hat] == 'eos':
final_sentence = final_sentence + inv_map[y_hat] + ' '
target_seq = np.zeros((1, 1500))
target_seq[0, y_hat] = 1
return final_sentence
def decoded_sentence_tuning(self, decoded_sentence):
decode_str = []
filter_string = ['bos', 'eos']
unigram = {}
last_string = ""
for idx2, c in enumerate(decoded_sentence):
if c in unigram:
unigram[c] += 1
unigram[c] = 1
if(last_string == c and idx2 > 0):
if c in filter_string:
if len(c) > 0:
if idx2 > 0:
last_string = c
return decode_str
def index_to_word(self):
# inverts word tokenizer
index_to_word = {value: key for key, value in self.tokenizer.word_index.items()}
return index_to_word
def get_test_data(self, path):
X_test = []
X_test_filename = []
%cd /content/drive/My\ Drive/Video-Captioning-main/data/
with open (os.path.join(path, 'testing_id.txt')) as testing_file:
lines = testing_file.readlines()
for filename in lines:
filename = filename.strip()
f = np.load(os.path.join(path , 'feat', filename + '.npy'))
X_test = np.array(X_test)
return X_test, X_test_filename
def test(self):
X_test, X_test_filename = self.get_test_data(os.path.join(self.test_path))
print(len(X_test), len(X_test_filename))
# generate inference test outputs
%cd /content/drive/My\ Drive/Video-Captioning-main/model_final/
with open(os.path.join(self.save_model_path, 'test_output_greedy.txt'), 'w') as file:
for idx, x in enumerate(X_test):
decoded_sentence = self.decode_sequence2bs(x.reshape(-1, 80, 4096))
file.write(decoded_sentence + ' ')
# re-init max prob
#self.max_probability = -1
And the code for building the GRU. In here you can see that encoder_gru outputs one state in addition to encoder_out. But in LSTM it provided 2 additional outputs. So I think this is finally causing the error. Because then in lstm I got (1,1,1500) and (2,1,512).
# Setting up the encoder
encoder_inputs = Input(shape=(time_steps_encoder, num_encoder_tokens), name="encoder_inputs")
encoder_gru = GRU(latent_dim, return_state=True,return_sequences=True, name='endcoder_gru')
encoder_out, encoder_state = encoder_gru(encoder_inputs)
# Set up the decoder
decoder_inputs = Input(shape=(time_steps_decoder, num_decoder_tokens), name= "decoder_inputs")
decoder_gru = GRU(latent_dim, return_sequences=True, return_state=True, name='decoder_gru')
decoder_outputs, decoder_state = decoder_gru(decoder_inputs, initial_state=encoder_state)
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_relu')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
plot_model(model, to_file='model_train.png', show_shapes=True, show_layer_names=True)
And following is the exact line where I am getting the error:
<ipython-input-66-ebbb2fcfd630> in greedy_search(self, f)
78 print(target_seq.shape)
79 for i in range(15):
---> 80 output_tokens, h = self.inf_decoder_model.predict([target_seq] + states_value)
81 states_value = [h]
82 output_tokens = output_tokens.reshape(self.num_decoder_tokens)
ValueError: operands could not be broadcast together with shapes (1,1,1500) (1,512)
In testing block I tried with making the shapes (1,1500) and (1,512), but even then it gives the same error.

Adding neurons to Adam optimizer state in Pytorch

I've posted the following to the Pytorch discussion board too. I'm trying to keep the learning-rates per parameter for the already existing parameters when adding more neurons (to existing layers, not new layers) to a network. I’ve written the following class which allows me to add neurons to hidden layers during training:
import torch
import torch.nn as nn
class DQN(nn.Module):
def __init__(self, num_inputs, hidden, num_actions, non_linearity):
super(DQN, self).__init__()
self.num_inputs = num_inputs
self.hidden = hidden
self.num_actions = num_actions
self.non_linearity = non_linearity
self.layers = nn.ModuleList()
self.layers.append(nn.Linear(num_inputs, self.hidden[0]))
previous = self.hidden[0]
for hidden_layer_size in self.hidden[1:]:
self.layers.append(nn.Linear(previous, hidden_layer_size))
previous = hidden_layer_size
self.layers.append(nn.Linear(previous, num_actions))
def forward(self, x):
for i in range(len(self.layers) - 1):
x = self.non_linearity(self.layers[i](x))
return self.layers[-1](x)
def increase_capacity(self, increment):
for i in range(len(self.hidden)):
self.hidden[i] += increment[i]
bias = self.layers[0]
weight = self.layers[0]
self.layers[0] = nn.Linear(self.num_inputs, self.hidden[0])
if increment[0]>0:
self.layers[0][0:-increment[0],:] = weight
self.layers[0][0:-increment[0]] = bias
self.layers[0][0:,:] = weight
self.layers[0] = bias
for i in range(1, len(self.layers) - 1):
bias = self.layers[i]
weight = self.layers[i]
self.layers[i] = nn.Linear(self.hidden[i-1], self.hidden[i])
if increment[i] > 0:
if increment[i-1] >0:
self.layers[i][0:-increment[i]] = bias
self.layers[i][0:-increment[i],0:-increment[i-1]] = weight
self.layers[i][0:-increment[i]] = bias
self.layers[i][0:-increment[i],0:] = weight
if increment[i-1] >0:
self.layers[i] = bias
self.layers[i][0:,0:-increment[i-1]] = weight
self.layers[i] = bias
self.layers[i][0:,0:] = weight
bias = self.layers[-1]
weight = self.layers[-1]
self.layers[-1] = nn.Linear(self.hidden[-1], self.num_actions)
if increment[-1] >0:
self.layers[-1] = bias
self.layers[-1][:,0:-increment[-1]] = weight
self.layers[-1] = bias
self.layers[-1][:,0:] = weight
def act(self, state, epsilon, mask):
if np.random.rand() > epsilon:
state = torch.tensor([state], dtype=torch.float32, device=device)
mask = torch.tensor([mask], dtype=torch.float32, device=device)
q_values = self.forward(state) + mask
action = q_values.max(1)[1].view(1, 1).item()
action = np.random.randint(self.num_actions)
return action
Now I’ve written a little sanity check (whether it leads to sanity is questionable at this point): a network with 2 layers with both 1 neuron should fail to learn the x-or function, whereas a network where 4 neurons have been added should. If I initialise a new optimiser this indeed works. The optimiser I use is Adam, which keeps track of learning-rates per parameter. I’d like to keep the learning-rates of Adam for the weights and biases that already existed before I add additional neurons. The following is my failed attempt to doing so:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
# Credits to Alvations
def generate_zero():
return random.uniform(0, 49) / 100
def generate_one():
return random.uniform(50, 100) / 100
def generate_xor_XY(num_data_points):
Xs, Ys = [], []
for _ in range(num_data_points):
# xor(0, 0) -> 0
Xs.append([generate_zero(), generate_zero()]); Ys.append([0])
# xor(1, 0) -> 1
Xs.append([generate_one(), generate_zero()]); Ys.append([1])
# xor(0, 1) -> 1
Xs.append([generate_zero(), generate_one()]); Ys.append([1])
# xor(1, 1) -> 0
Xs.append([generate_one(), generate_one()]); Ys.append([0])
return Xs, Ys
# Initialisation
network = DQN(2,[1,1],1,F.relu)
# optimizer = optim.Adam(network.parameters(), amsgrad=False)
optimizer = optim.Adam(network.parameters(), amsgrad=True)
criterion = nn.MSELoss()
# Train 50000 steps to show 1 neuron cannot solve x-or task
for i in range(50000):
Xs, Ys = generate_xor_XY(1)
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys, dtype=torch.float)
prediction = network(Xs)
loss = criterion(prediction, Ys)
print(network(torch.tensor([[1,0],[0,1],[1,1],[0,0]], dtype=torch.float)))
# Add 5 neurons to first layer
capacity = [4,4]
# Uncomment the following line and comment the lines following it for normal initialisation.
# optimizer = optim.Adam(network.parameters(), amsgrad=True)
nw_param = [p for p in network.parameters()]
new_param_group = []
layer_idx = 0
for idx, group in enumerate(optimizer.param_groups):
for idx_p, p in enumerate(group['params']):
# Save previous information
prev_grad = p.grad
old_p = copy.deepcopy(p)
old_state = copy.copy(optimizer.state[p])
old_step = old_state['step']
old_exp_avg = old_state['exp_avg']
old_exp_avg_sq = old_state['exp_avg_sq']
old_max_exp_avg_sq = old_state['max_exp_avg_sq']
# Remove old parameter from state
# Weights
if p.dim()>1:
p = nn.Parameter(nw_param[layer_idx])
p.grad = torch.zeros_like(p)
new_exp_avg = torch.torch.zeros_like(p)
new_exp_avg_sq = torch.torch.zeros_like(p)
new_max_exp_avg_sq = torch.torch.zeros_like(p)
p.grad[0:prev_grad.size(0),0:prev_grad.size(1)] = prev_grad
optimizer.state[p]['step'] = old_step
optimizer.state[p]['exp_avg'] = new_exp_avg
optimizer.state[p]['exp_avg'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_exp_avg
optimizer.state[p]['exp_avg_sq'] = new_exp_avg_sq
optimizer.state[p]['exp_avg_sq'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'] = new_max_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'][0:prev_grad.size(0),0:prev_grad.size(1)] = old_max_exp_avg_sq
# Biases
p = nn.Parameter(nw_param[layer_idx])
p.grad = torch.zeros_like(p)
new_exp_avg = torch.zeros_like(p)
new_exp_avg_sq = torch.zeros_like(p)
new_max_exp_avg_sq = torch.zeros_like(p)
p.grad[0:prev_grad.size(0)] = prev_grad
optimizer.state[p]['step'] = old_step
optimizer.state[p]['exp_avg'] = new_exp_avg
optimizer.state[p]['exp_avg'][0:prev_grad.size(0)] = old_exp_avg
optimizer.state[p]['exp_avg_sq'] = new_exp_avg_sq
optimizer.state[p]['exp_avg_sq'][0:prev_grad.size(0)] = old_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'] = new_max_exp_avg_sq
optimizer.state[p]['max_exp_avg_sq'][0:prev_grad.size(0)] = old_max_exp_avg_sq
layer_idx += 1
optimizer.param_groups[0]['params'] = new_param_group
# Train 50000 steps to show by adding neurons the task can be solved
for i in range(50000):
Xs, Ys = generate_xor_XY(1)
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys, dtype=torch.float)
prediction = network(Xs)
loss = criterion(prediction, Ys)
print(network(torch.tensor([[1,0],[0,1],[1,1],[0,0]], dtype=torch.float)))
I’m trying to get the same optimizer state, but with additional parameters for the added neurons. This seems like a convoluted way of doing it (and it doesn’t work:p). Does anyone know of an (easier) way to do this or see where I’m going wrong?

DQN TensorFlow code runs out of memory very quickly

I am trying to train a turtle bot simulation using a DQN. Turtle bot is supposed to find a target in a maze. It is fairly simple and it is converging. My problem is that after a couple of runs the training will get extremely slow. It is fast at the beginning but it gets very slow after 50ish runs. I have check the problem, my CPU is not even used 50% but my memory is eaten up and about 98% of memory is occupied. Somewhere in my code I am leaking memory and I think t is in the initialization of my DQN agent. Can you please guide me on what is the problem and how can I fix it.
Thanks a lot.
Here is the training code which is based on DQN with priority buffer:
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import gym_gazebo
import tensorflow as tf
import numpy as np
import time
import random
from random import *
import cv2
from gym import wrappers
from skimage import transform
import datetime
import liveplot
from dqn_agent_withTarget import DQNAgent
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
def render():
render_skip = 0 #Skip first X episodes.
render_interval = 50 #Show render Every Y episodes.
render_episodes = 10 #Show Z episodes every rendering.
if (x%render_interval == 0) and (x != 0) and (x > render_skip):
elif ((x-render_episodes)%render_interval == 0) and (x != 0) and (x > render_skip) and (render_episodes < x):
if __name__ == '__main__':
env = gym.make('GazeboCircuit2TurtlebotLidar-v0')
outdir = '/tmp/gazebo_gym_experiments'
env = gym.wrappers.Monitor(env, outdir, force=True)
plotter = liveplot.LivePlot(outdir)
last_time_steps = np.ndarray(0)
start_time = time.time()
total_episodes = 1000
max_steps = 200
highest_reward = 0
gamma = 0.95
num_actions = 3
action_space = [0,1,2]
tf.reset_default_graph() # Reset training graph
myinit = tf.global_variables_initializer()# Initialize training network
agent = DQNAgent(action_space,"GazeboCircuit2TurtlebotLidar-v0")
agent.exploration = 1
cv2.namedWindow("window", 1)
x_val = np.random.rand(4096,256).astype(np.float32)
agent.W_fc1.load(x_val, session=agent.sess)
for e in range(total_episodes):
# reset
linecount = 0
terminal= False
win = 0
frame = 0
loss = 0.0
Q_max = 0.0
steps = 0
reward_t= 0.0
cumulated_rewards = 0
agent.exploration *= 0.9
if agent.exploration<0.1:
_, reward, terminal, info = env.step(0)
linecount += 1
print( "Time %s, %s" %(linecount,
img_tmp = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
linecount += 1
print( "Time %s, %s" %(linecount,
state_t_1 = tf.image.convert_image_dtype(img_tmp, dtype=tf.float32)
state_t_1 = tf.reshape(state_t_1,(-1,32,32,4))
while (not terminal):
steps += 1
state_t = state_t_1
# execute action in environment
action_t = agent.select_action(state_t, agent.exploration)
_, reward_t, terminal, info = env.step(action_t)
#print("step: ", steps, "action: ",action_t ,"reward: ", reward_t)
print(action_t , end="")
img_tmp = cv2.resize(info, (32, 32), interpolation=cv2.INTER_NEAREST)
state_t_1 = tf.image.convert_image_dtype(img_tmp, dtype=tf.float32)
state_t_1 = tf.reshape(state_t_1,(-1,32,32,4))
# store experience
agent.store_experience(state_t, action_t, reward_t, state_t_1, terminal)
# experience replay
# for log
frame += 1
loss += agent.current_loss
Q_max += np.max(agent.Q_values(state_t))
cumulated_rewards += reward_t
print(" ")
print("episodes:",e," steps:",steps," loss:",'{0:.2f}'.format(loss/(steps+1)), " terminal:",terminal, " exploration_factor:",agent.exploration , " reward:", '{0:.2f}'.format(cumulated_rewards))
#print("EPOCH: {:03d}/{:03d} | WIN: {:03d} | LOSS: {:.4f} | Q_MAX: {:.4f}".format(
# e, total_episodes - 1, cumulated_rewards, loss / frame, Q_max / frame))
# save model
weights_tmp = cv2.resize(weights, (256,256), interpolation=cv2.INTER_NEAREST)
weights_image = tf.image.convert_image_dtype(weights_tmp, dtype=tf.float32)
# save model
And here is the DQN agent code: (I think the problem is in initilizer of DQN agent code)
from collections import deque
import os
import numpy as np
import tensorflow as tf
class DQNAgent:
Multi Layer Perceptron with Experience Replay
def __init__(self, enable_actions, environment_name):
# parameters = os.path.splitext(os.path.basename(__file__))[0]
self.environment_name = environment_name
self.enable_actions = enable_actions
self.n_actions = len(self.enable_actions)
self.minibatch_size = 64
self.replay_memory_size = 1000
self.learning_rate = 0.001
self.discount_factor = 0.9
self.exploration = 1.0
self.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
self.model_name = "{}.ckpt".format(self.environment_name)
# replay memory
self.D = deque(maxlen=self.replay_memory_size)
# model
# variables
self.current_loss = 0.0
def init_model(self):
# input layer (32 x 32 x 4)
self.x = tf.placeholder(tf.float32, [None, 32, 32,4])
# convolution layer
self.W_cv1 = tf.Variable(tf.truncated_normal([5, 5, 4, 4], stddev=0.01))#4filters
self.b_cv1 = tf.Variable(tf.zeros([4]))
self.c_cv1 = tf.nn.conv2d(self.x, self.W_cv1, strides=[1, 1, 1, 1], padding='SAME')
self.h_cv1 = tf.nn.relu(self.c_cv1 + self.b_cv1)
# flatten (4096)
self.x_flat = tf.reshape(self.h_cv1, [-1,4096])
# fully connected layer [1,256]
self.W_fc1 = tf.Variable(tf.truncated_normal([4096, 256], stddev=0.01))
self.b_fc1 = tf.Variable(tf.zeros([256]))
self.h_fc1 = tf.nn.relu(tf.matmul(self.x_flat, self.W_fc1) + self.b_fc1)
# fully connected layer [1,32]
self.W_fc2 = tf.Variable(tf.truncated_normal([256,32], stddev=0.01))
self.b_fc2 = tf.Variable(tf.zeros([32]))
self.h_fc2 = tf.nn.relu(tf.matmul(self.h_fc1, self.W_fc2) + self.b_fc2)
# output layer (n_actions)
self.W_out = tf.Variable(tf.truncated_normal([32, self.n_actions], stddev=0.01))
self.b_out = tf.Variable(tf.zeros([self.n_actions]))
self.y = tf.matmul(self.h_fc2, self.W_out) + self.b_out
# loss function
self.y_ = tf.placeholder(tf.float32, [None, self.n_actions])
self.loss = tf.reduce_mean(tf.square(self.y_ - self.y))
# train operation
optimizer = tf.train.AdamOptimizer(self.learning_rate) #changed from RMS to Adam = optimizer.minimize(self.loss)
# input layer (32 x 32 x 4)
self.x_t = tf.placeholder(tf.float32, [None, 32, 32,4])
# convolution layer
self.W_cv1_t = tf.Variable(tf.truncated_normal([5, 5, 4, 4], stddev=0.01))#4filters
self.b_cv1_t = tf.Variable(tf.zeros([4]))
self.c_cv1_t = tf.nn.conv2d(self.x, self.W_cv1, strides=[1, 1, 1, 1], padding='SAME')
self.h_cv1_t = tf.nn.relu(self.c_cv1 + self.b_cv1)
# flatten (4096)
self.x_flat_t = tf.reshape(self.h_cv1, [-1,4096])
# fully connected layer [1,256]
self.W_fc1_t = tf.Variable(tf.truncated_normal([4096, 256], stddev=0.01))
self.b_fc1_t = tf.Variable(tf.zeros([256]))
self.h_fc1_t = tf.nn.relu(tf.matmul(self.x_flat, self.W_fc1) + self.b_fc1)
# fully connected layer [1,32]
self.W_fc2_t = tf.Variable(tf.truncated_normal([256,32], stddev=0.01))
self.b_fc2_t = tf.Variable(tf.zeros([32]))
self.h_fc2_t = tf.nn.relu(tf.matmul(self.h_fc1, self.W_fc2) + self.b_fc2)
# output layer (n_actions)
self.W_out_t = tf.Variable(tf.truncated_normal([32, self.n_actions], stddev=0.01))
self.b_out_t = tf.Variable(tf.zeros([self.n_actions]))
self.y_t = tf.matmul(self.h_fc2, self.W_out) + self.b_out
# loss function
self.y__t = tf.placeholder(tf.float32, [None, self.n_actions])
self.loss_t = tf.reduce_mean(tf.square(self.y_ - self.y))
# train operation
optimizer_t = tf.train.AdamOptimizer(self.learning_rate) #changed from RMS to Adam
self.training_t = optimizer.minimize(self.loss)
# saver
self.saver = tf.train.Saver()
# session
self.sess = tf.Session()
def Q_values(self, state):
# Q(state, action) of all actions
#print("QQQ VALUES______________________________________________",
x_tmp =
return, feed_dict={self.x: x_tmp})#[0]
def select_action(self, state, epsilon):
if np.random.rand() <= epsilon:
# random
return np.random.choice(self.enable_actions)
# max_action Q(state, action)
#print("G" , end="")
return self.enable_actions[np.argmax(self.Q_values(state))]
def store_experience(self, state, action, reward, state_1, terminal):
self.D.append((state, action, reward, state_1, terminal))
def experience_replay(self):
state_minibatch = []
y_minibatch = []
# sample random minibatch
minibatch_size = min(len(self.D), self.minibatch_size)
minibatch_indexes = np.random.randint(0, len(self.D), minibatch_size)
for j in minibatch_indexes:
state_j, action_j, reward_j, state_j_1, terminal = self.D[j]
action_j_index = self.enable_actions.index(action_j)
y_j = self.Q_values(state_j)[0]
if terminal:
y_j[action_j_index] = reward_j
# reward_j + gamma * max_action' Q(state', action')
y_j[action_j_index] = reward_j + self.discount_factor * np.max(self.Q_values(state_j_1)) # NOQA
x_tmp =
# training, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})
# for log
self.current_loss =, feed_dict={self.x: state_minibatch, self.y_: y_minibatch})
def load_model(self, model_path=None):
if model_path:
# load from model_path
self.saver.restore(self.sess, model_path)
# load from checkpoint
checkpoint = tf.train.get_checkpoint_state(self.model_dir)
if checkpoint and checkpoint.model_checkpoint_path:
self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
def save_model(self):, os.path.join(self.model_dir, self.model_name))
Thanks for you help.

LSTM setting/resetting the state when using a variable batch size

I have built this LSTM class:
import tensorflow as tf
import Constants
class LSTM():
def __init__(self,
self.inputs = tf.placeholder(tf.float32, [None] + inputShape)
self.labels = tf.placeholder(tf.float32, [None] + outputShape)
self.inputTensors = tf.unstack(self.inputs, axis=1)
self.weights = tf.Variable(tf.random_normal([numHidden] + outputShape))
self.bias = tf.Variable(tf.random_normal(outputShape))
layers = [tf.contrib.rnn.LSTMCell(numHidden, forget_bias=forgetBias, state_is_tuple=True)] * numLayers
self.cell = tf.contrib.rnn.MultiRNNCell(layers, state_is_tuple=True)
self.optimiser = tf.train.GradientDescentOptimizer(learningRate)
self.forgetBias = forgetBias
self.batchDict = None
self.outputs = None
self.finalStates = None
self.predictions = None
self.loss = None
self.accuracy = None
self.optimise = None
self.session = tf.Session()
def __buildGraph(self):
outputs, finalStates = tf.nn.static_rnn(self.cell, self.inputTensors, dtype=tf.float32)
predictions = tf.add(tf.matmul(outputs[-1], self.weights), self.bias)
self.predictions = tf.minimum(tf.maximum(predictions, 0), 1)
self.loss = tf.losses.mean_squared_error(predictions=self.predictions, labels=self.labels)
self.accuracy = tf.reduce_mean(1 - tf.abs(self.labels - self.predictions) / 1.0)
self.optimise = self.optimiser.minimize(self.loss)
def __execute(self, operation):
return, self.batchDict)
def setBatch(self, inputs, labels):
self.batchDict = {self.inputs: inputs, self.labels: labels}
def batchLabels(self):
return self.__execute(self.labels)
def batchPredictions(self):
return self.__execute(self.predictions)
def batchLoss(self):
return self.__execute(self.loss)
def batchAccuracy(self):
return self.__execute(self.accuracy)
def processBatch(self):
def kill(self):
and I run it like so:
import DataWorker
import Constants
from Model import LSTM
inputShape = [Constants.sequenceLength, DataWorker.numFeatures]
outputShape = [1]
LSTM = LSTM(inputShape, outputShape)
# #############################################
# #############################################
for epoch in range(Constants.numEpochs):
print("***** EPOCH:", epoch + 1, "*****\n")
IDPointer, TSPointer = 0, 0
epochComplete = False
batchNum = 0
while not epochComplete:
batchNum += 1
batchX, batchY, IDPointer, TSPointer, epochComplete = DataWorker.generateBatch(IDPointer, TSPointer)
LSTM.setBatch(batchX, batchY)
if batchNum % Constants.printStep == 0 or epochComplete:
print("Batch:\t\t", batchNum)
print("Last Pred:\t", LSTM.batchPredictions()[-1][0])
print("Last Label:\t", LSTM.batchLabels()[-1][0])
print("Loss:\t\t", LSTM.batchLoss())
print("Accuracy:\t", str("%.2f" % (LSTM.batchAccuracy() * 100) + "%\n"))
# #############################################
# #############################################
testX, testY = DataWorker.generateTestBatch()
LSTM.setBatchDict(testX, testY)
testAccuracy = LSTM.batchAccuracy()
print("Testing Accuracy:", str("%.2f" % (testAccuracy * 100) + "%"))
This all works well as it should. However, I am using time series data which consists of financial stocks spanning over ranges of timestamps far greater than the number of time steps that my LSTM is unrolled for - Constants.sequenceLength. Because of this, it takes many sequential batches for a single stock t be processed, and so the state/memory of my LSTM needs to be passed between batches. As well as this, after a batch that completes the lifespan of an ID, the next batch would be passing in a new ID from the initial timestamp of my dataset, and so I would want to reset the memory.
There are many questions asking something similar, and all of the answers are adequate, however, none seem to address the issue of using variable batch sizes - batch sizes initialised to None and then inferred when a batch is passed in. My batches are usually a constant size, but do change under certain circumstances and I cannot change this. How can I have control over passing the state between batches, as well as resetting the state, if I have not specified the batch size?

