VAE with a discriminator compiling problem - python

Unlike typical generative models, the input to this VAE is an RGB image. If I compile self.combined using the add_loss method, the loss swings from around 15000 down to -22000. Compiling with mse works fine.
def __init__(self, type='landmark'):
    self.latent_dim = 128
    self.input_shape = (128, 128, 3)
    self.batch_size = 1
    self.original_dim = self.latent_dim * self.latent_dim
    patch = int(self.input_shape[0] / 2**4)
    self.disc_patch = (patch, patch, 1)
    optimizer = tf.keras.optimizers.Adam(0.0002, 0.5)

    pd = patch_discriminator(type)
    self.discriminator = pd.discriminator()
    self.discriminator.compile(loss='binary_crossentropy', optimizer=optimizer)
    self.discriminator.trainable = False

    vae = VAE(self.latent_dim, type=type)
    encoder = vae.inference_net()
    decoder = vae.generative_net()

    if type == 'image':
        self.orig_out = tf.random.normal(shape=(self.batch_size, 128, 128, 3))
    else:
        self.orig_out = tf.random.normal(shape=(self.batch_size, 128, 128, 1))

    vae_input = tf.keras.layers.Input(shape=self.input_shape)
    self.encoder_out = encoder(vae_input)
    self.decoder_out = decoder(self.encoder_out[2])
    self.generator = tf.keras.Model(vae_input, self.decoder_out)

    vae_loss = self.compute_loss()
    self.generator.add_loss(vae_loss)
    self.generator.compile(optimizer=optimizer)

    valid = self.discriminator([self.decoder_out, self.decoder_out])
    self.combined = tf.keras.Model(vae_input, valid)
    self.combined.add_loss(vae_loss)
    self.combined.compile(optimizer=optimizer)
    # self.combined.compile(loss='binary_crossentropy', optimizer=optimizer)

    self.dl = DataLoader()
compute_loss computes the reconstruction and KL loss for the VAE. Initially self.orig_out is set to a random normal tensor; it is updated in the training loop below.
def compute_loss(self):
    bce = tf.keras.losses.BinaryCrossentropy()
    reconstruction_loss = bce(self.decoder_out, self.orig_out)
    reconstruction_loss = self.original_dim * reconstruction_loss
    z_mean = self.encoder_out[0]
    z_log_var = self.encoder_out[1]
    kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
    kl_loss = K.sum(kl_loss, axis=-1)
    kl_loss *= -0.5
    vae_loss = K.mean(reconstruction_loss + kl_loss)
    return vae_loss
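For reference, the quantity computed above is the standard VAE objective, averaged over the batch:

\[
\mathcal{L} \;=\; D \cdot \mathrm{BCE}(x, \hat{x}) \;-\; \frac{1}{2} \sum_{j} \bigl(1 + \log \sigma_j^{2} - \mu_j^{2} - \sigma_j^{2}\bigr)
\]

where D is original_dim, \(\mu\) is z_mean and \(\log\sigma^{2}\) is z_log_var.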
Training loop:
def train(self, batch_size=1, epochs=10):
    start_time = datetime.datetime.now()
    valid = np.ones((batch_size,) + self.disc_patch)
    fake = np.zeros((batch_size,) + self.disc_patch)
    threshold = epochs // 10
    for epoch in range(epochs):
        for batch_i, (imA, imB, n_batches) in enumerate(self.dl.load_batch(target='landmark', batch_size=batch_size)):
            self.orig_out = tf.convert_to_tensor(imB, dtype=tf.float32)
            fakeA = self.generator.predict(imA)
            d_real_loss = self.discriminator.train_on_batch([imB, imB], valid)
            d_fake_loss = self.discriminator.train_on_batch([imB, fakeA], fake)
            d_loss = 0.5 * np.add(d_real_loss, d_fake_loss)
            combined_loss = self.combined.train_on_batch(imA)
            # combined_loss = self.combined.train_on_batch(imA, valid)
            elapsed_time = datetime.datetime.now() - start_time
            print(f"[Epoch {epoch}/{epochs}] [Batch {batch_i}/{n_batches}] [D loss: {d_loss}] [G loss: {combined_loss}] time: {elapsed_time}")
If I compile self.combined with the KL loss via the add_loss() method, I cannot pass target outputs to train_on_batch as shown above, so the generator doesn't learn and produces random outputs. How do I compile the VAE with the discriminator using the KL loss?

I don't know if this will be the right answer, but a VAE is easier to model with a custom TensorFlow training loop, since it involves custom loss terms.
You can follow this link, which may contain some relevant information for your problem.
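To make that concrete, here is a minimal sketch of a custom generator step with tf.GradientTape, so the VAE loss (reconstruction + KL) and the adversarial term can simply be added before taking gradients. It assumes the encoder/decoder/discriminator interfaces from the question; the equal weighting of the two terms and the use of imA, imB and valid mirror the training loop above, but this is an illustration, not a drop-in replacement.

import tensorflow as tf

bce = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(0.0002, 0.5)

@tf.function
def generator_step(encoder, decoder, discriminator, imA, imB, valid):
    with tf.GradientTape() as tape:
        z_mean, z_log_var, z = encoder(imA, training=True)
        recon = decoder(z, training=True)

        # VAE part: scaled reconstruction + KL divergence, as in compute_loss
        recon_loss = bce(imB, recon) * 128.0 * 128.0
        kl_loss = -0.5 * tf.reduce_sum(
            1.0 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
        vae_loss = recon_loss + tf.reduce_mean(kl_loss)

        # Adversarial part: push the (frozen) discriminator towards "valid"
        d_out = discriminator([imB, recon], training=False)
        adv_loss = bce(valid, d_out)

        total_loss = vae_loss + adv_loss

    trainable = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(total_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))
    return total_loss

In the training loop this would replace self.combined.train_on_batch(imA), called after the two discriminator updates, so no targets have to be routed through compile() at all.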

Related

How is the loss backpropagated to provide the effective weights for the next batch iteration?

I tried to fuse multiple losses, but they do not affect the weights for the next iteration. I would like to fuse the loss functions because I get multiple outputs from the generator model. How can I solve this?
def train(self, epochs, batch_size=1, sample_interval=50):
    LAMBDA = 0.1
    start_time = datetime.datetime.now()
    for epoch in range(epochs):
        if epoch % 500 == 0:
            optimizer = Adam(0.0001, 0.5)
            self.combined.compile(loss=['mae'], loss_weights=[1, 100], optimizer=optimizer)
        for batch_i, (imgs_A, imgs_B) in enumerate(self.data_loader.load_batch(batch_size)):
            # os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
            fake_A1, fake_A2, fake_A3 = self.generator.predict(imgs_B)
            g1_loss = self.combined.train_on_batch([imgs_A, imgs_B], [fake_A1])
            fake_A2 = tf.image.resize(fake_A2, [256, 256])
            fake_A3 = tf.image.resize(fake_A3, [256, 256])
            g2_loss = self.combined.train_on_batch([imgs_A, imgs_B], [fake_A2])
            g3_loss = self.combined.train_on_batch([imgs_A, imgs_B], [fake_A3])
            g1_loss = tf.cast(g1_loss, tf.double)
            g2_loss = tf.cast(g2_loss, tf.double)
            g3_loss = tf.cast(g3_loss, tf.double)
            g_loss = g1_loss + g2_loss + g3_loss

            imgs_A = np.fft.fft2(imgs_A)
            fake_A1 = np.fft.fft2(fake_A1)
            fake_A2 = np.fft.fft2(fake_A2)
            fake_A3 = np.fft.fft2(fake_A3)
            mse = tf.keras.losses.MeanSquaredError()
            l1_loss = mse(imgs_A, fake_A1).numpy()
            l2_loss = mse(imgs_A, fake_A2).numpy()
            l3_loss = mse(imgs_A, fake_A3).numpy()
            l1_loss = tf.cast(l1_loss, tf.double)
            l2_loss = tf.cast(l2_loss, tf.double)
            l3_loss = tf.cast(l3_loss, tf.double)
            l_loss = l1_loss + l2_loss + l3_loss

            # Normalized loss
            g_loss = g_loss + (LAMBDA * l_loss)

            elapsed_time = datetime.datetime.now() - start_time
            print("[Epoch %d/%d] [Batch %d/%d] [G loss: %f] time: %s" % (epoch, epochs, batch_i, self.data_loader.n_batches, g_loss, elapsed_time))
How do we backpropagate the loss after normalization, i.e. the value stored in g_loss by the formula g_loss = g_loss + (LAMBDA * l_loss)? I want the same behaviour as the loss backpropagation in the PyTorch code at the following link: https://github.com/chosj95/MIMO-UNet/blob/main/train.py
There the loss is backpropagated with the following statements:
loss.backward()
optimizer.step()
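One thing worth noting: train_on_batch already applies its own gradient update, and anything computed afterwards in NumPy (such as the np.fft.fft2 block) never reaches the weights. The rough TensorFlow equivalent of loss.backward(); optimizer.step() is to build the combined loss inside a tf.GradientTape and apply the gradients yourself. The sketch below assumes a generator model with three outputs as in the question; the frequency-domain term and the names are illustrative, not a drop-in replacement.

import tensorflow as tf

LAMBDA = 0.1
mae = tf.keras.losses.MeanAbsoluteError()
optimizer = tf.keras.optimizers.Adam(0.0001, 0.5)

@tf.function
def combined_step(generator, imgs_A, imgs_B):
    with tf.GradientTape() as tape:
        fake_A1, fake_A2, fake_A3 = generator(imgs_B, training=True)
        fake_A2 = tf.image.resize(fake_A2, [256, 256])
        fake_A3 = tf.image.resize(fake_A3, [256, 256])

        # Pixel-domain terms for the three generator outputs
        g_loss = (mae(imgs_A, fake_A1)
                  + mae(imgs_A, fake_A2)
                  + mae(imgs_A, fake_A3))

        # Frequency-domain term, computed inside the tape so it contributes
        # gradients (channels moved first so the FFT runs over the image plane)
        fft_A = tf.signal.fft2d(tf.cast(tf.transpose(imgs_A, [0, 3, 1, 2]), tf.complex64))
        fft_F = tf.signal.fft2d(tf.cast(tf.transpose(fake_A1, [0, 3, 1, 2]), tf.complex64))
        l_loss = tf.reduce_mean(tf.abs(fft_A - fft_F) ** 2)

        total_loss = g_loss + LAMBDA * l_loss

    grads = tape.gradient(total_loss, generator.trainable_variables)      # ~ loss.backward()
    optimizer.apply_gradients(zip(grads, generator.trainable_variables))  # ~ optimizer.step()
    return total_loss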

Why doesn't my PyTorch model recognize the tensors I defined?

I just started learning PyTorch recently, and as practice I am trying to implement the model from a paper I read.
This is the PDF of the paper I am referring to:
https://dl.acm.org/doi/pdf/10.1145/3178876.3186066?download=true
Here is the code I wrote.
class Tem(torch.nn.Module):
    def __init__(self, embedding_size, hidden_size):
        super(Tem, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.leaf_size = 0
        self.xgb_model = None
        self.vec_embedding = None
        self.multi_hot_Q = None
        self.user_embedding = torch.nn.Linear(1, embedding_size)
        self.item_embedding = torch.nn.Linear(1, embedding_size)

    def pretrain(self, ui_attributes, labels):
        print("Start XGBoost Training...")
        self.xgb_model = XGBoost(ui_attributes, labels)
        self.leaf_size = self.xgb_model.leaf_size
        self.vec_embedding = Variable(torch.rand(self.embedding_size, self.leaf_size, requires_grad=True))
        self.h = Variable(torch.rand(self.hidden_size, 1, requires_grad=True))
        self.att_w = Variable(torch.rand(2 * self.embedding_size, self.hidden_size, requires_grad=True))
        self.att_b = Variable(torch.rand(self.leaf_size, self.hidden_size, requires_grad=True))
        self.r_1 = Variable(torch.rand(self.embedding_size, 1, requires_grad=True))
        self.r_2 = Variable(torch.rand(self.embedding_size, 1, requires_grad=True))
        self.bias = Variable(torch.rand(1, 1, requires_grad=True))

    def forward(self, ui_ids, ui_attributes):
        if self.xgb_model is None:
            raise Exception("Please run Tem.pretrain() to pre-train the XGBoost model first.")
        n_data = len(ui_ids)
        att_input = torch.FloatTensor(ui_attributes)
        self.multi_hot_Q = torch.FloatTensor(self.xgb_model.multi_hot(att_input)).permute(0, 2, 1)
        vq = self.vec_embedding * self.multi_hot_Q
        id_input = torch.FloatTensor(ui_ids)
        user_embedded = self.user_embedding(id_input[:, 0].reshape(n_data, 1))
        item_embedded = self.item_embedding(id_input[:, 1].reshape(n_data, 1))
        ui = (user_embedded * item_embedded).reshape(n_data, self.embedding_size, 1)
        ui_repeat = ui.repeat(1, 1, self.leaf_size)
        cross = torch.cat([ui_repeat, vq], dim=1).permute(0, 2, 1)
        re_cross = cross.reshape(cross.shape[0] * cross.shape[1], cross.shape[2])
        attention = torch.mm(re_cross, self.att_w)
        attention = F.leaky_relu(attention + self.att_b.repeat(n_data, 1))
        attention = torch.mm(attention, self.h).reshape(n_data, self.leaf_size)
        attention = F.softmax(attention).reshape(n_data, self.leaf_size, 1)
        attention = self.vec_embedding.permute(1, 0) * attention.repeat(1, 1, 20)
        pool = torch.max(attention, 1).values
        y_hat = self.bias.repeat(n_data, 1) + torch.mm(ui.reshape(n_data, self.embedding_size), self.r_1) + torch.mm(pool, self.r_2)
        y_hat = F.softmax(torch.nn.Linear(1, 2)(y_hat))
        return y_hat
My question is: it seems torch doesn't know which tensors should have their gradients computed during backpropagation.
print(tem)
Tem(
(user_embedding): Linear(in_features=1, out_features=20, bias=True)
(item_embedding): Linear(in_features=1, out_features=20, bias=True)
)
I googled this problem. Someone says those tensors should be wrapped in torch.autograd.Variable(), but that didn't solve my problem. Someone else says autograd directly supports tensors now, so torch.autograd.Variable() is no longer necessary.
loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(tem.parameters(), lr=0.02)
for t in range(20):
    prediction = tem(ids_train, att_train)
    loss = loss_func(prediction, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if t % 5 == 0:
        print("loss: ", loss)

loss: tensor(0.8133, grad_fn=<NllLossBackward>)
loss: tensor(0.8133, grad_fn=<NllLossBackward>)
loss: tensor(0.8133, grad_fn=<NllLossBackward>)
loss: tensor(0.8133, grad_fn=<NllLossBackward>)
Your problem is not related to Variable. As you said, it's not necessary anymore. To compute gradients for a tensor declared in a model (one that extends nn.Module), you need to register it among the model's parameters with nn.Parameter(). For example, to include self.h, you can do:
self.h = nn.Parameter(torch.zeros(10, 10))
Now, when you call loss.backward(), it will collect the gradient for this variable (of course, the loss must depend on self.h).
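Applied to the pretrain method from the question, that means registering each tensor as a parameter, roughly like this (a sketch that mirrors the question's shapes; XGBoost is the asker's own wrapper class):

import torch
import torch.nn as nn

def pretrain(self, ui_attributes, labels):
    print("Start XGBoost Training...")
    self.xgb_model = XGBoost(ui_attributes, labels)
    self.leaf_size = self.xgb_model.leaf_size
    # nn.Parameter registers each tensor in self.parameters(), so the
    # optimizer can update it and loss.backward() collects its gradient.
    self.vec_embedding = nn.Parameter(torch.rand(self.embedding_size, self.leaf_size))
    self.h = nn.Parameter(torch.rand(self.hidden_size, 1))
    self.att_w = nn.Parameter(torch.rand(2 * self.embedding_size, self.hidden_size))
    self.att_b = nn.Parameter(torch.rand(self.leaf_size, self.hidden_size))
    self.r_1 = nn.Parameter(torch.rand(self.embedding_size, 1))
    self.r_2 = nn.Parameter(torch.rand(self.embedding_size, 1))
    self.bias = nn.Parameter(torch.rand(1, 1))

Note that the optimizer has to be created (or re-created) after pretrain() has run, otherwise tem.parameters() will not yet contain these parameters.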

Ran out of RAM while training an LSTM

I am kind of a beginner with RNNs, so I coded an LSTM architecture in PyTorch, but I always run out of RAM during the 3rd epoch. I am already using a DataLoader and I tried to detach the gradient from the input tensor, but that doesn't solve the problem.
This is my training loop:
writer = SummaryWriter()
criterion = nn.CrossEntropyLoss(reduction='mean', ignore_index=0)
optimizer = optim.Adam(lstm.parameters(), lr=1e-5)
gradient_clip = clip_grad_norm_(lstm.parameters(), max_norm=5)
num_epochs = 20
epoch_loss = -1.0
loss = -1

t = trange(num_epochs, desc="Epoch loss", leave=True)
for epoch in t:
    trainLoader = iter(DataLoader(dataset, batch_size=batch_size))
    tt = trange(len(trainLoader) - 1, desc="Batch loss", leave=True)
    for i in tt:
        text, embedding = next(trainLoader)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        y = lstm.forward(embedding.transpose(1, 0))
        labels = text.transpose(0, 1)[1:].transpose(0, 1).flatten()
        loss = criterion(y.reshape(-1, y.shape[-1]), labels)
        tt.set_description("Batch loss : %.4f" % loss)
        tt.refresh()
        loss.backward(retain_graph=True)
        optimizer.step()
        epoch_loss += loss
    epoch_loss = epoch_loss / (len(trainLoader) - 1)

    # Saving model
    save_date = datetime.now().strftime("%d%m%Y-%H:%M:%S")
    PATH = './save/lstm_model_' + save_date
    torch.save(lstm, PATH)

    # Updating progression bar
    t.set_description("Epoch loss : %.4f" % epoch_loss)
    t.refresh()

    # Plotting gradients histograms in Tensorboard
    writer.add_scalar('Text_generation_Loss/train', epoch_loss, epoch)
    for tag, parm in lstm.named_parameters():
        with torch.no_grad():
            writer.add_histogram(tag, parm.grad.data.cpu().numpy(), epoch)
    writer.flush()

print('Finished Training')
writer.close()
And this is the LSTM class that I built:
class LSTM(nn.Module):
    def __init__(self, in_size: int, hidden_size: int):
        super().__init__()
        self.in_size = in_size
        self.hidden_size = hidden_size
        self.W_fi = nn.Linear(in_size, hidden_size)
        self.W_fh = nn.Linear(hidden_size, hidden_size, bias=False)
        self.W_ii = nn.Linear(in_size, hidden_size)
        self.W_ih = nn.Linear(hidden_size, hidden_size, bias=False)
        self.W_Ci = nn.Linear(in_size, hidden_size)
        self.W_Ch = nn.Linear(hidden_size, hidden_size, bias=False)
        self.W_oi = nn.Linear(in_size, hidden_size)
        self.W_oh = nn.Linear(hidden_size, hidden_size, bias=False)
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()

    def one_step(self, x, h, C):
        f_t = self.sigmoid(self.W_fi(x) + self.W_fh(h))
        i_t = self.sigmoid(self.W_ii(x) + self.W_ih(h))
        g_t = self.tanh(self.W_Ci(x) + self.W_Ch(h))
        C_t = torch.mul(f_t, C) + torch.mul(i_t, g_t)
        o_t = self.sigmoid(self.W_oi(x) + self.W_oh(h))
        h_t = torch.mul(o_t, self.tanh(C_t))
        return h_t, C_t

    def forward(self, X):
        h_out = []
        h = -torch.ones(X.shape[1], self.hidden_size)
        C = -torch.ones(X.shape[1], self.hidden_size)
        h_t, C_t = self.one_step(X[0], h, C)
        h_out.append(h_t)
        for i in range(1, X.shape[0] - 1):
            h_t, C_t = self.one_step(X[i], h_t, C_t)
            h_out.append(h_t)
        h_out = torch.cat(h_out)
        return h_out  # h_out.reshape(-1, batch_size, num_embeddings)
I already searched for a similar case but wasn't able to find a solution.
I don't know if it may help somebody, but I solved the problem. I perhaps wasn't clear about the task: the goal was text generation. The first thing I was doing was embedding the sentences with a torch.nn.Embedding defined outside my LSTM. The solution was to include it as a layer of my network, since the embedding is not pretrained and should be learned too.
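Folding the embedding into the network might look roughly like this (a sketch; vocab_size and embedding_dim are illustrative names, and LSTM is the class from the question):

import torch
import torch.nn as nn

class LSTMWithEmbedding(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int, hidden_size: int):
        super().__init__()
        # The embedding is now a registered submodule, so its weights appear
        # in parameters() and are trained together with the LSTM.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = LSTM(embedding_dim, hidden_size)  # the LSTM class above

    def forward(self, token_ids):
        # token_ids: (seq_len, batch) -> embedded: (seq_len, batch, embedding_dim)
        embedded = self.embedding(token_ids)
        return self.lstm(embedded)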

Is it possible to use objects with Google's Jax machine learning library

I am trying to write a DCGAN network using Google's JAX machine learning library. To do this, I created classes to serve as the discriminator and generator; however, as I was testing the discriminator, I got the error:
TypeError: Argument '<__main__.Discriminator object at 0x7fdfa5c6ffd0>' of type <class '__main__.Discriminator'> is not a valid JAX type
I looked through the examples on the JAX GitHub page and, from what I saw, none of them use objects, which leads me to hypothesize that it is probably just not possible to use objects with JAX. But if this is the case, I don't really understand why the use of objects wouldn't be possible. Would this be something that will be implemented in the future? Am I just naively overlooking something?
Here is my Discriminator object:
class Discriminator():
    def __init__(self):
        self.step_size = 0.0001
        self.image_shape = (256, 256, 3)
        self.params = []
        num_layers = 6
        num_filters = 64
        filter_size = 4
        self.params.append(create_conv_layer(3,
                                              num_filters,
                                              filter_size,
                                              filter_size,
                                              random.PRNGKey(0)))
        for l in range(1, num_layers):
            self.params.append(create_conv_layer(64 * 2**(l - 1),
                                                 64 * 2**l,
                                                 filter_size,
                                                 filter_size,
                                                 random.PRNGKey(0)))
        self.params.append(create_conv_layer(64 * 2**num_filters,
                                             1,
                                             filter_size,
                                             filter_size,
                                             random.PRNGKey(0)))

    def predict(self, params, image):
        activations = image
        for w, b in params[:-1]:
            outputs = conv_forward(activations, w, b, stride=2)
            outputs = batch_normalization(outputs)
            activations = leaky_relu(outputs)
        final_w, final_b = params[-1]
        return sigmoid(conv_forward(activations, final_w, final_b))

    def batched_predict(self, images):
        shape = [None] + list(self.image_shape)
        return vmap(self.predict, in_axes=shape)(self.params, images)

    def loss(self, params, images, targets):
        preds = self.batched_predict(params, images)
        return -np.sum(preds * targets)

    def accuracy(self, images, targets):
        predicted_class = np.round(np.ravel(self.batched_predict(images)))
        return np.mean(predicted_class == targets)

    @jit
    def update(self, params, x, y):
        grads = grad(self.loss)(params, x, y)
        return [(w - self.step_size * dw, b - self.step_size * db)
                for (w, b), (dw, db) in zip(params, grads)]
And I update the parameters here:
num_epochs = 5
batch_size = 64
steps_per_epoch = train_images.shape[0] // batch_size
discrim = Discriminator()
params = discrim.params
print("lets-a-go!")
for epoch in range(num_epochs):
start_time = time.time()
for step in range(steps_per_epoch):
x, y = simple_data_generator(batch_size)
params = discrim.update(params, x, y)
epoch_time = time.time() - start_time
train_acc = discrim.accuracy(train_images, train_labels)
test_acc = discrim.accuracy(test_images, test_labels)
print("Epoch {} in {:0.2f} sec".format(epoch, epoch_time))
print("Training set accuracy {}".format(train_acc))
print("Test set accuracy {}".format(test_acc))

Slightly edited tensorflow official example doesn't run properly

I'm working on word2vec via distributed TensorFlow. For compatibility reasons, I just slightly edited the official word2vec example into a Model-style coding architecture.
Code snippet as follows:
def build(self):
    self.global_step = tf.train.get_or_create_global_step()
    with tf.variable_scope("weights", partitioner=partitioner):
        self.embeddings = tf.get_variable(name="embeddings",
                                          shape=(self.vocab_size, self.embedding_size),
                                          initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0))
        self.nce_weights = tf.get_variable(name="nce_weights",
                                           shape=(self.vocab_size, self.embedding_size),
                                           initializer=tf.truncated_normal_initializer(stddev=1.0 / math.sqrt(self.embedding_size)))
        self.bias = tf.get_variable(name="bias", shape=(self.vocab_size),
                                    initializer=tf.zeros_initializer())

    self.embeded = tf.nn.embedding_lookup(self.embeddings, inputs, partition_strategy='div')
    print("labels: ", self.labels)

    self.loss = tf.reduce_mean(
        tf.nn.nce_loss(
            weights=self.nce_weights,
            biases=self.bias,
            labels=self.labels,
            inputs=self.embeded,
            num_sampled=self.num_sampled,
            num_classes=self.vocab_size,
            partition_strategy="div"
        )
    )
    self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)

    # evaluate
    normized = tf.sqrt(tf.reduce_sum(tf.square(self.embeddings), 1, keepdims=True))
    normallized_embeddings = self.embeddings / normized
    valid_data = np.r_[1:5]
    self.valid_size = len(valid_data)
    evaluate_examples = tf.constant(valid_data)
    valid_embeddings = tf.nn.embedding_lookup(normallized_embeddings, evaluate_examples)
    self.similarity = tf.matmul(valid_embeddings, normallized_embeddings, transpose_b=True)
The train method:
def train(self, session):
    loss, _, global_step, embs = session.run([self.loss, self.optimizer, self.global_step, self.embeddings])
    print(embs)
Training:
def main():
    model = Word2vec(args)
    model.build()  # call the method above to build the graph
    tf.global_variables_initializer()
    with tf.Session() as sess:
        while num_step < upperboud:
            model.train(sess)
I print out the evaluation results during training and they never change, but nce_weights are changing, and global_step and local_step are increasing. I am not sure where this goes wrong; can anyone help point it out? Thanks.
