My model's training speed degrades over time: every epoch takes longer to train than the previous one.
Here is the full source code together with my preprocessed Sentiment Treebank data (put glove.840B.300d.txt into data/glove).
Install some python packages:
pip install meowlogtool
pip install tqdm
Command to run:
python --emblr 0 --rel_dim 0 --tag_dim 0 --optim adagrad --name basic --lr 0.05 --wd 1e-4 --at_hid_dim 0
Model source code for you to read
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable as Var
import utils
import Constants
from model import SentimentModule
from embedding_model import EmbeddingModel
class SimpleGRU(nn.Module):
    """Simplified GRU cell built from six linear layers.

    Shape notes kept from the original author:
        w[i] : (300, 1)
        h[i] : (150, 1)
        p[i] : (20, 1)
        r[i] : (20, 1)
        k[i] : (150, 1)
        x[i] : (20 + 150 + 300 + 20 = 490, 1) (490, 1)
        Uz, Ur, Uh : (150, 150) => 67500 => (450, 450)
        Wz, Wr, Wh : (150, 20 + 150 + 300 + 20) (150, 490)
    """

    def __init__(self, cuda, in_dim, hid_dim):
        """Create the gate layers.

        :param cuda: if truthy, move all layers to the GPU
        :param in_dim: size of the input vector x
        :param hid_dim: size of the hidden state h
        """
        super(SimpleGRU, self).__init__()
        self.cudaFlag = cuda
        self.Uz = nn.Linear(hid_dim, hid_dim)
        self.Ur = nn.Linear(hid_dim, hid_dim)
        self.Uh = nn.Linear(hid_dim, hid_dim)
        self.Wz = nn.Linear(in_dim, hid_dim)
        self.Wr = nn.Linear(in_dim, hid_dim)
        self.Wh = nn.Linear(in_dim, hid_dim)
        if self.cudaFlag:
            # Move every registered layer at once. Re-assigning layer by layer
            # previously set Ur and Uh to Uz.cuda(), so the three recurrent
            # gates silently shared a single set of weights on the GPU.
            self.cuda()

    def forward(self, x, h_prev):
        """Apply one cell step.

        Simple-GRU(compress_x[v], h[t-1]) :
            z[t] := s(Wz * compress_x[t] + Uz * h[t-1] + bz)
            r[t] := s(Wr * compress_x[t] + Ur * h[t-1] + br)
            h_temp[t] := g(Wh * compress_x[t] + Uh * h[t-1] + bh)
            h[t] := r[t] .* h[t-1] + (1 - z[t]) .* h_temp[t]

        :param x: compress_x[t]
        :param h_prev: h[t-1]
        :return: h[t]
        """
        z = torch.sigmoid(self.Wz(x) + self.Uz(h_prev))
        r = torch.sigmoid(self.Wr(x) + self.Ur(h_prev))
        h_temp = torch.tanh(self.Wh(x) + self.Uh(h_prev))
        return r * h_prev + (1 - z) * h_temp
class TreeSimpleGRU(nn.Module):
    """Tree-structured GRU that computes a state for every node of a tree.

    Leaves are encoded with ``gru_cell`` starting from a shared initial state
    ``leaf_h``; inner nodes fold their children one by one through ``gru_at``.
    When an output module is attached, a per-node loss is accumulated during
    training.
    """

    def __init__(self, cuda, word_dim, tag_dim, rel_dim, mem_dim, at_hid_dim, criterion, leaf_h=None):
        """Build the leaf and inner-node cells.

        :param cuda: if truthy, tensors created here are moved to the GPU
        :param word_dim: word embedding size
        :param tag_dim: tag embedding size (0 disables tag input)
        :param rel_dim: relation embedding size (0 disables relation input)
        :param mem_dim: size of the node state
        :param at_hid_dim: hidden size of the attention scorer (0 disables it)
        :param criterion: loss applied to (output, target) for labelled nodes
        :param leaf_h: optional initial hidden state for leaves, (1, mem_dim)
        """
        super(TreeSimpleGRU, self).__init__()
        self.cudaFlag = cuda
        self.gru_cell = SimpleGRU(self.cudaFlag, word_dim + tag_dim, mem_dim)
        self.gru_at = GRU_AT(self.cudaFlag, word_dim + tag_dim + rel_dim + mem_dim, at_hid_dim, mem_dim)
        self.mem_dim = mem_dim
        self.in_dim = word_dim
        self.tag_dim = tag_dim
        self.rel_dim = rel_dim
        self.leaf_h = leaf_h  # init h for leaf node
        # `is None`, not `== None`: comparing a tensor with == is element-wise.
        if self.leaf_h is None:
            self.leaf_h = Var(torch.rand(1, self.mem_dim))
            # Persist the random initial state so a run can be reproduced.
            torch.save(self.leaf_h, 'leaf_h.pth')
        if self.cudaFlag:
            self.leaf_h = self.leaf_h.cuda()
        self.criterion = criterion
        self.output_module = None

    def getParameters(self):
        """Return the tree parameters flattened into one tensor.

        Note that getParameters and parameters() are not equal here:
        getParameters does not include the parameters of the output module.

        :return: 1d tensor
        """
        params = []
        for m in [self.gru_cell, self.gru_at]:
            # we do not get param of output module
            params.extend(m.parameters())
        one_dim = [p.view(p.numel()) for p in params]
        return torch.cat(one_dim)

    def set_output_module(self, output_module):
        """Attach the module that maps a node state to an output."""
        self.output_module = output_module

    def forward(self, tree, w_emb, tag_emb, rel_emb, training=False):
        """Recursively compute ``tree.state`` (and ``tree.output``) bottom-up.

        :param tree: node with ``children``, ``num_children``, ``idx`` (1-based)
            and ``gold_label`` attributes
        :param w_emb: word embeddings indexed by ``idx - 1``
        :param tag_emb: tag embeddings indexed by ``idx - 1`` (used if tag_dim > 0)
        :param rel_emb: relation embeddings indexed by ``idx - 1``
        :param training: when true, add the criterion loss of labelled nodes
        :return: (state of ``tree``, accumulated loss of the subtree)
        """
        loss = Var(torch.zeros(1))  # init zero loss
        if self.cudaFlag:
            loss = loss.cuda()

        # Children first, so their states exist when this node is processed.
        for idx in range(tree.num_children):
            _, child_loss = self.forward(tree.children[idx], w_emb, tag_emb, rel_emb, training)
            loss = loss + child_loss

        tag = tag_emb[tree.idx - 1] if self.tag_dim > 0 else None
        if tree.num_children > 0:
            child_rels, child_k = self.get_child_state(tree, rel_emb)
            tree.state = self.node_forward(w_emb[tree.idx - 1], tag, child_rels, child_k)
        elif tree.num_children == 0:
            tree.state = self.leaf_forward(w_emb[tree.idx - 1], tag)

        if self.output_module is not None:
            output = self.output_module.forward(tree.state, training)
            tree.output = output
            if training and tree.gold_label is not None:
                target = Var(utils.map_label_to_target_sentiment(tree.gold_label))
                if self.cudaFlag:
                    target = target.cuda()
                loss = loss + self.criterion(output, target)
        return tree.state, loss

    def leaf_forward(self, word_emb, tag_emb):
        """Forward function for leaf node.

        :param word_emb: word embedding of current node u
        :param tag_emb: tag embedding of current node u
        :return: k of current node u
        """
        h = self.leaf_h
        if self.cudaFlag:
            h = h.cuda()
        if self.tag_dim > 0:
            x = torch.cat([word_emb, tag_emb], 1)
        else:
            x = word_emb
        return self.gru_cell(x, h)

    def node_forward(self, word_emb, tag_emb, child_rels, child_k):
        """Forward function for inner node.

        :param word_emb: word embedding of current node u
        :param tag_emb: tag embedding of current node u
        :param child_rels (tensor): rels embedding of child node v
        :param child_k (tensor): k of child node v
        :return: k of current node u
        """
        n_child = child_k.size(0)
        h = Var(torch.zeros(1, self.mem_dim))
        if self.cudaFlag:
            h = h.cuda()

        for i in range(n_child):
            # Input order (word, child state, relation, tag) sums to the
            # in_dim given to gru_at in __init__.
            x_list = [word_emb, child_k[i]]
            if self.rel_dim > 0:
                x_list.append(child_rels[i])
            if self.tag_dim > 0:
                x_list.append(tag_emb)
            x = torch.cat(x_list, 1)
            h = self.gru_at(x, h)
        return h

    def get_child_state(self, tree, rels_emb):
        """Get child rels, get child k.

        :param tree: tree we need to get child
        :param rels_emb (tensor): relation embeddings indexed by ``idx - 1``
        :return: (child_rels or None when rel_dim == 0, child_k)
        """
        assert tree.num_children > 0  # never called for leaves

        child_k = Var(torch.Tensor(tree.num_children, 1, self.mem_dim))
        if self.rel_dim > 0:
            child_rels = Var(torch.Tensor(tree.num_children, 1, self.rel_dim))
        else:
            child_rels = None
        if self.cudaFlag:
            child_k = child_k.cuda()
            if self.rel_dim > 0:
                child_rels = child_rels.cuda()

        for idx in range(tree.num_children):
            child_k[idx] = tree.children[idx].state
            if self.rel_dim > 0:
                child_rels[idx] = rels_emb[tree.children[idx].idx - 1]
        return child_rels, child_k
class AT(nn.Module):
    """Attention scorer.

    AT(compress_x[v]) := sigmoid(Wa * tanh(Wb * compress_x[v] + bb) + ba)
    """

    def __init__(self, cuda, in_dim, hid_dim):
        """Create the two linear layers.

        :param cuda: if truthy, move the layers to the GPU
        :param in_dim: size of the input vector
        :param hid_dim: size of the hidden projection
        """
        super(AT, self).__init__()
        self.cudaFlag = cuda
        self.in_dim = in_dim
        self.hid_dim = hid_dim
        self.Wa = nn.Linear(hid_dim, 1)
        self.Wb = nn.Linear(in_dim, hid_dim)
        if self.cudaFlag:
            self.cuda()

    def forward(self, x):
        """Return one score in (0, 1) per input row."""
        return torch.sigmoid(self.Wa(torch.tanh(self.Wb(x))))
class GRU_AT(nn.Module):
    """GRU cell whose update is optionally gated by an attention score."""

    def __init__(self, cuda, in_dim, at_hid_dim, mem_dim):
        """Create the cell and, when ``at_hid_dim > 0``, the attention scorer.

        :param cuda: if truthy, move the sub-modules to the GPU
        :param in_dim: size of the input vector x
        :param at_hid_dim: hidden size of the attention scorer (0 disables it)
        :param mem_dim: size of the hidden state
        """
        super(GRU_AT, self).__init__()
        self.cudaFlag = cuda
        self.in_dim = in_dim
        self.mem_dim = mem_dim
        self.at_hid_dim = at_hid_dim
        if at_hid_dim > 0:
            self.at = AT(cuda, in_dim, at_hid_dim)
        self.gru_cell = SimpleGRU(self.cudaFlag, in_dim, mem_dim)

        if self.cudaFlag:
            if at_hid_dim > 0:
                self.at = self.at.cuda()
            self.gru_cell = self.gru_cell.cuda()

    def forward(self, x, h_prev):
        """Apply one step.

        :param x: input vector
        :param h_prev: h[t-1]
        :return: a * m + (1 - a) * h[t-1], or m when attention is disabled
        """
        m = self.gru_cell(x, h_prev)
        if self.at_hid_dim > 0:
            a = self.at(x)
            # `a` has one score per row; broadcasting scales the whole state.
            h = a * m + (1 - a) * h_prev
        else:
            h = m
        return h
class TreeGRUSentiment(nn.Module):
    """Sentiment model: a TreeSimpleGRU encoder plus a SentimentModule."""

    def __init__(self, cuda, in_dim, tag_dim, rel_dim, mem_dim, at_hid_dim, num_classes, criterion):
        """Build the tree encoder and the output module.

        The output module is only constructed here; the visible code never
        passes it to ``tree_module.set_output_module``.
        """
        super(TreeGRUSentiment, self).__init__()
        self.cudaFlag = cuda
        self.tree_module = TreeSimpleGRU(cuda, in_dim, tag_dim, rel_dim, mem_dim, at_hid_dim, criterion)
        self.output_module = SentimentModule(cuda, mem_dim, num_classes, dropout=True)

    def get_tree_parameters(self):
        """Return the flattened parameters of the tree encoder."""
        return self.tree_module.getParameters()

    def forward(self, tree, sent_emb, tag_emb, rel_emb, training=False):
        """Run the tree encoder on already-embedded inputs.

        :return: (``tree.output`` as set by the tree module, accumulated loss)
        """
        _, loss = self.tree_module(tree, sent_emb, tag_emb, rel_emb, training)
        output = tree.output
        return output, loss
Why does neural network learning slow down as the error gets lower?
The reasons for the slowdown are not fully understood, but we have some basic ideas.
For classifiers, most training examples start out as incorrectly classified. Over time, more of them become correctly classified. Early in learning, you might have a nearly 100% error rate, so every example in the minibatch contributes to learning. Late in learning, you might have nearly a 0% error rate, so almost none of the examples in the minibatch contribute to learning. This problem can be resolved to some extent by using hard example mining or importance sampling. Both of these are just techniques for training on more difficult examples more often.
There are other more complicated reasons. One of them is that the condition number of the Hessian tends to worsen a lot as learning progresses, so that the optimal step size becomes smaller and smaller.
I have created a pytorch model and I want to reduce the model size.
Defining Model Architecture :-
import torch
import torch.quantization
import torch.nn as nn
import copy
import os
import time
import numpy as np
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn.utils.prune as prune
import torch.nn.functional as F
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import time
import codecs
import pickle
import torch
from torch.autograd import Variable
import torch.nn.utils.prune as prune
from config import Config
from loader import *
from utils import *
from model import BiLSTM_CRF
def init_embedding(input_embedding):
    """Initialize embedding.

    Fills the 2-D tensor in place with values drawn uniformly from
    [-sqrt(3 / dim), sqrt(3 / dim)], where dim is its second dimension.
    """
    bias = np.sqrt(3.0 / input_embedding.size(1))
    # uniform_ is the in-place initializer; nn.init.uniform is deprecated.
    nn.init.uniform_(input_embedding, -bias, bias)
def init_linear(input_linear):
    """Initialize linear transformation.

    The weight is drawn uniformly from [-b, b] with
    b = sqrt(6 / (fan_out + fan_in)); the bias, when present, is zeroed.
    """
    bias = np.sqrt(6.0 / (input_linear.weight.size(0) + input_linear.weight.size(1)))
    nn.init.uniform_(input_linear.weight, -bias, bias)
    if input_linear.bias is not None:
        input_linear.bias.data.zero_()
def init_lstm(input_lstm):
    """Initialize lstm.

    Every input-hidden and hidden-hidden weight matrix is drawn uniformly from
    [-b, b] with b = sqrt(6 / (rows / 4 + cols)). When the LSTM has biases,
    each bias vector is zeroed and its second quarter (indices
    hidden_size .. 2 * hidden_size) is set to 1. Reverse-direction parameters
    are handled the same way for bidirectional LSTMs.
    """
    # Forward direction first, then reverse, matching the parameter order
    # (and therefore the random-number consumption) of the original loops.
    directions = ['', '_reverse'] if input_lstm.bidirectional else ['']
    kinds = ('ih', 'hh')

    for suffix in directions:
        for ind in range(input_lstm.num_layers):
            for kind in kinds:
                # getattr instead of eval: same lookup without executing text.
                weight = getattr(input_lstm, 'weight_%s_l%d%s' % (kind, ind, suffix))
                bias = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
                nn.init.uniform_(weight, -bias, bias)

    if input_lstm.bias:
        hidden = input_lstm.hidden_size
        for suffix in directions:
            for ind in range(input_lstm.num_layers):
                for kind in kinds:
                    weight = getattr(input_lstm, 'bias_%s_l%d%s' % (kind, ind, suffix))
                    weight.data.zero_()
                    weight.data[hidden: 2 * hidden] = 1
def to_scalar(var):
    """Return the first element of ``var`` as a plain Python number."""
    return var.view(-1).detach().tolist()[0]


def argmax(vec):
    """Return the column index of the maximum of the first row of ``vec``.

    ``vec`` is expected to be 2-D; the maximum is taken along dimension 1.
    """
    _, idx = torch.max(vec, 1)
    return to_scalar(idx)


def log_sum_exp(vec):
    """Compute log(sum(exp(vec))) without overflowing.

    vec 2D: 1 * tagset_size. The row maximum is subtracted before
    exponentiating and added back afterwards.
    """
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
class BiLSTM_CRF(nn.Module):
    """Word-level BiLSTM tagger with character features and an optional CRF.

    Character features come from either a character BiLSTM
    (``char_mode='LSTM'``) or a character CNN (``char_mode='CNN'``) and are
    concatenated with word (and optional capitalization) embeddings.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, char_lstm_dim=25,
                 char_to_ix=None, pre_word_embeds=None, char_embedding_dim=25, use_gpu=False,
                 n_cap=None, cap_embedding_dim=None, use_crf=True, char_mode='CNN'):
        """Create embeddings, encoders, the tag projection and CRF transitions.

        :param vocab_size: number of word ids
        :param tag_to_ix: mapping tag -> index (must contain START_TAG/STOP_TAG)
        :param embedding_dim: word embedding size
        :param hidden_dim: hidden size of each direction of the word LSTM
        :param char_lstm_dim: char LSTM hidden size / char CNN output channels
        :param char_to_ix: mapping char -> index
        :param pre_word_embeds: optional pretrained word embedding matrix
        :param char_embedding_dim: character embedding size
        :param use_gpu: create helper tensors on the GPU
        :param n_cap: number of capitalization classes (optional)
        :param cap_embedding_dim: capitalization embedding size (optional)
        :param use_crf: use CRF scoring/decoding instead of per-token softmax
        :param char_mode: 'CNN' or 'LSTM'
        """
        super(BiLSTM_CRF, self).__init__()
        self.use_gpu = use_gpu
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.n_cap = n_cap
        self.cap_embedding_dim = cap_embedding_dim
        self.use_crf = use_crf
        self.tagset_size = len(tag_to_ix)
        self.out_channels = char_lstm_dim
        self.char_mode = char_mode

        print('char_mode: %s, out_channels: %d, hidden_dim: %d, ' % (char_mode, char_lstm_dim, hidden_dim))

        use_cap = bool(self.n_cap and self.cap_embedding_dim)
        if use_cap:
            self.cap_embeds = nn.Embedding(self.n_cap, self.cap_embedding_dim)

        if char_embedding_dim is not None:
            self.char_lstm_dim = char_lstm_dim
            self.char_embeds = nn.Embedding(len(char_to_ix), char_embedding_dim)
            if self.char_mode == 'LSTM':
                self.char_lstm = nn.LSTM(char_embedding_dim, char_lstm_dim, num_layers=1, bidirectional=True)
            if self.char_mode == 'CNN':
                self.char_cnn3 = nn.Conv2d(in_channels=1, out_channels=self.out_channels,
                                           kernel_size=(3, char_embedding_dim), padding=(2, 0))

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        if pre_word_embeds is not None:
            self.pre_word_embeds = True
            self.word_embeds.weight = nn.Parameter(torch.FloatTensor(pre_word_embeds))
        else:
            self.pre_word_embeds = False

        self.dropout = nn.Dropout(0.5)

        # Word LSTM input = word embedding + char features (+ cap embedding).
        cap_dim = cap_embedding_dim if use_cap else 0
        if self.char_mode == 'LSTM':
            self.lstm = nn.LSTM(embedding_dim + char_lstm_dim * 2 + cap_dim, hidden_dim, bidirectional=True)
        if self.char_mode == 'CNN':
            self.lstm = nn.LSTM(embedding_dim + self.out_channels + cap_dim, hidden_dim, bidirectional=True)

        self.hw_trans = nn.Linear(self.out_channels, self.out_channels)
        self.hw_gate = nn.Linear(self.out_channels, self.out_channels)
        self.h2_h1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.tanh = nn.Tanh()
        self.hidden2tag = nn.Linear(hidden_dim * 2, self.tagset_size)

        if self.use_crf:
            self.transitions = nn.Parameter(
                torch.zeros(self.tagset_size, self.tagset_size))
            # Forbid transitions into START and out of STOP.
            self.transitions.data[tag_to_ix[START_TAG], :] = -10000
            self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

    def _score_sentence(self, feats, tags):
        """Score the gold tag sequence.

        :param feats: 2D tensor, len(sentence) * tagset_size
        :param tags: ground truth, LongTensor of length len(sentence)
        :return: sum of transition and emission scores along ``tags``
        """
        r = torch.LongTensor(range(feats.size()[0]))
        if self.use_gpu:
            r = r.cuda()
            pad_start_tags = torch.cat([torch.cuda.LongTensor([self.tag_to_ix[START_TAG]]), tags])
            pad_stop_tags = torch.cat([tags, torch.cuda.LongTensor([self.tag_to_ix[STOP_TAG]])])
        else:
            pad_start_tags = torch.cat([torch.LongTensor([self.tag_to_ix[START_TAG]]), tags])
            pad_stop_tags = torch.cat([tags, torch.LongTensor([self.tag_to_ix[STOP_TAG]])])

        score = torch.sum(self.transitions[pad_stop_tags, pad_start_tags]) + torch.sum(feats[r, tags])
        return score

    def _get_lstm_features(self, sentence, chars2, caps, chars2_length, d):
        """Compute per-token tag scores.

        :param sentence: word ids of the sentence
        :param chars2: padded character ids, one row per word
        :param caps: capitalization ids (used when cap embeddings are enabled)
        :param chars2_length: word lengths for packing (LSTM char mode)
        :param d: mapping from packed row index back to word position
        :return: tensor of shape len(sentence) * tagset_size
        """
        if self.char_mode == 'LSTM':
            chars_embeds = self.char_embeds(chars2).transpose(0, 1)
            packed = torch.nn.utils.rnn.pack_padded_sequence(chars_embeds, chars2_length)
            lstm_out, _ = self.char_lstm(packed)
            outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(lstm_out)
            outputs = outputs.transpose(0, 1)
            chars_embeds_temp = Variable(torch.FloatTensor(torch.zeros((outputs.size(0), outputs.size(2)))))
            if self.use_gpu:
                chars_embeds_temp = chars_embeds_temp.cuda()
            for i, index in enumerate(output_lengths):
                # Last forward state joined with first backward state.
                chars_embeds_temp[i] = torch.cat((outputs[i, index - 1, :self.char_lstm_dim],
                                                  outputs[i, 0, self.char_lstm_dim:]))
            chars_embeds = chars_embeds_temp.clone()
            # Undo the length-sorted order required by packing.
            for i in range(chars_embeds.size(0)):
                chars_embeds[d[i]] = chars_embeds_temp[i]

        if self.char_mode == 'CNN':
            chars_embeds = self.char_embeds(chars2).unsqueeze(1)
            chars_cnn_out3 = self.char_cnn3(chars_embeds)
            chars_embeds = nn.functional.max_pool2d(
                chars_cnn_out3, kernel_size=(chars_cnn_out3.size(2), 1)
            ).view(chars_cnn_out3.size(0), self.out_channels)

        embeds = self.word_embeds(sentence)
        if self.n_cap and self.cap_embedding_dim:
            cap_embedding = self.cap_embeds(caps)
            embeds = torch.cat((embeds, chars_embeds, cap_embedding), 1)
        else:
            embeds = torch.cat((embeds, chars_embeds), 1)

        embeds = embeds.unsqueeze(1)
        embeds = self.dropout(embeds)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim * 2)
        lstm_out = self.dropout(lstm_out)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _forward_alg(self, feats):
        """Compute the log partition function Z(x) in the log domain.

        :param feats: len(sentence) * tagset_size
        """
        # initialize alpha with a Tensor with values all equal to -10000.
        init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.)
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.
        forward_var = autograd.Variable(init_alphas)
        if self.use_gpu:
            forward_var = forward_var.cuda()
        for feat in feats:
            emit_score = feat.view(-1, 1)
            tag_var = forward_var + self.transitions + emit_score
            max_tag_var, _ = torch.max(tag_var, dim=1)
            tag_var = tag_var - max_tag_var.view(-1, 1)
            forward_var = max_tag_var + torch.log(torch.sum(torch.exp(tag_var), dim=1)).view(1, -1)
        terminal_var = (forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]).view(1, -1)
        alpha = log_sum_exp(terminal_var)
        # Z(x)
        return alpha

    def viterbi_decode(self, feats):
        """Return (score, best tag path) for ``feats`` using Viterbi decoding."""
        backpointers = []
        # analogous to forward
        init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0
        forward_var = Variable(init_vvars)
        if self.use_gpu:
            forward_var = forward_var.cuda()
        for feat in feats:
            next_tag_var = forward_var.view(1, -1).expand(self.tagset_size, self.tagset_size) + self.transitions
            _, bptrs_t = torch.max(next_tag_var, dim=1)
            bptrs_t = bptrs_t.squeeze().data.cpu().numpy()
            next_tag_var = next_tag_var.data.cpu().numpy()
            viterbivars_t = next_tag_var[range(len(bptrs_t)), bptrs_t]
            viterbivars_t = Variable(torch.FloatTensor(viterbivars_t))
            if self.use_gpu:
                viterbivars_t = viterbivars_t.cuda()
            forward_var = viterbivars_t + feat
            backpointers.append(bptrs_t)

        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        # START and STOP can never be the last real tag.
        terminal_var.data[self.tag_to_ix[STOP_TAG]] = -10000.
        terminal_var.data[self.tag_to_ix[START_TAG]] = -10000.
        best_tag_id = argmax(terminal_var.unsqueeze(0))
        path_score = terminal_var[best_tag_id]

        # Follow the back-pointers from the last tag to the first.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags, chars2, caps, chars2_length, d):
        """Return the training loss for one sentence.

        With the CRF this is log Z(x) minus the gold path score; otherwise it
        is the token-level cross entropy.
        """
        # features is a 2D tensor, len(sentence) * self.tagset_size
        feats = self._get_lstm_features(sentence, chars2, caps, chars2_length, d)

        if self.use_crf:
            forward_score = self._forward_alg(feats)
            gold_score = self._score_sentence(feats, tags)
            return forward_score - gold_score
        else:
            tags = Variable(tags)
            scores = nn.functional.cross_entropy(feats, tags)
            return scores

    def forward(self, sentence, chars, caps, chars2_length, d):
        """Predict the tag sequence; returns (score, tag_seq)."""
        feats = self._get_lstm_features(sentence, chars, caps, chars2_length, d)
        # viterbi to get tag_seq
        if self.use_crf:
            score, tag_seq = self.viterbi_decode(feats)
        else:
            score, tag_seq = torch.max(feats, 1)
            tag_seq = list(tag_seq.cpu().data)
        return score, tag_seq
create Model Instance:-
model_fp32 = BiLSTM_CRF(vocab_size=len(word_to_id),
Apply Quantization
# Dynamic quantization only converts the listed layer types; nn.Embedding is
# not affected by this call.
model_int8 = torch.quantization.quantize_dynamic(
    model_fp32,  # the original model
    {nn.LSTM, nn.Linear},  # a set of layers to dynamically quantize
    dtype=torch.qint8)  # the target dtype for quantized weights
Checking Quantization Results:
def print_size_of_model(model, label=""):
    """Print and return the serialized size of ``model``'s state dict.

    The state dict is written to a scratch file ``temp.p`` in the current
    directory, measured, and the file is removed again.

    :param model: module whose ``state_dict()`` is measured
    :param label: name shown in the printed line
    :return: size in bytes
    """
    torch.save(model.state_dict(), "temp.p")
    try:
        size = os.path.getsize("temp.p")
    finally:
        # Never leave the scratch file behind, even if measuring fails.
        os.remove("temp.p")
    print("model: ",label,' \t','Size (KB):', size/1e3)
    return size
# compare the sizes
f = print_size_of_model(model_fp32, "model_fp32")
q = print_size_of_model(model_int8, "model_int8")
print("{0:.2f} times smaller".format(f/q))
model: model_fp32 Size (KB): 806494.996
model: model_int8 Size (KB): 804532.412
1.00 times smaller
is there any way to reduce the model size significantly??
Based on the results section of the question and a vocab_size of approximately 2 million, it seems reasonable to quantize the word_embeds attribute. Quantizing this module alone is expected to give a significant drop in the memory occupied by the weights. According to the documentation, there is no support for dynamic quantization (which is used for nn.Linear and nn.LSTM in the snippet above) of nn.Embedding (the type of word_embeds), but static quantization can handle it. The default qconfig used in some PyTorch examples does not seem to work on nn.Embedding, but there is a hint in an issue discussion on how to quantize nn.Embedding. After training:
from torch.quantization.qconfig import float_qparams_weight_only_qconfig
model_fp32.word_embeds.qconfig = float_qparams_weight_only_qconfig
torch.quantization.prepare(model_fp32, inplace=True)
torch.quantization.convert(model_fp32, inplace=True)
After that, word_embeds in model_fp32 will be quantized to torch.quint8.
I am not good with PyTorch, so I would appreciate some help converting this code to TensorFlow. I have tried going through some articles, but they were a bit intensive, so a little explanation would also be worthwhile so that the whole community can benefit from this.
import torch
import copy
class PESG(torch.optim.Optimizer):
    """Optimizer that updates model weights plus the auxiliary variables a, b
    and alpha, with a proximal term towards a periodically refreshed reference
    copy of the weights.

    Auxiliary buffers are created on each parameter's own device, so the
    optimizer works on CPU as well as on CUDA models.
    """

    def __init__(self, model, a=None, b=None, alpha=None, imratio=0.1, margin=1.0, lr=0.1, gamma=500, clip_value=1.0, weight_decay=1e-5, **kwargs):
        """Set up hyper-parameters and the reference/accumulator buffers.

        :param model: module whose parameters are optimized
        :param a: auxiliary tensor, optimized together with the model weights
        :param b: auxiliary tensor, optimized together with the model weights
        :param alpha: auxiliary tensor updated separately in ``step``
        :param imratio: stored as ``p`` in the parameter group
        :param margin: margin used in the alpha update
        :param lr: learning rate
        :param gamma: divisor of the proximal term (strength is 1 / gamma)
        :param clip_value: gradients are clamped to [-clip_value, clip_value]
        :param weight_decay: L2 coefficient applied to the weights
        """
        assert a is not None, 'Found no variable a!'
        assert b is not None, 'Found no variable b!'
        assert alpha is not None, 'Found no variable alpha!'

        self.p = imratio
        self.margin = margin
        self.model = model

        self.lr = lr
        self.gamma = gamma
        self.clip_value = clip_value
        self.weight_decay = weight_decay

        self.a = a
        self.b = b
        self.alpha = alpha

        tracked = list(self.model.parameters()) + [self.a, self.b]
        # Reference point of the proximal term: sampled on the CPU (as before),
        # then moved next to the parameter it shadows.
        self.model_ref = [
            torch.empty(var.shape).normal_(mean=0, std=0.01).to(var.device)
            for var in tracked
        ]
        # Running sum of the iterates since the last regularizer update.
        self.model_acc = [
            torch.zeros(var.shape, dtype=torch.float32, device=var.device, requires_grad=False)
            for var in tracked
        ]

        self.T = 0  # steps since the last update_regularizer call
        self.step_counts = 0  # total steps

        def get_parameters(params):
            for p in params:
                yield p

        self.params = get_parameters(list(model.parameters()) + [a, b])
        self.defaults = dict(lr=self.lr,
                             margin=margin,
                             gamma=gamma,
                             p=imratio,
                             a=self.a,
                             b=self.b,
                             alpha=self.alpha,
                             clip_value=clip_value,
                             weight_decay=weight_decay,
                             model_ref=self.model_ref,
                             model_acc=self.model_acc)

        super(PESG, self).__init__(self.params, self.defaults)

    def optim_steps(self):
        """Return the total number of ``step`` calls so far."""
        return self.step_counts

    def update_lr(self, lr):
        """Set the learning rate of the (single) parameter group."""
        self.param_groups[0]['lr'] = lr

    def step(self):
        """Performs a single optimization step."""
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            clip_value = group['clip_value']
            self.lr = group['lr']
            gamma = group['gamma']
            m = group['margin']
            model_ref = group['model_ref']
            model_acc = group['model_acc']
            a = group['a']
            b = group['b']
            alpha = group['alpha']

            # updates
            for i, p in enumerate(group['params']):
                if p.grad is None:
                    continue
                # Clipped gradient + proximal pull to the reference + L2 decay.
                p.data = p.data - group['lr']*(torch.clamp(p.grad.data, -clip_value, clip_value) + 1/gamma*(p.data - model_ref[i].data)) - group['lr']*weight_decay*p.data
                model_acc[i].data = model_acc[i].data + p.data

            # alpha uses the freshly updated a and b and is kept in [0, 999].
            alpha.data = alpha.data + group['lr']*(2*(m + b.data - a.data) - 2*alpha.data)
            alpha.data = torch.clamp(alpha.data, 0, 999)

        self.T += 1
        self.step_counts += 1

    def zero_grad(self):
        """Clear the gradients of the model and of a, b and alpha."""
        self.model.zero_grad()
        self.a.grad = None
        self.b.grad = None
        self.alpha.grad = None

    def update_regularizer(self, decay_factor=None):
        """Refresh the reference point with the average iterate.

        :param decay_factor: when given, the learning rate is divided by it
        """
        if decay_factor is not None:
            self.param_groups[0]['lr'] = self.param_groups[0]['lr']/decay_factor
            print ('Reducing learning rate to %.5f # T=%s!'%(self.param_groups[0]['lr'], self.T))

        print ('Updating regularizer # T=%s!'%(self.T))
        for i, param in enumerate(self.model_ref):
            self.model_ref[i].data = self.model_acc[i].data/self.T
        for i, param in enumerate(self.model_acc):
            self.model_acc[i].data = torch.zeros(param.shape, dtype=torch.float32, device=param.device, requires_grad=False)
        self.T = 0
I could use a second set of eyes on my neural network.
This is the mnist number recognition project.
I'm not sure where the issue is.
I previously implemented this AI successfully with TensorFlow.
I'm not looking to use an api as a solution.
I would appreciate any help anyone can give.
Here's the project on github, it's only an init file and then the neural_network.
# Hand-written feed-forward network that keeps weights, biases, pre-activation
# outputs and activations in plain Python lists, sized by `sizes`.
# NOTE(review): this listing has lost its indentation and several statements
# appear to be missing (loops and methods with no visible body). The comments
# below describe only what the visible lines do.
class NeuralNetwork(object):
# Store the layer sizes and start with empty parameter/activation lists.
def __init__(self, sizes):
self.activations = []
self.outputs = []
self.weights = []
self.biases = []
self.sizes = sizes
# For each layer after the first, draw one uniform(-5, 5) weight per
# (node, previous-layer node) pair into a flat `layer_weights` list.
# NOTE(review): `layer_weights` is never stored on self in the visible code.
def set_random_weights(self):
for layer_index, layer_size in enumerate(self.sizes[1:], start=1):
layer_weights = []
for size in range(layer_size):
for size in range(self.sizes[layer_index-1]):
layer_weights.append(random.uniform(-5.0, 5.0))
# Append one uniform(-5, 5) bias per entry of sizes[0:-1], i.e. one bias per
# layer transition rather than one per node.
def set_random_biases(self):
total_biases = 0
# add extra zero bias to help future indexing
for index, size in enumerate(self.sizes[0:-1], start=1):
total_biases += 1
for x in range(total_biases):
self.biases.append(random.uniform(-5.0, 5.0))
# Pair each training sample with its label.
# NOTE(review): after the length check only prints, execution continues; `data`
# is not used further in the visible code.
def train_network(self, training_data, training_labels):
if len(training_data) != len(training_labels):
print("Error data and labels must be the same length")
data = list(zip(training_data, training_labels))
# Split `data` into consecutive mini-batches and iterate over them.
# NOTE(review): the list literal below is not closed and the loop only prints;
# the per-batch update call appears to be missing.
def sgd(self, data, mini_batch_size = 1000):
# first we'll create batches of training data
n = len(data)
data_batches = [
data[k:k + mini_batch_size]
for k in range(0, n, mini_batch_size)
i = 0
for mini_batch in data_batches:
print("Batch: " + str(i))
i += 1
print("Finished All training data!")
# Average the per-sample gradients of a mini-batch and apply them with a fixed
# step of 0.1.
# NOTE(review): nothing visible appends to `weight_gradients`/`bias_gradients`
# or fills `weights_gradient_list`, and no feedforward call is visible before
# backpropogation.
def update_mini_batch(self, mini_data_batch):
weight_gradients = []
bias_gradients = []
i = 0
for training_input in mini_data_batch:
training_object, training_label = training_input
weights_gradient, bias_gradient = self.backpropogation(training_label)
# average gradients
weights_gradient = np.average(weight_gradients,axis=0)
biases_gradient = np.average(bias_gradients, axis=0)
# may need to convert to list
weights_gradient_list = []
for weight_gradient in weights_gradient:
#weights_gradient = weights_gradient.tolist()
biases_gradient = biases_gradient.tolist()
for x in range(len(self.biases)):
self.biases[x] -= 0.1*biases_gradient[x]
weight_gradient_index = 0
for layer_index, layer_weights in enumerate(self.weights, start=0):
for weight_index, weight in enumerate(layer_weights):
self.weights[layer_index][weight_index] = weight - 0.1*weights_gradient_list[layer_index][weight_index]
weight_gradient_index += 1
# Reset the stored outputs/activations and compute, per node, the weighted sum
# of the previous layer's activations plus the layer's single bias.
# NOTE(review): the lines that fill `temp_activations`, `node_weights`,
# `layer_outputs` and `layer_activations` are not visible.
def feedforward(self, training_object):
# set inputs
self.outputs = []
self.activations = []
temp_activations = []
for index in range(self.sizes[0]):
for layer_index, layer_size in enumerate(self.sizes[1:], start=0):
layer_weights = self.weights[layer_index]
layer_inputs = self.activations[layer_index]
weight_index = 0
layer_outputs = []
layer_activations = []
for node_index in range(layer_size):
node_weights = []
# get node weights
#print(f"layer size: {layer_size}, previous_layer_size: {self.sizes[layer_index]}, layer weights: {len(layer_weights)}")
for x in range(self.sizes[layer_index]):
weight_index += 1
output = 0
for indx in range(len(node_weights)):
output += layer_inputs[indx]*node_weights[indx]
output = output + self.biases[layer_index]
# Compute error terms from the activations/outputs stored by the last forward
# pass, starting at the output layer and walking the hidden layers in reverse.
# Returns (weight_errors, bias_errors).
# NOTE(review): nothing visible appends to `costs` (beyond the output layer),
# `layer_costs`, `layer_weights_formatted`, `weight_errors` or `bias_errors`.
def backpropogation(self, training_label):
costs = []
output_layer_activations = self.activations[-1]
output_layer_outputs = self.outputs[-1]
correct_labels = self.translate_label_to_array(training_label)
costs.append(self.compute_cost_derivative(correct_labels, output_layer_activations))
# Output-layer error: cost derivative times sigmoid'(pre-activation output).
for cost_index, cost in enumerate(costs[0]):
costs[0][cost_index] = cost*self.sigmoid_prime(output_layer_outputs[cost_index])
# calculate costs for layers
for layer_index, layer_size in enumerate(self.sizes[::-1][1:-1], start=1):
layer_costs = []
layer_weights = self.weights[-layer_index]
layer_outputs = self.outputs[-(layer_index+1)]
previous_layer_costs = costs[layer_index-1]
next_layer_size = self.sizes[::-1][1:][layer_index]
layer_weights_formatted = []
for x in range(layer_size):
for weight_index, weight in enumerate(layer_weights, start=0):
#print(f"weight index:{weight_index % next_layer_size} layer_index: {weight_index}")
#print(f"next_layer_size:{layer_size} costs: {len(previous_layer_costs)}, layer_weights_formatted: {layer_weights_formatted}")
for x in range(layer_size):
node_cost = 0
for y, cost in enumerate(previous_layer_costs,start=0):
node_cost += layer_weights_formatted[x][y]*cost
# layer_costs same order as next layer's activations
for cost_index, cost in enumerate(layer_costs):
layer_costs[cost_index] = cost * self.sigmoid_prime(layer_outputs[cost_index])
# calculate weight errors
weight_errors = []
bias_errors = []
for layer_index, layer_costs in enumerate(costs[::-1]):
layer_activations = self.activations[layer_index]
layer_weight_errors = []
# One weight error per (cost, incoming activation) pair.
for cost_index, cost in enumerate(layer_costs,start=0):
for activation in layer_activations:
layer_weight_errors.append(activation * cost)
return weight_errors, bias_errors
# conversion tool
# One-hot encode the integer label `y` into a length-10 numpy array.
def translate_label_to_array(self, y):
translated_label = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
translated_label[y] = 1
return np.array(translated_label)
# output tools
# Print the activation of every node in the last layer.
def network_outputs(self):
print("Output layer: ")
for x in range(self.sizes[-1]):
print("node " + str(x) + ": " + str(self.activations[-1][x]))
# NOTE(review): no body is visible for this method.
def total_activations(self):
def compute_cost_derivative(self, y, output_activations):
"""Return the vector of partial derivatives \partial C_x /
\partial a for the output activations."""
return (output_activations - y)
def sigmoid(self, z):
""""The sigmoid function."""
return (1.0 / (1.0 + np.exp(-z)))
# Derivative of the sigmoid, expressed through the sigmoid itself.
def sigmoid_prime(self, z):
return (self.sigmoid(z) * (1 - self.sigmoid(z)))
Question: How can I get a latent that was used to generate an image during the projection process of StyleGAN2?
Hello! I am playing around with this StyleGAN2 Colab notebook.
It can generate 1024x1024 high-resolution face images and more. What I've tried is to find the generatable face that most closely resembles Cristiano Ronaldo.
I ran their code and it worked fine:
Generated Cristiano Ronaldo
Then I changed the method that projected Ronaldo to return me the Projector object, ran it again and saved the object in a variable.
Projector class:
# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
# This work is made available under the Nvidia Source Code License-NC.
# To view a copy of this license, visit
import numpy as np
import tensorflow as tf
import dnnlib
import dnnlib.tflib as tflib
from training import misc
class Projector:
    """Optimize a dlatent and the synthesis noise inputs to match a target image.

    Typical use is ``set_network(Gs)`` followed by ``run(target_images)``, or
    ``start(target_images)`` plus repeated ``step()`` calls. Results are read
    back with ``get_dlatents()``, ``get_noises()`` and ``get_images()``.

    The optimized variable (``_dlatents_var``) is tiled and fed directly to
    ``Gs.components.synthesis``. ``get_dlatents()`` therefore returns synthesis
    inputs, not inputs of ``Gs.components.mapping``.

    NOTE(review): the scraped copy of this class had lost its indentation and
    several call expressions/statements (``tflib.run``, ``tf.group``,
    ``return``, ``break``). They were restored from the surrounding code and
    should be confirmed against the upstream StyleGAN2 ``projector.py``.
    """

    def __init__(self):
        # Tunable settings.
        self.num_steps = 1000
        self.dlatent_avg_samples = 10000
        self.initial_learning_rate = 0.1
        self.initial_noise_factor = 0.05
        self.lr_rampdown_length = 0.25
        self.lr_rampup_length = 0.05
        self.noise_ramp_length = 0.75
        self.regularize_noise_weight = 1e5
        self.verbose = False
        self.clone_net = True

        # Internal state, populated by set_network() and start().
        self._Gs = None
        self._minibatch_size = None
        self._dlatent_avg = None
        self._dlatent_std = None
        self._noise_vars = None
        self._noise_init_op = None
        self._noise_normalize_op = None
        self._dlatents_var = None
        self._noise_in = None
        self._dlatents_expr = None
        self._images_expr = None
        self._target_images_var = None
        self._lpips = None
        self._dist = None
        self._loss = None
        self._reg_sizes = None
        self._lrate_in = None
        self._opt = None
        self._opt_step = None
        self._cur_step = None

    def _info(self, *args):
        """Print a progress message when ``self.verbose`` is set."""
        if self.verbose:
            print('Projector:', *args)

    def set_network(self, Gs, minibatch_size=1):
        """Attach generator ``Gs`` and build the projection graph.

        :param Gs: generator network, or ``None`` to detach (nothing is built).
        :param minibatch_size: must be 1 (asserted).
        """
        assert minibatch_size == 1
        self._Gs = Gs
        self._minibatch_size = minibatch_size
        if self._Gs is None:
            # Nothing to build; without this early exit the code below would
            # dereference None.
            return
        if self.clone_net:
            self._Gs = self._Gs.clone()

        # Find dlatent stats.
        self._info('Finding W midpoint and stddev using %d samples...' % self.dlatent_avg_samples)
        latent_samples = np.random.RandomState(123).randn(self.dlatent_avg_samples, *self._Gs.input_shapes[0][1:])
        dlatent_samples = self._Gs.components.mapping.run(latent_samples, None)[:, :1, :] # [N, 1, 512]
        self._dlatent_avg = np.mean(dlatent_samples, axis=0, keepdims=True) # [1, 1, 512]
        self._dlatent_std = (np.sum((dlatent_samples - self._dlatent_avg) ** 2) / self.dlatent_avg_samples) ** 0.5
        self._info('std = %g' % self._dlatent_std)

        # Find noise inputs.
        self._info('Setting up noise inputs...')
        self._noise_vars = []
        noise_init_ops = []
        noise_normalize_ops = []
        while True:
            # Noise variables are numbered consecutively; stop at the first
            # index that does not exist.
            n = 'G_synthesis/noise%d' % len(self._noise_vars)
            if n not in self._Gs.vars:
                break
            v = self._Gs.vars[n]
            self._noise_vars.append(v)
            noise_init_ops.append(tf.assign(v, tf.random_normal(tf.shape(v), dtype=tf.float32)))
            noise_mean = tf.reduce_mean(v)
            noise_std = tf.reduce_mean((v - noise_mean)**2)**0.5
            noise_normalize_ops.append(tf.assign(v, (v - noise_mean) / noise_std))
            self._info(n, v)
        self._noise_init_op = tf.group(*noise_init_ops)
        self._noise_normalize_op = tf.group(*noise_normalize_ops)

        # Image output graph.
        self._info('Building image output graph...')
        self._dlatents_var = tf.Variable(tf.zeros([self._minibatch_size] + list(self._dlatent_avg.shape[1:])), name='dlatents_var')
        self._noise_in = tf.placeholder(tf.float32, [], name='noise_in')
        dlatents_noise = tf.random.normal(shape=self._dlatents_var.shape) * self._noise_in
        # The single optimized dlatent is repeated for every synthesis layer.
        self._dlatents_expr = tf.tile(self._dlatents_var + dlatents_noise, [1, self._Gs.components.synthesis.input_shape[1], 1])
        self._images_expr = self._Gs.components.synthesis.get_output_for(self._dlatents_expr, randomize_noise=False)

        # Downsample image to 256x256 if it's larger than that. VGG was built for 224x224 images.
        proc_images_expr = (self._images_expr + 1) * (255 / 2)
        sh = proc_images_expr.shape.as_list()
        if sh[2] > 256:
            factor = sh[2] // 256
            proc_images_expr = tf.reduce_mean(tf.reshape(proc_images_expr, [-1, sh[1], sh[2] // factor, factor, sh[2] // factor, factor]), axis=[3,5])

        # Loss graph.
        self._info('Building loss graph...')
        self._target_images_var = tf.Variable(tf.zeros(proc_images_expr.shape), name='target_images_var')
        if self._lpips is None:
            # NOTE(review): the pickle URL was stripped from this copy of the
            # file; restore it from the upstream projector.py before running.
            self._lpips = misc.load_pkl('')
        self._dist = self._lpips.get_output_for(proc_images_expr, self._target_images_var)
        self._loss = tf.reduce_sum(self._dist)

        # Noise regularization graph.
        self._info('Building noise regularization graph...')
        reg_loss = 0.0
        for v in self._noise_vars:
            sz = v.shape[2]
            while True:
                reg_loss += tf.reduce_mean(v * tf.roll(v, shift=1, axis=3))**2 + tf.reduce_mean(v * tf.roll(v, shift=1, axis=2))**2
                if sz <= 8:
                    break # Small enough already
                v = tf.reshape(v, [1, 1, sz//2, 2, sz//2, 2]) # Downscale
                v = tf.reduce_mean(v, axis=[3, 5])
                sz = sz // 2
        self._loss += reg_loss * self.regularize_noise_weight

        # Optimizer.
        self._info('Setting up optimizer...')
        self._lrate_in = tf.placeholder(tf.float32, [], name='lrate_in')
        self._opt = dnnlib.tflib.Optimizer(learning_rate=self._lrate_in)
        self._opt.register_gradients(self._loss, [self._dlatents_var] + self._noise_vars)
        self._opt_step = self._opt.apply_updates()

    def run(self, target_images):
        """Project ``target_images`` to completion and return the results.

        :return: ``dnnlib.EasyDict`` with ``dlatents``, ``noises``, ``images``.
        """
        # Run to completion.
        self.start(target_images)
        while self._cur_step < self.num_steps:
            self.step()

        # Collect results.
        pres = dnnlib.EasyDict()
        pres.dlatents = self.get_dlatents()
        pres.noises = self.get_noises()
        pres.images = self.get_images()
        return pres

    def start(self, target_images):
        """Load the target images and reset the optimization state."""
        assert self._Gs is not None

        # Prepare target images.
        self._info('Preparing target images...')
        target_images = np.asarray(target_images, dtype='float32')
        target_images = (target_images + 1) * (255 / 2)
        sh = target_images.shape
        assert sh[0] == self._minibatch_size
        if sh[2] > self._target_images_var.shape[2]:
            factor = sh[2] // self._target_images_var.shape[2]
            target_images = np.reshape(target_images, [-1, sh[1], sh[2] // factor, factor, sh[3] // factor, factor]).mean((3, 5))

        # Initialize optimization state.
        self._info('Initializing optimization state...')
        tflib.set_vars({self._target_images_var: target_images, self._dlatents_var: np.tile(self._dlatent_avg, [self._minibatch_size, 1, 1])})
        tflib.run(self._noise_init_op)
        # NOTE(review): restored from upstream; confirm that the Optimizer
        # class in use provides reset_optimizer_state().
        self._opt.reset_optimizer_state()
        self._cur_step = 0

    def step(self):
        """Run one optimization step; a no-op once ``num_steps`` is reached."""
        assert self._cur_step is not None
        if self._cur_step >= self.num_steps:
            return
        if self._cur_step == 0:
            self._info('Running...')

        # Hyperparameters.
        t = self._cur_step / self.num_steps
        noise_strength = self._dlatent_std * self.initial_noise_factor * max(0.0, 1.0 - t / self.noise_ramp_length) ** 2
        lr_ramp = min(1.0, (1.0 - t) / self.lr_rampdown_length)
        lr_ramp = 0.5 - 0.5 * np.cos(lr_ramp * np.pi)
        lr_ramp = lr_ramp * min(1.0, t / self.lr_rampup_length)
        learning_rate = self.initial_learning_rate * lr_ramp

        # Train.
        feed_dict = {self._noise_in: noise_strength, self._lrate_in: learning_rate}
        _, dist_value, loss_value = tflib.run([self._opt_step, self._dist, self._loss], feed_dict)
        # Re-normalize the noise variables after each update.
        tflib.run(self._noise_normalize_op)

        # Print status.
        self._cur_step += 1
        if self._cur_step == self.num_steps or self._cur_step % 10 == 0:
            self._info('%-8d%-12g%-12g' % (self._cur_step, dist_value, loss_value))
        if self._cur_step == self.num_steps:
            self._info('Done.')

    def get_cur_step(self):
        """Return the current step counter (``None`` before ``start()``)."""
        return self._cur_step

    def get_dlatents(self):
        """Evaluate the tiled dlatents fed to the synthesis network, with the
        exploration noise input set to 0."""
        return tflib.run(self._dlatents_expr, {self._noise_in: 0})

    def get_noises(self):
        """Return the current values of the synthesis noise variables."""
        return tflib.run(self._noise_vars)

    def get_images(self):
        """Evaluate the synthesized images with the exploration noise input
        set to 0."""
        return tflib.run(self._images_expr, {self._noise_in: 0})
I got that object, called the get_dlatents method, thinking this is the input latent that produced Christiano.
Generating an image with that latent, clearly not near to Ronaldo ("proji" is the Projector object)
# get_dlatents() evaluates the projector's `_dlatents_expr`, i.e. the tiled tensor
# that Projector.set_network feeds to Gs.components.synthesis.
latents = proji.get_dlatents()
# Take row 17 of the first batch entry; the rows are tiled copies of one optimized dlatent.
latent = latents[0][17]
latent = np.reshape(latent, (1,512))
# NOTE(review): generate_images is defined in the notebook and is not shown here.
# If it passes its input through the mapping network, this value is the wrong kind
# of input for it - presumably it belongs on the synthesis network instead; confirm.
img = generate_images([latent],1.0)[0]
Result: this was supposed to be Ronaldo
I do not know if I made a thinking or coding mistake, all I want to know is: How can I get a latent that was used to generate an image during the projection process?
In order to understand it, you probably need to check the colab notebook yourself, didn't want to paste everything here tho.
Thanks for taking your time to look at this.
I don't know if your question is still relevant, but what you are looking for is projecting an image into the latent space. This GitHub page is clean and precise. After setting up the environment, you can get your latents in two steps.
To extract and align faces from images, run python align_images.py raw_images/ aligned_images/ and to find the latent representation of the aligned images, run python encode_images.py aligned_images/ generated_images/ latent_representations/. Your latents will be in the latent_representations folder. Now you can use these latents to generate your desired faces. Good luck.
I am trying to save the model using the saver (I call the save function in the DDPG class), but when I restore the model, the result is far from the one I saved. I save the model when the episodic reward is zero; the restore call in the code is commented out. My code is below with all the features. I use Python 3.7, gym 0.16.0 and TensorFlow version 1.13.1.
import tensorflow as tf
import numpy as np
import gym
# number of environment steps per episode (bound of the inner training loop)
epsiode_steps = 500
# learning rate for actor
lr_a = 0.001
# learning rate for critic
lr_c = 0.002
# discount applied to the target critic's value when forming the TD target
gamma = 0.9
# soft-replacement rate: target <- (1 - alpha) * target + alpha * eval
alpha = 0.01
# number of rows (transitions) in the replay buffer
memory = 10000
# number of transitions sampled per learning step
batch_size = 32
# NOTE(review): not referenced by any code visible in this part of the file
render = True
class DDPG(object):
    """Actor-critic agent with eval/target networks and a replay buffer.

    Each replay-buffer row stores one transition laid out as
    ``[s, a, r, s_]`` (see ``store_transition``).

    Only TensorFlow variables are written by ``save()``. The Python-side
    state (``noise_variance``, ``memory``, ``pointer``) is not part of the
    checkpoint, so a freshly constructed agent that restores a checkpoint
    still starts with ``noise_variance = 3.0`` and an empty buffer.

    NOTE(review): the scraped copy of this class had lost its indentation and
    its ``self.sess.run(...)`` / ``self.saver.save(...)`` call expressions.
    They were restored from the surrounding code; confirm against the
    author's original.
    """

    def __init__(self, no_of_actions, no_of_states, a_bound):
        self.memory = np.zeros((memory, no_of_states * 2 + no_of_actions + 1), dtype=np.float32)
        # initialize pointer to point to our experience buffer
        self.pointer = 0
        self.sess = tf.Session()
        # initialize the variance for OU process for exploring policies
        self.noise_variance = 3.0
        self.no_of_actions = no_of_actions
        self.no_of_states = no_of_states
        self.a_bound = a_bound

        self.state = tf.placeholder(tf.float32, [None, no_of_states], 's')
        self.next_state = tf.placeholder(tf.float32, [None, no_of_states], 's_')
        self.reward = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self.build_actor_network(self.state, scope='eval', trainable=True)
            a_ = self.build_actor_network(self.next_state, scope='target', trainable=False)
        with tf.variable_scope('Critic'):
            q = self.build_crtic_network(self.state, self.a, scope='eval', trainable=True)
            q_ = self.build_crtic_network(self.next_state, a_, scope='target', trainable=False)

        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # update target value
        self.soft_replace = [
            [tf.assign(at, (1 - alpha) * at + alpha * ae), tf.assign(ct, (1 - alpha) * ct + alpha * ce)]
            for at, ae, ct, ce in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]

        q_target = self.reward + gamma * q_
        # compute TD error i.e actual - predicted values
        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
        # train the critic network with adam optimizer
        self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, name="adam-ink", var_list=self.ce_params)
        a_loss = - tf.reduce_mean(q)
        # train the actor network with adam optimizer for minimizing the loss
        self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)
        tf.summary.FileWriter("logs2", self.sess.graph)

        # initialize all variables
        self.sess.run(tf.global_variables_initializer())
        # saver
        self.saver = tf.train.Saver()
        # To resume from a checkpoint, restore here, i.e. after the
        # initializer has run, so the restored values are not overwritten:
        # self.saver.restore(self.sess, "Pendulum/nn.ckpt")

    def choose_action(self, s):
        """Return the actor's action for state ``s`` with Gaussian exploration
        noise added, clipped to ``[-2, 2]``."""
        a = self.sess.run(self.a, {self.state: s[np.newaxis, :]})[0]
        a = np.clip(np.random.normal(a, self.noise_variance), -2, 2)
        return a

    def learn(self):
        """Soft-update the target networks, then train the actor and the
        critic on one random minibatch from the replay buffer."""
        # soft target replacement
        self.sess.run(self.soft_replace)

        indices = np.random.choice(memory, size=batch_size)
        batch_transition = self.memory[indices, :]
        batch_states = batch_transition[:, :self.no_of_states]
        batch_actions = batch_transition[:, self.no_of_states: self.no_of_states + self.no_of_actions]
        batch_rewards = batch_transition[:, -self.no_of_states - 1: -self.no_of_states]
        batch_next_state = batch_transition[:, -self.no_of_states:]

        # The actor loss depends only on the states; the critic needs the
        # full transition.
        self.sess.run(self.atrain, {self.state: batch_states})
        self.sess.run(self.ctrain, {self.state: batch_states, self.a: batch_actions, self.reward: batch_rewards,
                                    self.next_state: batch_next_state})

    def store_transition(self, s, a, r, s_):
        """Write one transition into the ring buffer. Once the buffer has been
        filled, decay the exploration noise and run a learning step."""
        trans = np.hstack((s, a, [r], s_))
        index = self.pointer % memory
        self.memory[index, :] = trans
        self.pointer += 1
        if self.pointer > memory:
            self.noise_variance *= 0.99995
            # NOTE(review): restored call; nothing else in the visible code
            # invokes learn(), so training is assumed to be triggered here.
            self.learn()

    def build_actor_network(self, s, scope, trainable):
        """Build the actor: one 30-unit tanh layer followed by a tanh output
        layer scaled by ``a_bound``."""
        # Actor DPG
        with tf.variable_scope(scope):
            l1 = tf.layers.dense(s, 30, activation=tf.nn.tanh, name='l1', trainable=trainable)
            a = tf.layers.dense(l1, self.no_of_actions, activation=tf.nn.tanh, name='a', trainable=trainable)
            return tf.multiply(a, self.a_bound, name="scaled_a")

    def build_crtic_network(self, s, a, scope, trainable):
        """Build the critic: state and action are projected into a shared
        30-unit tanh layer, followed by a single linear output ``q``."""
        with tf.variable_scope(scope):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.no_of_states, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.no_of_actions, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.tanh(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            q = tf.layers.dense(net, 1, trainable=trainable)
            return q

    def save(self):
        """Write the session's variables to ``Pendulum/nn.ckpt``."""
        self.saver.save(self.sess, "Pendulum/nn.ckpt")
# Build the Pendulum environment and the agent sized to its spaces.
env = gym.make("Pendulum-v0")
env = env.unwrapped
no_of_states = env.observation_space.shape[0]
no_of_actions = env.action_space.shape[0]
a_bound = env.action_space.high
ddpg = DDPG(no_of_actions, no_of_states, a_bound)
total_reward = []
# set the number of episodes
no_of_episodes = 300
for i in range(no_of_episodes):
    # initialize the environment
    s = env.reset()
    ep_reward = 0
    for j in range(epsiode_steps):
        # select action by adding noise through OU process
        a = ddpg.choose_action(s)
        # peform the action and move to the next state s
        s_, r, done, info = env.step(a)
        # store the the transition to our experience buffer
        # sample some minibatch of experience and train the network
        ddpg.store_transition(s, a, r, s_)
        # update current state as next state
        s = s_
        # add episodic rewards
        ep_reward += r
        # NOTE(review): the body of this `if` was missing from the scraped
        # copy; the save call is restored from the accompanying description
        # ("I save the model when the episodic reward is zero"). As placed,
        # the test runs on the running sum at every step, so it is also true
        # early in an episode while the accumulated reward still truncates to
        # 0 - confirm this is the intended save condition.
        if int(ep_reward) == 0 and i > 150:
            ddpg.save()
        if j == epsiode_steps - 1:
            print('Episode:', i, ' Reward: %i' % int(ep_reward))
I solved this problem completely by rewriting the code and adding the learning function in a separate session