I have implemented and trained a neural network in PyTorch; however, I am interested in the derivative of the neural network parameters with respect to the input.
I have searched extensively for a procedure that would allow evaluating the derivative of the weights with respect to a given input, but I did not find anything.
I know that I can compute the gradients of a function in the following way (Q here is the example function from the PyTorch autograd tutorial, Q = 3*a**3 - b**2):
external_grad = torch.tensor([1., 1.])
Q.backward(gradient=external_grad)
But how would I do that with a trained neural network instead of a function Q?
Thanks in advance.
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
from scipy.stats import norm
from numpy import linalg as la
import numpy.random as npr
from tabulate import tabulate
from matplotlib import pyplot as plt
import random
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
#from torchvision import datasets, transforms
from torch.autograd import Variable
from torch import optim
# In[3]:
nSimul = 32768
T1 = 1.0
T2 = 2.0
K = 110.0
spot = 100.0
vol = 0.2
vol0 = 0.5 # vol is increased over the 1st period so we have more points in the wings
# simulate all Gaussian returns (N1, N2) first
# returns: matrix of shape [nSimul, TimeSteps=2]
returns = np.random.normal(size=[nSimul,2])
# generate paths, step by step, and not path by path as customary
# this is to avoid slow Python loops, using NumPy's optimized vector functions instead
# generate the vector of all scenarios for S1, of shape [nSimul]
S1 = spot * np.exp(-0.5*vol0*vol0*T1 + vol0*np.sqrt(T1)*returns[:,0])
# generate the vector of all scenarios for S2, of shape [nSimul]
S2 = S1 * np.exp(-0.5*vol*vol*(T2-T1) + vol*np.sqrt(T2-T1)*returns[:,1])
# training set, X and Y are both vectors of shape [nSimul]
X = S1
Y = np.maximum(0, S2 - K)
xAxis = np.linspace(20, 200, 100)
xAxis=xAxis.reshape(-1,1)
# In[4]:
#Normalization of the simulated data:
meanX = np.mean(X)
stdX = np.std(X)
meanY = np.mean(Y)
stdY = np.std(Y)
normX = (X - meanX) / stdX
normY = (Y - meanY) / stdY
normX=normX.reshape(-1,1)
normY=normY.reshape(-1,1)
# In[5]:
class NeuralNetwork(nn.Module):
    def __init__(self, inputsize, outputsize):
        super(NeuralNetwork, self).__init__()
        #self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(inputsize, 3),
            nn.ELU(),
            nn.Linear(3, 5),
            nn.ELU(),
            nn.Linear(5, 3),
            nn.ELU(),
            nn.Linear(3, outputsize),
        )
        w = torch.empty(0, 1)
        nn.init.normal_(w)

    def forward(self, x):
        #x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
# In[6]:
inputDim = 1 # takes variable 'x'
outputDim = 1 # takes variable 'y'
learningRate = 0.05
epochs = 10000
#weight=torch.empty(3)
model = NeuralNetwork(inputDim, outputDim)
##### For GPU #######
if torch.cuda.is_available():
    model.cuda()
# In[7]:
#criterion = torch.nn.MSELoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)
# In[ ]:
def ridge_loss(outputs, labels):
    return torch.mean((outputs - labels)**2)
# In[ ]:
# In[9]:
# Adam optimization
criterion = torch.nn.MSELoss()
#optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = optim.Adam(model.parameters(), lr=0.05)
# In[10]:
for epoch in range(epochs):
    # Converting inputs and labels to tensors
    if torch.cuda.is_available():
        inputs = Variable(torch.from_numpy(normX).cuda().float())
        labels = Variable(torch.from_numpy(normY).cuda().float())
    else:
        inputs = Variable(torch.from_numpy(normX).float())
        labels = Variable(torch.from_numpy(normY).float())

    # Clear gradient buffers so gradients from previous epochs don't accumulate
    optimizer.zero_grad()

    # get output from the model, given the inputs
    outputs = model(inputs)

    # get loss for the predicted output
    loss = criterion(outputs, labels)

    # get gradients w.r.t. the parameters
    loss.backward()

    # update parameters
    optimizer.step()

    print('epoch {}, loss {}'.format(epoch, loss.item()))
# In[11]:
def predict(xs):
    # first, normalize
    nxs = (xs - meanX) / stdX
    # forward pass through the network; no gradients needed at test time
    with torch.no_grad():
        if torch.cuda.is_available():
            nys = model(Variable(torch.from_numpy(nxs.reshape(-1,1)).cuda().float())).cpu().data.numpy()
        else:
            nys = model(Variable(torch.from_numpy(nxs.reshape(-1,1))).float()).data.numpy()
    # de-normalize output
    ys = meanY + stdY * nys
    # we get a matrix of shape [len(xs), 1], which we reshape as a vector [len(xs)]
    return np.reshape(ys, [-1])
# In[13]:
def BlackScholes(S0, r, sigma, T, K):
    d1 = 1 / (sigma * np.sqrt(T)) * (np.log(S0/K) + (r + sigma**2/2)*T)
    d2 = d1 - sigma * np.sqrt(T)
    return norm.cdf(d1) * S0 - norm.cdf(d2) * K * np.exp(-r*T)

def BlackScholesCallDelta(S0, r, sigma, T, K):
    d1 = 1 / (sigma * np.sqrt(T)) * (np.log(S0/K) + (r + sigma**2/2)*T)
    return norm.cdf(d1)

BlackScholes_vec = np.vectorize(BlackScholes)
BlackScholesCallDelta_vec = np.vectorize(BlackScholesCallDelta)
# In[14]:
BS_price = BlackScholes_vec(S0=xAxis, r=0, sigma=0.2, T=1.0, K=110.0)
predicted=predict(xAxis)
S1=1
#line_learn = plt.plot(Sval,y,label="Deep Neural Net")
line_learn = plt.plot(xAxis, predicted, label="Neural Regression")
line_BS = plt.plot(xAxis, BS_price, label="Black-Scholes")
plt.xlabel("Spot Price")
plt.ylabel("Option Price")
#plt.title(r'Time: %1.1f' % time, loc='left', fontsize=11)
plt.title(r'Strike: %1.2f' % K, loc='right', fontsize=11)
plt.title(r'Initial price: %1.2f' % S1, loc='center', fontsize=11)
plt.legend()
#plt.savefig("deephedge.png", dpi=150)
plt.savefig("deephedge.pdf")  # save before show(), which clears the figure
plt.show()
# In[15]:
Prices_rg_mc_diff = []
for i in range(len(xAxis) - 1):
    delta = (predicted[i+1] - predicted[i]) / (xAxis[i+1] - xAxis[i])
    Prices_rg_mc_diff.append(delta)
# In[16]:
BS_delta=BlackScholesCallDelta(S0=xAxis,r=0,sigma=0.2,T=1.0,K=110.0)
predicted=predict(xAxis)
S1=1
#line_learn = plt.plot(Sval,y,label="Deep Neural Net")
line_learn = plt.plot(xAxis[1:], Prices_rg_mc_diff, label="Neural Regression")
line_BS = plt.plot(xAxis[1:], BS_delta[1:], label="Black-Scholes")
plt.xlabel("Spot Price")
plt.ylabel("Delta")
#plt.title(r'Time: %1.1f' % time, loc='left', fontsize=11)
plt.title(r'Strike: %1.2f' % K, loc='right', fontsize=11)
plt.title(r'Initial price: %1.2f' % S1, loc='center', fontsize=11)
plt.legend()
#plt.savefig("deephedge.png", dpi=150)
plt.savefig("deephedge.pdf")  # save before show(), which clears the figure
plt.show()
# In[17]:
model.backward(retain_graph=True)
# In[ ]:
print(NeuralNetwork.weight.grad)
# In[21]:
c3=torch.from_numpy((predicted.reshape(-1,1)), requires_grad=True)
c4=torch.from_numpy(xAxis, requires_grad=True)
#c5=torch.Tensor(c3)
#c6=torch.Tensor(c4)
loss = criterion(c3,c4) # calculating loss
loss.backward()
# In[28]:
torch.tensor(predicted.reshape(-1,1), requires_grad=True)
torch.tensor(xAxis, requires_grad=True)
criterion(torch.tensor(predicted.reshape(-1,1), requires_grad=True),torch.tensor(xAxis, requires_grad=True))
loss.backward()
You need to explicitly use requires_grad=True when creating a tensor, and to calculate the gradient you first need to apply some operation on the tensor.
Here is an example:
import torch
x = torch.rand(2, 2, requires_grad=True)
y = x + 2
z = y * y * 3
out = z.mean()
out.backward()
print(x.grad)
Output:
tensor([[3.3720, 3.4302],
[3.4030, 3.3605]])
In this way you are using torch.autograd to calculate the gradient for the tensor x. See the autograd docs for more.
And for a neural network you can simply use the network and call backward on it afterwards.
A neural network example:
import torch
import torch.nn as nn
import torch.nn.functional as f

x = torch.rand(2, 2)

# define a neural network
network = nn.Sequential(
    nn.Linear(2, 100),
    nn.Linear(100, 2)
)

pred = network(x)
loss = f.l1_loss(pred, x)  # calculating (mean absolute error) loss
loss.backward()

# Update weights with gradients (a manual gradient-descent step;
# wrapped in no_grad so the update itself is not tracked)
with torch.no_grad():
    network[0].weight -= 0.1 * network[0].weight.grad
    network[1].weight -= 0.1 * network[1].weight.grad
Note: I didn't put any activation function in the network for the sake of simplicity.
Example of backward() using torch.nn.MSELoss():
import torch
from torch.nn import MSELoss
criterion = MSELoss()
a = torch.tensor([1.,2.], requires_grad=True)
b = a**2
loss = criterion(b, a)
loss.backward()
print(a.grad)
Output:
tensor([0., 6.])
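For the original question above, the derivative of a trained network's output with respect to its input, the same mechanism applies: mark the input with requires_grad=True, run a forward pass, and call backward on the output. A minimal sketch, with a small hypothetical stand-in since the trained model itself is not shown here:
import torch
import torch.nn as nn

# hypothetical stand-in for the trained network from the question
model = nn.Sequential(nn.Linear(1, 3), nn.ELU(), nn.Linear(3, 1))

x = torch.tensor([[100.0]], requires_grad=True)  # the input we differentiate w.r.t.
y = model(x)                                     # forward pass
y.backward(torch.ones_like(y))                   # seed gradient, like external_grad above
print(x.grad)                                    # dy/dx at x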
I am trying to train a model to estimate a GMM. However, the means of the GMM are calculated each time based on a mean_placement parameter. I am following the solution provided here; I'll copy and paste the original code:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as datasets
import torch
from torch import nn
from torch import optim
import torch.distributions as D
num_layers = 8
weights = torch.ones(8,requires_grad=True)
means = torch.tensor(np.random.randn(8,2),requires_grad=True)
stdevs = torch.tensor(np.abs(np.random.randn(8,2)),requires_grad=True)
parameters = [weights, means, stdevs]
optimizer1 = optim.SGD(parameters, lr=0.001, momentum=0.9)
num_iter = 10001
for i in range(num_iter):
    mix = D.Categorical(weights)
    comp = D.Independent(D.Normal(means, stdevs), 1)
    gmm = D.MixtureSameFamily(mix, comp)

    optimizer1.zero_grad()
    x = torch.randn(5000,2)  # this can be an arbitrary x sample
    loss2 = -gmm.log_prob(x).mean()  # -densityflow.log_prob(inputs=x).mean()
    loss2.backward()
    optimizer1.step()

    print(i, loss2)
What I would like to do is this:
num_layers = 8
weights = torch.ones(8,requires_grad=True)
means_coef = torch.tensor(10.,requires_grad=True)
means = torch.tensor(torch.dstack([torch.linspace(1,means_coef.detach().item(),8)]*2).squeeze(),requires_grad=True)
stdevs = torch.tensor(np.abs(np.random.randn(8,2)),requires_grad=True)
parameters = [means_coef]
optimizer1 = optim.SGD(parameters, lr=0.001, momentum=0.9)
num_iter = 10001
for i in range(num_iter):
    means = torch.tensor(torch.dstack([torch.linspace(1, means_coef.detach().item(), 8)]*2).squeeze(), requires_grad=True)
    mix = D.Categorical(weights)
    comp = D.Independent(D.Normal(means, stdevs), 1)
    gmm = D.MixtureSameFamily(mix, comp)

    optimizer1.zero_grad()
    x = torch.randn(5000,2)  # this can be an arbitrary x sample
    loss2 = -gmm.log_prob(x).mean()  # -densityflow.log_prob(inputs=x).mean()
    loss2.backward()
    optimizer1.step()

    print(i, means_coef)

print(means_coef)
However, in this case the parameter is not updated and its grad value is always None. Any ideas how to fix this?
Based on your description I have re-written your model. The reason your version never updates means_coef is that means_coef.detach().item() (and re-wrapping the result in torch.tensor(...)) cuts the coefficient out of the computation graph, so no gradient can ever flow back to it; the means have to be computed from the parameter with differentiable operations.
If you run the code below you can see that all the parameters change after the model is optimized. I have also provided the graph of the model at the end. You can simply modify the GMM class as you need if you want to make a new one.
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as datasets
import torch
from torch import nn
from torch import optim
import torch.distributions as D
class GMM(nn.Module):
    def __init__(self, weights, base, scale, n_cell=8, shift=0, dim=2):
        super(GMM, self).__init__()
        self.weight = nn.Parameter(weights)
        self.base = nn.Parameter(base)
        self.scale = nn.Parameter(scale)
        self.grid = torch.arange(1, n_cell+1)
        self.shift = shift
        self.n_cell = n_cell
        self.dim = dim

    def trsf_grid(self):
        trsf = (
            torch.log(self.scale * self.grid + self.shift)
            / torch.log(self.base)
        ).reshape(-1, 1)
        return trsf.expand(self.n_cell, self.dim)

    def forward(self, x, std):
        means = self.trsf_grid()
        mix = D.Categorical(self.weight)
        comp = D.Independent(D.Normal(means, std), 1)
        gmm = D.MixtureSameFamily(mix, comp)
        return -gmm.log_prob(x).mean()
if __name__ == "__main__":
    weight = torch.ones(8)
    base = torch.tensor(3.)
    scale = torch.tensor(1.)
    stds = torch.tensor(np.abs(np.random.randn(8,2)), requires_grad=False)

    model = GMM(weight, base, scale)
    print(list(model.parameters()))

    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    for i in range(1000):
        optimizer.zero_grad()
        x = torch.randn(5000,2)
        loss = model(x, stds)
        loss.backward()
        optimizer.step()

    print(list(model.parameters()))
In my case it returned the following parameters:
[Parameter containing:
tensor([1., 1., 1., 1., 1., 1., 1., 1.], requires_grad=True), Parameter containing:
tensor(3., requires_grad=True), Parameter containing:
tensor(1., requires_grad=True)]
[Parameter containing:
tensor([0.7872, 1.1010, 1.3390, 1.3757, 0.5122, 0.2884, 1.2597, 0.7597],
requires_grad=True), Parameter containing:
tensor(3.3207, requires_grad=True), Parameter containing:
tensor(0.2814, requires_grad=True)]
which indeed shows that the parameters are updating.
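The key difference from your version is that the means are now computed from the learnable scalars with differentiable operations only, with no detach() or torch.tensor(...) re-wrapping in between. As an isolated sketch of just that idea (with a hypothetical toy loss; note torch.linspace does not backpropagate through its endpoint, so the evenly spaced grid is built by hand):
import torch

means_coef = torch.tensor(10., requires_grad=True)

# differentiable equivalent of torch.linspace(1, means_coef, 8)
grid = torch.arange(8, dtype=torch.float32) / 7
means_1d = 1 + (means_coef - 1) * grid
means = torch.stack([means_1d, means_1d], dim=-1)  # shape [8, 2]

loss = means.square().mean()  # stand-in for the GMM loss
loss.backward()
print(means_coef.grad)        # a real gradient now, not None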
Also you can see the computation graph below:
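The computation-graph image itself is not reproduced here; one way to render such a graph yourself, assuming the third-party torchviz package is installed, is:
from torchviz import make_dot  # pip install torchviz

x = torch.randn(5000, 2)
make_dot(model(x, stds), params=dict(model.named_parameters())).render("gmm_graph", format="png")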
I have the following code that uses TensorFlow to calculate a custom average loss when the image is consistently rotated:
import tensorflow as tf
import cv2

# initialize x_hat
img = cv2.imread("4.jpg")
x_hat = tf.Variable(img, name='x_hat')  # img we want to attack

@tf.function
def cost2():
    image = x_hat

    # Now it will generate 100 rotated samples
    num_samples = 100
    average_loss = 0
    for j in range(num_samples):
        # ADD ROTATION (there may be a problem here)
        rotated = tf.keras.preprocessing.image.random_rotation(image,
            tf.random.uniform(shape=(), minval=40, maxval=90), channel_axis=2)

        # get logits
        rotated_logits, _ = resnet(rotated)
        # get average CUSTOM loss
        average_loss += -1 * tf.nn.softmax_cross_entropy_with_logits(logits=rotated_logits, labels=labels) / num_samples
    return average_loss
and here is how I call it:
learning_rate = 1e-1
optim = tf.optimizers.SGD(learning_rate=learning_rate)
epsilon = 2.0/255.0  # a really small perturbation

below = x - epsilon
above = x + epsilon

demo_steps = 200

# projected gradient descent
for i in range(demo_steps):
    loss = optim.minimize(cost2, var_list=[x_hat])

    if (i+1) % 10 == 0:
        print('step %d, loss=%g' % (i+1, loss.numpy()))

    projected = tf.clip_by_value(tf.clip_by_value(x_hat, below, above), 0, 1)
    with tf.control_dependencies([projected]):
        x_hat.assign(projected)

adv_robust = x_hat.numpy()
However, the following error returns to me once I run the code:
TypeError: in user code:

    <ipython-input-183-abde02909da7>:14 cost2  *
        rotated = tf.keras.preprocessing.image.random_rotation(image,
            tf.random.uniform(shape=(), minval=40, maxval=90), channel_axis=2)
    /home/me/.local/lib/python3.8/site-packages/keras_preprocessing/image/affine_transformations.py:55 random_rotation  *
        theta = np.random.uniform(-rg, rg)
    mtrand.pyx:1111 numpy.random.mtrand.RandomState.uniform  **

    TypeError: __array__() takes 1 positional argument but 2 were given
I am on TensorFlow 2.4.0, and the random_rotation and random.uniform functions are used correctly according to the TF 2.4.0 documentation HERE and HERE. So, what am I missing here?
The error might be coming from using TF tensors. As stated in the docs you linked regarding random_rotation:
Performs a random rotation of a Numpy image tensor.
This means you cannot use TF tensors with this operation. If you are in eager execution mode, you can use tensor.numpy():
import tensorflow as tf
image = tf.random.normal((180, 180, 3))
rotated = tf.keras.preprocessing.image.random_rotation(image.numpy(),
    tf.random.uniform(shape=(), minval=40, maxval=90).numpy(), channel_axis=2)
Otherwise, it is recommended to use the preprocessing layer tf.keras.layers.RandomRotation, since using numpy in graph mode (for example in a function decorated with @tf.function) is not recommended.
Here is an example using the tf.keras.layers.RandomRotation:
import tensorflow as tf
import os
import matplotlib.pyplot as plt

_URL = 'https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip'
path_to_zip = tf.keras.utils.get_file('cats_and_dogs.zip', origin=_URL, extract=True)
PATH = os.path.join(os.path.dirname(path_to_zip), 'cats_and_dogs_filtered')

train_dir = os.path.join(PATH, 'train')
validation_dir = os.path.join(PATH, 'validation')

BATCH_SIZE = 1
IMG_SIZE = (160, 160)

train_ds = tf.keras.utils.image_dataset_from_directory(train_dir,
                                                       shuffle=True,
                                                       batch_size=BATCH_SIZE,
                                                       image_size=IMG_SIZE)

data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomRotation(tf.random.uniform(shape=(), minval=40, maxval=90)),
])

for image, _ in train_ds.take(1):
    plt.figure(figsize=(10, 10))
    first_image = image[0]
    for i in range(9):
        ax = plt.subplot(3, 3, i + 1)
        augmented_image = data_augmentation(tf.expand_dims(first_image, 0), training=True)
        plt.imshow(augmented_image[0] / 255)
        plt.axis('off')
I got the following warning:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:95: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
"Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
For the following code:
import argparse
import datetime
import json
import os
import random
import socket
import time
from pathlib import Path
from socket import gethostname

import numpy as np
import tensorflow as tf
from tensorflow.contrib.rnn import HighwayWrapper, LSTMCell, DropoutWrapper
from tensorflow.python.ops import array_ops
from sklearn.metrics import f1_score, precision_score, recall_score
from google.colab import drive

print(tf.__version__)
def orthonorm(shape, dtype=tf.float32,  # TODO only works for square (recurrent) weights
              partition_info=None):  # pylint: disable=unused-argument
    """Variable initializer that produces a random orthonormal matrix."""
    if len(shape) != 2 or shape[0] != shape[1]:
        raise ValueError("Expecting square shape, got %s" % shape)
    _, u, _ = tf.svd(tf.random_normal(shape, dtype=dtype), full_matrices=True)
    return u


def _reverse(input_, seq_lengths, seq_dim, batch_dim):  # reverses sequences with right-padding correctly
    return array_ops.reverse_sequence(
        input=input_, seq_lengths=seq_lengths,
        seq_dim=seq_dim, batch_dim=batch_dim)
def bilstms_interleaved(inputs, num_layers, size, keep_prob, lengths):
    outputs = inputs
    print('interleaved')
    for layer in range(num_layers):
        direction = 'backw.' if layer % 2 else 'forw.'
        print('Layer {}: Creating {} LSTM'.format(layer, direction))  # backwards if layer odd
        with tf.variable_scope('{}_lstm_{}'.format(direction, layer)):
            # cell
            cell = HighwayWrapper(DropoutWrapper(LSTMCell(size),
                                                 variational_recurrent=True,
                                                 dtype=tf.float32,
                                                 state_keep_prob=keep_prob))
            # calc either bw or fw - interleaving is done at graph construction (not runtime)
            if direction == 'backw.':
                outputs_reverse = _reverse(outputs, seq_lengths=lengths, seq_dim=1, batch_dim=0)
                tmp, _ = tf.nn.dynamic_rnn(cell=cell,
                                           inputs=outputs_reverse,
                                           sequence_length=lengths,
                                           dtype=tf.float32)
                outputs = _reverse(tmp, seq_lengths=lengths, seq_dim=1, batch_dim=0)
            else:
                outputs, _ = tf.nn.dynamic_rnn(cell=cell,
                                               inputs=outputs,
                                               sequence_length=lengths,
                                               dtype=tf.float32)
    return outputs
class Model():
    def __init__(self, config, embeddings, num_labels, g):
        # embedding
        with tf.device('/cpu:0'):
            self.word_ids = tf.placeholder(tf.int32, [None, None], name='word_ids')
            embedded = tf.nn.embedding_lookup(embeddings, self.word_ids, name='embedded')

        # stacked bilstm
        with tf.device('/gpu:0'):
            self.predicate_ids = tf.placeholder(tf.float32, [None, None], name='predicate_ids')
            self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
            self.lengths = tf.placeholder(tf.int32, [None], name='lengths')
            inputs = tf.concat([embedded, tf.expand_dims(self.predicate_ids, -1)], axis=2, name='lstm_inputs')
            final_outputs = bilstms_interleaved(inputs,
                                                config.num_layers,
                                                config.cell_size,
                                                self.keep_prob,
                                                self.lengths)

            # projection
            shape0 = tf.shape(final_outputs)[0] * tf.shape(final_outputs)[1]  # both batch_size and seq_len are dynamic
            final_outputs_2d = tf.reshape(final_outputs, [shape0, config.cell_size], name='final_outputs_2d')
            wy = tf.get_variable('Wy', [config.cell_size, num_labels])
            by = tf.get_variable('by', [num_labels])
            logits = tf.nn.xw_plus_b(final_outputs_2d, wy, by, name='logits')  # need [shape0, num_labels]

            # loss
            self.label_ids = tf.placeholder(tf.int32, [None, None], name='label_ids')  # [batch_size, max_seq_len]
            label_ids_flat = tf.reshape(self.label_ids, [-1])  # need [shape0]
            mask = tf.greater(label_ids_flat, 0, 'mask')
            self.nonzero_label_ids_flat = tf.boolean_mask(label_ids_flat, mask,
                                                          name='nonzero_label_ids_flat')  # removes elements
            nonzero_logits = tf.boolean_mask(logits, mask, name='nonzero_logits')
            nonzero_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=nonzero_logits,
                                                                            labels=self.nonzero_label_ids_flat,
                                                                            name='nonzero_losses')
            self.nonzero_mean_loss = tf.reduce_mean(nonzero_losses, name='nonzero_mean_loss')

            # update
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=config.learning_rate, rho=0.95, epsilon=config.epsilon)
            gradients, variables = zip(*optimizer.compute_gradients(self.nonzero_mean_loss))
            gradients, _ = tf.clip_by_global_norm(gradients, config.max_grad_norm)
            self.update = optimizer.apply_gradients(zip(gradients, variables), name='update')

            # predictions
            self.nonzero_predicted_label_ids = tf.cast(tf.argmax(tf.nn.softmax(nonzero_logits), axis=1), tf.int32,
                                                       name='nonzero_predicted_label_ids')

            # tensorboard
            tf.summary.scalar('nonzero_accuracy', tf.reduce_mean(tf.cast(tf.equal(self.nonzero_predicted_label_ids,
                                                                                  self.nonzero_label_ids_flat),
                                                                         tf.float32)))
            tf.summary.scalar('nonzero_mean_xe', self.nonzero_mean_loss)
            self.scalar_summaries = tf.summary.merge_all()
            p = Path("/content/drive/My Drive/DLSRL/Tensorboard log")
            self.train_writer = tf.summary.FileWriter(str(p), g)

            # confusion matrix
            nonzero_cm = tf.confusion_matrix(self.nonzero_label_ids_flat, self.nonzero_predicted_label_ids)
            size = tf.shape(nonzero_cm)[0]
            self.cm_summary = tf.summary.image('nonzero_cm', tf.reshape(tf.cast(nonzero_cm, tf.float32),
                                                                        [1, size, size, 1]))  # needs 4d
I'm trying to implement semantic role labeling using TensorFlow. When running the model I get this warning and bad performance, although there should be no need to tune hyperparameters because they were stated in the paper, and when checking the TensorBoard graph the model is built correctly. Another problem is that I get an error when loading a checkpoint, related to not being able to find/load IndexedSlices. So I thought this warning might be causing the model's problems, but I am unable to figure out why. How should I modify the code to make it work and remove that warning?
I am trying to create an embedding for 1,000,000 words in TensorFlow. Each word will have a 256-dimensional float32 vector representing it. The issue is that I keep running out of memory. This does not make sense to me, since I have 8 GB of memory on my GTX 1080. The embedding should only take up 1e6 * 256 * 4 bytes = 1 GB of memory. I also have another matrix on the output which is of the same size. Other than that there are a few other tensors which should be small in comparison. Therefore I only see about 2-3 GB worth of memory needed to store the model, and it fails when I call sess.run(tf.initialize_all_variables()). Where is all my memory going, and do you have any advice for how I can get around this?
import tensorflow as tf
import nltk
import numpy as np
import os
import multiprocessing
import itertools
import pickle
from unidecode import unidecode

BATCH_SIZE = 32
TIME_STEPS = 64
WORD_VEC_SIZE = 256

words, training_data = pickle.load(open('vocab.pickle', 'rb'))
word2index = {w: i for i, w in enumerate(words)}
index2word = {i: w for i, w in enumerate(words)}

input_tensor = tf.placeholder(tf.int32, (BATCH_SIZE, TIME_STEPS + 1), 'input_tensor')
embedding = tf.Variable(tf.random_uniform((len(words), WORD_VEC_SIZE), -1, 1), name='embedding')

rnn = tf.nn.rnn_cell.BasicRNNCell(WORD_VEC_SIZE)
state = tf.zeros((BATCH_SIZE, rnn.state_size))

input_vectors = tf.nn.embedding_lookup([embedding], input_tensor[:, :TIME_STEPS])
cost = 0

with tf.variable_scope('rnn') as scope:
    W_out = tf.get_variable('W_out', (WORD_VEC_SIZE, len(words)), initializer=tf.truncated_normal_initializer(0.0, 1 / np.sqrt(WORD_VEC_SIZE)))
    b_out = tf.get_variable('b_out', (len(words),), initializer=tf.truncated_normal_initializer(0.0, 0.01))
    for t in range(TIME_STEPS):
        y, state = rnn(tf.reshape(input_vectors[:, t, :], (-1, WORD_VEC_SIZE)), state)
        cost += tf.reduce_mean(tf.nn.sampled_softmax_loss(W_out, b_out, y, tf.reshape(input_tensor[:, t + 1], (-1, 1)), 1000, len(words)))
        scope.reuse_variables()

train_step = tf.train.AdamOptimizer(1e-4).minimize(cost)

sess = tf.Session()
sess.run(tf.initialize_all_variables())
saver = tf.train.Saver()
What I was not accounting for was the AdamOptimizer. I forgot that it needs to store various parameters for each of the weights in my model. When I changed to a gradient-descent optimizer, the model now fits on my GPU.
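For a rough accounting: standard Adam keeps two slot variables per weight (the first and second moment estimates), so each 1 GB matrix costs roughly 3 GB once the optimizer is created, which by itself overruns the 8 GB card. The swap is a one-line change in this TF1-style code:
# plain SGD keeps no per-weight slot variables, unlike Adam (which stores m and v)
train_step = tf.train.GradientDescentOptimizer(1e-4).minimize(cost)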