I want to create a Heaviside step function in TensorFlow. Since the Heaviside function is not differentiable, I also need to choose a derivative approximation and define a custom gradient, so the full implementation looks like this:
import tensorflow as tf

@tf.RegisterGradient("HeavisideGrad")
def _heaviside_grad(unused_op: tf.Operation, grad: tf.Tensor):
    x = unused_op.inputs[0]
    # During backpropagation, heaviside behaves like a sigmoid
    return tf.sigmoid(x) * (1 - tf.sigmoid(x)) * grad

def heaviside(x: tf.Tensor, g: tf.Graph = tf.get_default_graph()):
    custom_grads = {
        "Sign": "HeavisideGrad"
    }
    with g.gradient_override_map(custom_grads):
        # TODO: heaviside(0) currently returns 0. We need heaviside(0) = 1
        sign = tf.sign(x)
        # tf.stop_gradient is needed to exclude tf.maximum from the derivative
        step_func = sign + tf.stop_gradient(tf.maximum(0.0, sign) - sign)
        return step_func
There is one caveat in my implementation: tf.sign(0) returns zero, so heaviside(0) also returns zero, but I want heaviside(0) to return 1. How can I achieve this behavior?
A very hacky way would be to use
1 - max(0.0, sign(-x))
as your step function instead of
max(0.0, sign(x))
Another option would be to use greater_equal and cast the result to your desired type, and override its gradient with the sigmoid override you already have.
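For example, a quick eager-mode check of the hacky variant (example values are mine):

import tensorflow as tf

x = tf.constant([-2.0, 0.0, 3.0])
step = 1.0 - tf.maximum(0.0, tf.sign(-x))
print(step)  # [0., 1., 1.] -- heaviside(0) is now 1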
Ok, I think I figured it out. Many thanks to etarion, who pointed out the correct approach to solving my issue.
So the basic idea is to use tf.greater_equal instead of the combination of tf.sign and tf.maximum. The custom gradient is applied to a tf.identity operation.
Here is the updated implementation of the heaviside function:
import tensorflow as tf
import uuid  # needed for the unique op names below

@tf.RegisterGradient("HeavisideGrad")
def _heaviside_grad(unused_op: tf.Operation, grad: tf.Tensor):
    # Triangle-shaped surrogate gradient: max(0, 1 - |x|)
    return tf.maximum(0.0, 1.0 - tf.abs(unused_op.inputs[0])) * grad

def heaviside(x: tf.Tensor, g: tf.Graph = tf.get_default_graph()):
    custom_grads = {
        "Identity": "HeavisideGrad"
    }
    with g.gradient_override_map(custom_grads):
        i = tf.identity(x, name="identity_" + str(uuid.uuid1()))
        ge = tf.greater_equal(x, 0, name="ge_" + str(uuid.uuid1()))
        # tf.stop_gradient is needed to exclude tf.to_float from the derivative
        step_func = i + tf.stop_gradient(tf.to_float(ge) - i)
        return step_func
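A TF1-style sanity check (the placeholder/session scaffolding is mine, not part of the original answer):

x_in = tf.placeholder(tf.float32, shape=[3])
h = heaviside(x_in)
dh = tf.gradients(h, x_in)[0]

with tf.Session() as sess:
    print(sess.run([h, dh], feed_dict={x_in: [-1.0, 0.0, 2.0]}))
    # h  -> [0., 1., 1.]
    # dh -> [0., 1., 0.]  (the triangle surrogate max(0, 1 - |x|))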
This makes a unit step function using only TensorFlow APIs, so the result is still a tensor:
# in eager mode
def heaviside(v):
    return 1 - tf.reduce_max(tf.constant([0, -tf.sign(v).numpy()], tf.float32))
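A couple of quick spot checks, assuming eager execution:

print(heaviside(tf.constant(-2.0)))  # 0.0
print(heaviside(tf.constant(0.0)))   # 1.0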
In TensorFlow 2, it is better to use the @tf.custom_gradient decorator:
@tf.custom_gradient
def heaviside(X):
    # This custom op is converted to a graph, so Python 'if'/'else' is not
    # allowed; use 'tf.cond' instead.
    List = []
    for I in range(BSIZE):  # batch size
        Item = tf.cond(X[I] < 0,
                       lambda: tf.constant([0], tf.float32),
                       lambda: tf.constant([1], tf.float32))
        List.append(Item)
    U = tf.stack(List)

    # Heaviside half-maximum formula (equivalent alternative):
    # U = (tf.sign(X) + 1) / 2

    # Div is the incoming gradient from downstream ops
    def grad(Div):
        return Div * 1  # Heaviside has no usable gradient; use 1 as a surrogate.

    return U, grad
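A minimal forward-pass check; BSIZE must be defined before the call and match the length of X (the example values here are mine):

BSIZE = 3
X = tf.constant([-1.5, 0.0, 2.0])
print(heaviside(X))  # [[0.], [1.], [1.]]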
The easiest fix for your code is to add a small number to the result of tf.sign() and take the sign again. This yields 1 for an input of 0:
sign = tf.sign(tf.sign(x) + 0.1)
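Plugged into the original step function, a quick check (example values are mine):

import tensorflow as tf

x = tf.constant([-2.0, 0.0, 3.0])
sign = tf.sign(tf.sign(x) + 0.1)  # [-1., 1., 1.]
step = tf.maximum(0.0, sign)      # [ 0., 1., 1.] -- heaviside(0) == 1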
Related
Part 1
I'm going through this article and wanted to try to calculate a forward and backward pass with batch normalization.
When doing the steps after the first layer, I get a batch norm output that is equal for all features.
Here is the code (I have done it in very small steps on purpose):
import numpy as np

w1 = np.array([[0.3, 0.4], [0.5, 0.1], [0.2, 0.3]])
X = np.array([[0.7, 0.1], [0.3, 0.8], [0.4, 0.6]])

def mu(x, axis=0):
    return np.mean(x, axis=axis)

def sigma(z, mu):
    Ai = np.sum(z, axis=0)
    return np.sqrt((1 / len(Ai)) * (Ai - mu) ** 2)

def Ai(z):
    return np.sum(z, axis=0)

def norm(Ai, mu, sigma):
    return (Ai - mu) / sigma

z1 = np.dot(w1, X.T)
mu1 = mu(z1)
A1 = Ai(z1)
sigma1 = sigma(z1, mu1)
gamma1 = np.ones(len(A1))
beta1 = np.zeros(len(A1))
Ahat = norm(A1, mu1, sigma1)  # since gamma is just ones, it doesn't change anything here
The output I get from this is:
[1.73205081 1.73205081 1.73205081]
Part 2
In this image:
Should the sigma_mov and mu_mov be set to zero for the first layer?
EDIT: I think I found what I did wrong. In the normalization step I used A1 and not z1. I also found that it's normal to initialize the moving average with zeros for the mean and ones for the variance. It would be nice if anyone could confirm this.
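Here is a sketch of what I believe the corrected step looks like: normalize z1 itself per feature, and initialize the moving statistics with zeros (mean) and ones (variance):

import numpy as np

w1 = np.array([[0.3, 0.4], [0.5, 0.1], [0.2, 0.3]])
X = np.array([[0.7, 0.1], [0.3, 0.8], [0.4, 0.6]])

z1 = np.dot(w1, X.T)
mu1 = np.mean(z1, axis=0)                   # per-feature mean
var1 = np.var(z1, axis=0)                   # per-feature variance
z1_hat = (z1 - mu1) / np.sqrt(var1 + 1e-8)  # normalized pre-activations

# moving statistics, initialized as zeros (mean) and ones (variance)
mu_mov = np.zeros_like(mu1)
var_mov = np.ones_like(var1)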
I wonder how to calculate higher-order gradients through tf.py_function in TF 2.0. The following example (slightly modified from the TensorFlow docs) produces the correct dy_dx, but aa_x is None. Thank you.
import tensorflow as tf

def huber(x, delta):
    if tf.abs(x) <= delta:
        return x * x / (2 * delta)
    else:
        return tf.abs(x) - delta / 2.0

x = tf.constant([2.0])
z = tf.constant([1.0])

with tf.GradientTape(persistent=True) as g0:
    g0.watch(x)
    with tf.GradientTape(persistent=True) as g:
        g.watch(x)
        y = tf.py_function(func=huber, inp=[x, 3.], Tout=tf.float32)
    dy_dx = g.gradient(y, x)
    aa = tf.reduce_sum(dy_dx * z)
aa_x = g0.gradient(aa, x)
print(dy_dx)
print(aa_x)
Based on the documentation of tf.py_function, you cannot compute derivatives of order higher than one. tf.py_function allows expressing computations in a TensorFlow graph as Python functions. In particular, it wraps a Python function func in a once-differentiable TensorFlow operation that executes func with eager execution enabled. That means you can only differentiate it once.
If you want higher-order derivatives, call the Python function directly and compute gradients normally (tested with TensorFlow 2.1.0).
Modified Code:
import tensorflow as tf  # TensorFlow 2.1.0

def huber(x, delta):
    if tf.abs(x) <= delta:
        return x * x / (2 * delta)  # x^2 / (2*delta)
        # x / delta  -- 1st derivative
        # 1 / delta  -- 2nd derivative
    else:
        return tf.abs(x) - delta / 2.0

x = tf.constant([2.0])
z = tf.constant([1.0])

with tf.GradientTape(persistent=True) as g0:
    g0.watch(x)
    with tf.GradientTape(persistent=True) as g:
        g.watch(x)
        # y = tf.py_function(func=huber, inp=[x, 3.0], Tout=tf.float32)  # once-differentiable
        y = huber(x, 3.0)
    dy_dx = g.gradient(y, x)
    aa = tf.reduce_sum(dy_dx * z)
aa_x = g0.gradient(aa, x)
print(dy_dx)  # tf.Tensor([0.6666667], shape=(1,), dtype=float32)
print(aa_x)   # tf.Tensor([0.33333334], shape=(1,), dtype=float32)
You can read more about tf.py_function at this link.
I want to define a custom LearningRateSchedule, but AutoGraph seems to have trouble converting it. The following code works fine without @tf.function, but raises an error with @tf.function:
def linear_interpolation(l, r, alpha):
    return l + alpha * (r - l)

class TFPiecewiseSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    # This class currently cannot be used in @tf.function;
    # see the following link for details.
    def __init__(self, endpoints, end_learning_rate=None, name=None):
        """Piecewise schedule.
        endpoints: [(int, int)]
            list of pairs `(time, value)` meaning that the schedule should output
            `value` when `t == time`. All the values for time must be sorted in
            increasing order. When t is between two times, e.g. `(time_a, value_a)`
            and `(time_b, value_b)` such that `time_a <= t < time_b`, the output
            is `interpolation(value_a, value_b, alpha)`, where alpha is the
            fraction of the time passed between `time_a` and `time_b` at time `t`.
        outside_value: float
            if a value is requested outside of all the intervals specified in
            `endpoints`, this value is returned. If None, an AssertionError is
            raised when an outside value is requested.
        """
        super().__init__()
        idxes = [e[0] for e in endpoints]
        assert idxes == sorted(idxes)
        self.end_learning_rate = end_learning_rate or endpoints[-1][1]
        self.endpoints = endpoints
        self.name = name

    def __call__(self, step):
        if step < self.endpoints[0][0]:
            return self.endpoints[0][1]
        else:
            for (l_t, l), (r_t, r) in zip(self.endpoints[:-1], self.endpoints[1:]):
                if l_t <= step < r_t:
                    alpha = float(step - l_t) / (r_t - l_t)
                    return linear_interpolation(l, r, alpha)
            # t does not belong to any of the pieces, so doom.
            assert self.end_learning_rate is not None
            return self.end_learning_rate

    def get_config(self):
        return dict(
            endpoints=self.endpoints,
            end_learning_rate=self.end_learning_rate,
            name=self._name,
        )

lr = TFPiecewiseSchedule([[10, 1e-3], [20, 1e-4]])

@tf.function
def f(x):
    l = layers.Dense(10)
    with tf.GradientTape() as tape:
        y = l(x)
        loss = tf.reduce_mean(y**2)
    grads = tape.gradient(loss, l.trainable_variables)
    opt = tf.keras.optimizers.Adam(lr)
    opt.apply_gradients(zip(grads, l.trainable_variables))

f(tf.random.normal((2, 3)))
The error message says:
:10 f *
opt.apply_gradients(zip(grads, l.trainable_variables))
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:437 apply_gradients
apply_state = self._prepare(var_list)
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:614 _prepare
self._prepare_local(var_device, var_dtype, apply_state)
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/adam.py:154 _prepare_local
super(Adam, self)._prepare_local(var_device, var_dtype, apply_state)
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:620 _prepare_local
lr_t = array_ops.identity(self._decayed_lr(var_dtype))
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:672 _decayed_lr
lr_t = math_ops.cast(lr_t(local_step), var_dtype)
:32 call
if step < self.endpoints[0][0]:
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:765 bool
self._disallow_bool_casting()
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:531 _disallow_bool_casting
"using a tf.Tensor as a Python bool")
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:518 _disallow_when_autograph_enabled
" decorating it directly with #tf.function.".format(task))
OperatorNotAllowedInGraphError: using a tf.Tensor as a Python bool is not allowed: AutoGraph did not convert this function. Try decorating it directly with #tf.function.
I think the error arises because of the if statement, so I replaced the content of the __call__ function with the following code. But almost the same error arises.
def compute_lr(step):
    for (l_t, l), (r_t, r) in zip(self.endpoints[:-1], self.endpoints[1:]):
        if l_t <= step < r_t:
            alpha = float(step - l_t) / (r_t - l_t)
            return linear_interpolation(l, r, alpha)
    # t does not belong to any of the pieces, so doom.
    assert self.end_learning_rate is not None
    return self.end_learning_rate

return tf.cond(tf.less(step, self.endpoints[0][0]),
               lambda: self.endpoints[0][1],
               lambda: compute_lr(step))
What should I do to make the code work as I wish?
The error message is garbled by the markdown formatter, but it seems that the __call__ function itself was not processed by AutoGraph (in the error message, converted functions are marked with an asterisk). This is a bug in the Adam optimizer. Anyway, you can annotate __call__ directly with @tf.function and it will be picked up:
@tf.function
def __call__(self, step):
That said, there are a few things in the code that AutoGraph doesn't like: zip, returning from a loop, and chained inequalities. It's safer to use basic constructs when possible. Sadly, the errors you still get are quite confusing. Rewriting it like this should work:
@tf.function
def __call__(self, step):
    if step < self.endpoints[0][0]:
        return self.endpoints[0][1]
    else:
        # Can't return from a loop
        lr = self.end_learning_rate
        # Since it needs to break based on the value of a tensor, the loop
        # needs to be a tf.while_loop
        for pair in tf.stack([self.endpoints[:-1], self.endpoints[1:]], axis=1):
            left, right = tf.unstack(pair)
            l_t, l = tf.unstack(left)
            r_t, r = tf.unstack(right)
            # Chained inequalities are not supported yet
            if l_t <= step and step < r_t:
                alpha = float(step - l_t) / (r_t - l_t)
                lr = linear_interpolation(l, r, alpha)
                break
        return lr
There is one last issue: tf.function doesn't like it when things create variables, so you need to move the creation of the layer and the optimizer outside:
lr = TFPiecewiseSchedule([[10, 1e-3], [20, 1e-4]])
l = layers.Dense(10)
opt = tf.keras.optimizers.Adam(lr)

@tf.function
def f(x):
    ...
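For completeness, the body of f stays exactly as in the question, just without the two constructor lines:

@tf.function
def f(x):
    with tf.GradientTape() as tape:
        y = l(x)
        loss = tf.reduce_mean(y**2)
    grads = tape.gradient(loss, l.trainable_variables)
    opt.apply_gradients(zip(grads, l.trainable_variables))

f(tf.random.normal((2, 3)))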
I hope this helps!
I need to use this loss function for a CNN. list_distance and list_residual are output tensors from hidden layers which are needed to compute the loss, but when I execute the code it gives me back this error:
TypeError: Tensor objects are only iterable when eager execution is enabled. To iterate over this tensor use tf.map_fn.
Is there another way to iterate over tensors without using the construct x in X, converting it to a numpy array, or using the Keras backend functions?
# Assumed imports for the snippet below (fp and scistats are my guesses
# for the aliases used in the original code):
import numpy as np
import scipy.fftpack as fp
import scipy.stats as scistats
from tensorflow.keras import backend as K

def DBL(y_true, y_pred, list_distances, list_residual, l=0.65):
    prob_dist = []
    Li = []
    # mean of the images' power spectrum
    S = np.sum([np.power(np.abs(fp.fft2(residual)), 2)
                for residual in list_residual], axis=0) / K.shape(list_residual)[0]
    # log-ratio between the geometric and arithmetic mean of S
    R = np.log10(scistats.gmean(S) / np.mean(S))
    for c_i, dis_i in enumerate(list_distances):
        prob_dist.append([
            np.exp(-dis_i) / sum([np.exp(-dis_j) if c_j != c_i else 0
                                  for c_j, dis_j in enumerate(list_distances)])
        ])
    for count, _ in enumerate(prob_dist):
        Li.append(
            -1 * np.log10(sum([p_j for c_j, p_j in enumerate(prob_dist[count])
                               if y_pred[count] == 1 and count != c_j])))
    L0 = np.sum(Li)
    return L0 - l * R
You need to define a custom function to feed into tf.map_fn(); see the TensorFlow docs.
Mapper functions map (funnily enough) the existing object (tensor) into a new one using a function you define. They apply the custom function to every element in the object, without all the mucking about with for loops.
For instance (untested code, may not run; I'm on my phone at the moment):
import numpy as np
import tensorflow as tf

def custom(a):
    b = a + 1
    return b

original = np.array([2, 2, 2])
mapped = tf.map_fn(custom, original)
# mapped == [3, 3, 3] ... hopefully
The TensorFlow examples all use lambda functions, so you might need to define your function like that if the above doesn't work. TensorFlow example:
elems = np.array([1, 2, 3, 4, 5, 6])
squares = tf.map_fn(lambda x: x * x, elems)
# squares == [1, 4, 9, 16, 25, 36]
Edit:
As an aside, map functions are much easier to parallelize than for loops, since each element of an object is assumed to be processed independently of the others, so you can see a performance uplift by using them.
Edit 2:
For the "reduce sum, but not on this index" part, I would heavily recommend you start looking back at matrix operations... As mentioned, map functions work element-wise; they are not aware of other elements. A reduce function is what you want, but even those are finicky when you try to do "not this index" sums... Also, TensorFlow is built around matrix ops, not the MapReduce paradigm.
Something along these lines might help:
import numpy as np
import tensorflow as tf

sess = tf.Session()

var = np.ones([3, 3, 3]) * 5
zero_identity = tf.linalg.set_diag(
    var, tf.zeros(var.shape[0:-1], dtype=tf.float64))

exp_one = tf.exp(var)
exp_two = tf.exp(zero_identity)
summed = tf.reduce_sum(exp_two, axis=[0, 1])
final = exp_one / summed

print("input matrix: \n", var, "\n")
print("diagonal of the matrix set to zero: \n", zero_identity.eval(session=sess), "\n")
print("exponential values (numerator): \n", exp_one.eval(session=sess), "\n")
print("exponential values to sum: \n", exp_two.eval(session=sess), "\n")
print("summed values for the zero-diagonal matrix\n ... along axis [0,1]: \n", summed.eval(session=sess), "\n")
print("output:\n", final.eval(session=sess), "\n")
To simplify the problem: when a dimension (or feature) has already been updated n times, the next time I see the feature I want to set the learning rate to 1/n.
I came up with this code:
import numpy as np
import theano
import theano.tensor as T

def test_adagrad():
    embedding = theano.shared(value=np.random.randn(20, 10), borrow=True)
    times = theano.shared(value=np.ones((20, 1)))
    lr = T.dscalar()
    index_a = T.lvector()
    hist = times[index_a]
    cost = T.sum(theano.sparse_grad(embedding[index_a]))
    gradients = T.grad(cost, embedding)
    updates = [(embedding, embedding + lr * (1.0 / hist) * gradients)]
    ### Here should be some code to also update `times`, omitted ###
    train = theano.function(inputs=[index_a, lr], outputs=cost, updates=updates)
    for i in range(10):
        print(train([1, 2, 3], 0.05))
Theano does not give any error, but the training result sometimes gives NaN. Does anybody know how to correct this?
Thank you for your help.
PS: I suspect it is the operations in sparse space which create problems. So I tried to replace * with theano.sparse.mul. This gave the same results as I mentioned before.
Perhaps you can utilize the following example implementation of adadelta and use it to derive your own. Please update if you succeed :-)
I was looking for the same thing and ended up implementing it myself in the style of the resource zuuz already pointed out. So maybe this helps anyone looking for help here.
import numpy as np
import theano
import theano.tensor as T

def adagrad(lr, tparams, grads, inp, cost):
    # stores the current grads
    gshared = [theano.shared(np.zeros_like(p.get_value(),
                                           dtype=theano.config.floatX),
                             name='%s_grad' % k)
               for k, p in tparams.iteritems()]  # Python 2; use .items() on Python 3
    grads_updates = list(zip(gshared, grads))
    # stores the sum of all grads squared
    hist_gshared = [theano.shared(np.zeros_like(p.get_value(),
                                                dtype=theano.config.floatX),
                                  name='%s_hist' % k)
                    for k, p in tparams.iteritems()]
    rgrads_updates = [(rg, rg + T.sqr(g)) for rg, g in zip(hist_gshared, grads)]

    # calculate the cost and store the grads
    f_grad_shared = theano.function(inp, cost,
                                    updates=grads_updates + rgrads_updates,
                                    on_unused_input='ignore')

    # apply the actual update with the initial learning rate lr
    n = 1e-6
    updates = [(p, p - (lr / (T.sqrt(rg) + n)) * g)
               for p, g, rg in zip(tparams.values(), gshared, hist_gshared)]
    f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore')

    return f_grad_shared, f_update
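Usage would look roughly like this (inp, cost, tparams, and batches are placeholders for your own model; tparams is an OrderedDict of shared parameters):

lr = T.scalar(name='lr')
grads = T.grad(cost, wrt=list(tparams.values()))
f_grad_shared, f_update = adagrad(lr, tparams, grads, inp, cost)

for x_batch, y_batch in batches:
    batch_cost = f_grad_shared(x_batch, y_batch)  # forward/backward; accumulate squared grads
    f_update(0.01)                                # apply the AdaGrad-scaled update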
I find this implementation from Lasagne very concise and readable. You can use it pretty much as it is:
for param, grad in zip(params, grads):
    value = param.get_value(borrow=True)
    accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                         broadcastable=param.broadcastable)
    accu_new = accu + grad ** 2
    updates[accu] = accu_new
    updates[param] = param - (learning_rate * grad /
                              T.sqrt(accu_new + epsilon))
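The snippet assumes params, grads, updates, learning_rate, and epsilon already exist; a minimal hypothetical setup could be:

from collections import OrderedDict
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
params = [theano.shared(np.random.randn(5, 3).astype(theano.config.floatX))]
cost = T.sum(T.dot(x, params[0]) ** 2)
grads = T.grad(cost, params)

learning_rate = 0.01
epsilon = 1e-6
updates = OrderedDict()
# ... the loop above fills `updates` ...
train = theano.function([x], cost, updates=updates)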