I am trying to train a model using TensorFlow. There is a bottleneck in my code that makes my TensorFlow optimization crash. I found that it is most likely related to a part of my code that contains a loop.
here is a minimum working example:
import numpy as np
import tensorflow as tf
import scipy.optimize
kon = 0.01
mu = 1.5
fi = 0.5
kappa = 22
w = (1-theta)
n =100
xs = tf.random.normal(shape=(n,), stddev=0.2)
eps = tf.random.normal(shape=(n,), stddev=0.17)
z = tf.sigmoid(tf.random.normal(shape=(n,), stddev=0.22))
def my_function(z, eps, x0):
    """Solve ``F(h) = 0`` once per ``(z, eps, x0)`` triple and collect the roots.

    ``eps`` and ``x0`` are exponentiated element by element before use; the
    constants ``mu``, ``fi``, ``kappa``, ``w`` and ``kon`` are read from module
    scope. Every solve starts from the initial guess ``0.5``.

    Returns a 1-D NumPy array with one root per input triple.
    """
    def F(hi):
        # ze, ei and xs are the current loop values (closure over the loop below).
        return (
            (mu / fi) * np.log(hi)
            - (1 - mu) * kappa * (hi) ** (1 + (1 / fi))
            - mu * (np.log(w * ei * xs) - np.log(kon))
            - np.log(ze)
        )

    # Collect in a list and convert once: np.append inside the loop copies the
    # whole array on every iteration.
    roots = []
    # leisure today
    for ze, ei, xs in zip(z, eps, x0):
        ei = np.exp(ei)
        xs = np.exp(xs)
        roots.append(scipy.optimize.newton_krylov(F, 0.5))
    return np.asarray(roots, dtype=float).reshape(-1)
If I decorate my function with @tf.function, I get this error:
@tf.function
def my_function(z, eps, x0):
    """Graph-decorated variant: solve ``F(h) = 0`` per ``(z, eps, x0)`` triple.

    NOTE: the body is NumPy/SciPy code and iterates over its tensor arguments
    with ``zip``, so calling it through ``tf.function`` raises
    ``OperatorNotAllowedInGraphError``.
    """
    def F(hi):
        # ze, ei and xs are the current loop values (closure over the loop below).
        return (
            (mu / fi) * np.log(hi)
            - (1 - mu) * kappa * (hi) ** (1 + (1 / fi))
            - mu * (np.log(w * ei * xs) - np.log(kon))
            - np.log(ze)
        )

    # Collect in a list and convert once instead of np.append per iteration.
    roots = []
    # leisure today
    for ze, ei, xs in zip(z, eps, x0):
        ei = np.exp(ei)
        xs = np.exp(xs)
        roots.append(scipy.optimize.newton_krylov(F, 0.5))
    return np.asarray(roots, dtype=float).reshape(-1)
htest=my_function(z,eps,x0)
ERROR MESSAGE
OperatorNotAllowedInGraphError: in converted code:
.....
OperatorNotAllowedInGraphError: iterating over `tf.Tensor` is not allowed: AutoGraph did not convert this function. Try decorating it directly with @tf.function.
I tried to follow this route:
@tf.function(input_signature=[tf.TensorSpec(None, tf.float32)])
def tf_function(input):
    """Run ``my_function`` as a NumPy op inside a TensorFlow graph.

    The input signature declares exactly one float32 tensor, so the wrapper
    must be called with a single argument.
    """
    y = tf.numpy_function(my_function, [input], tf.float32)
    return y
but the error message that I get once I call the tf_function is:
htestTF= tf_function(z,eps,x0)
Error
TypeError: When input_signature is provided, only pass arguments covered by it. Received 3 argument(s).
Can someone with experience help me debug this?
I believe it's the zip function, which autograph doesn't currently transform: you need to use for i in range(len(z)) and then ze = z[i].
That said, your function uses only NumPy so it won't work as expected: tf.function requires TensorFlow code.
To train in TensorFlow, you would need to change all the np.* calls to their equivalent tf.*, replace hvec with hvec = tf.TensorArray(...), and replace scipy.optimize.newton_krylov with a TF-based implementation.
Related
I want to define a custom LearningRateSchedule, but AutoGraph seems to have trouble converting it. The following code works fine without @tf.function, but it raises an error when used with @tf.function.
def linear_interpolation(l, r, alpha):
    """Return the point a fraction ``alpha`` of the way from ``l`` to ``r``.

    ``alpha == 0`` yields ``l`` and ``alpha == 1`` yields ``r``.
    """
    return l + alpha * (r - l)
class TFPiecewiseSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Piecewise-linear learning-rate schedule built from `(time, value)` endpoints."""

    # This class currently cannot be used inside @tf.function: `__call__`
    # branches on `step` with plain Python `if`/`for`, which is not allowed
    # when `step` is a graph tf.Tensor.

    def __init__(self, endpoints, end_learning_rate=None, name=None):
        """Piecewise schedule.

        endpoints: [(int, float)]
            list of pairs `(time, value)` meaning that the schedule should output
            `value` when `t==time`. All the values for time must be sorted in
            an increasing order. When t is between two times, e.g. `(time_a, value_a)`
            and `(time_b, value_b)`, such that `time_a <= t < time_b` then the
            output is `linear_interpolation(value_a, value_b, alpha)` where alpha
            is the fraction of time passed between `time_a` and `time_b` at `t`.
        end_learning_rate: float
            value returned when the step falls in none of the intervals
            specified in `endpoints` (at or after the last time). If None,
            the value of the last endpoint is used.
        name: str
            optional name, stored on the instance and reported by `get_config`.
        """
        super().__init__()
        idxes = [e[0] for e in endpoints]
        assert idxes == sorted(idxes)
        # Compare against None explicitly so that an explicit 0.0 is honoured.
        if end_learning_rate is None:
            end_learning_rate = endpoints[-1][1]
        self.end_learning_rate = end_learning_rate
        self.endpoints = endpoints
        self.name = name

    def __call__(self, step):
        """Return the learning rate for `step`.

        Before the first endpoint time the first endpoint value is returned;
        inside an interval the value is linearly interpolated; otherwise
        `end_learning_rate` is returned.
        """
        if step < self.endpoints[0][0]:
            return self.endpoints[0][1]
        else:
            for (l_t, l), (r_t, r) in zip(self.endpoints[:-1], self.endpoints[1:]):
                if l_t <= step < r_t:
                    alpha = float(step - l_t) / (r_t - l_t)
                    return linear_interpolation(l, r, alpha)
        # step does not belong to any of the pieces, so fall back.
        assert self.end_learning_rate is not None
        return self.end_learning_rate

    def get_config(self):
        """Return the constructor arguments as a dict."""
        return dict(
            endpoints=self.endpoints,
            end_learning_rate=self.end_learning_rate,
            # __init__ stores the name as `self.name`; `self._name` is never set.
            name=self.name,
        )
lr = TFPiecewiseSchedule([[10, 1e-3], [20, 1e-4]])
@tf.function
def f(x):
    """Run one Adam step on a fresh Dense(10) layer, minimising mean(y**2)."""
    l = layers.Dense(10)
    with tf.GradientTape() as tape:
        y = l(x)
        loss = tf.reduce_mean(y**2)
    grads = tape.gradient(loss, l.trainable_variables)
    opt = tf.keras.optimizers.Adam(lr)
    opt.apply_gradients(zip(grads, l.trainable_variables))
f(tf.random.normal((2, 3)))
The error message says:
:10 f *
opt.apply_gradients(zip(grads, l.trainable_variables))
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:437 apply_gradients
apply_state = self._prepare(var_list)
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:614 _prepare
self._prepare_local(var_device, var_dtype, apply_state)
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/adam.py:154 _prepare_local
super(Adam, self)._prepare_local(var_device, var_dtype, apply_state)
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:620 _prepare_local
lr_t = array_ops.identity(self._decayed_lr(var_dtype))
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:672 _decayed_lr
lr_t = math_ops.cast(lr_t(local_step), var_dtype)
:32 call
if step < self.endpoints[0][0]:
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:765 bool
self._disallow_bool_casting()
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:531 _disallow_bool_casting
"using a tf.Tensor as a Python bool")
/Users/aptx4869/anaconda3/envs/drl/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:518 _disallow_when_autograph_enabled
" decorating it directly with #tf.function.".format(task))
OperatorNotAllowedInGraphError: using a tf.Tensor as a Python bool is not allowed: AutoGraph did not convert this function. Try decorating it directly with @tf.function.
I think the error arises because of the if statement, so I replace the content of the __call__ function with the following code. But almost the same error arises.
def compute_lr(step):
for (l_t, l), (r_t, r) in zip(self.endpoints[:-1], self.endpoints[1:]):
if l_t <= step < r_t:
alpha = float(step - l_t) / (r_t - l_t)
return linear_interpolation(l, r, alpha)
# t does not belong to any of the pieces, so doom.
assert self.end_learning_rate is not None
return self.end_learning_rate
return tf.cond(tf.less(step, self.endpoints[0][0]), lambda: self.endpoints[0][1], lambda: compute_lr(step))
What should I do to make the code work as I wish?
The error message is garbled by the markdown formatter, but it seems that the __call__ function itself was not processed by AutoGraph. In the error message, converted functions are marked with an asterisk. This is a bug in the Adam optimizer. Anyway, if you annotate it directly with @tf.function, it will be picked up:
@tf.function
def __call__(self, step):
    ...
That said, there are a few things in the code that AutoGraph doesn't like: zip, returning from a loop, chained inequalities - it's safer to use basic constructs when possible. Sadly, the errors you get are still quite confusing. Rewriting it like this should work:
@tf.function
def __call__(self, step):
    """Return the learning rate for `step`, using AutoGraph-friendly constructs.

    Avoids `zip`, returning from inside a loop and chained comparisons.
    """
    if step < self.endpoints[0][0]:
        return self.endpoints[0][1]
    else:
        # Can't return from a loop
        lr = self.end_learning_rate
        # Since it needs to break based on the value of a tensor, loop
        # needs to be a tf.while_loop
        for pair in tf.stack([self.endpoints[:-1], self.endpoints[1:]], axis=1):
            left, right = tf.unstack(pair)
            l_t, l = tf.unstack(left)
            r_t, r = tf.unstack(right)
            # Chained inequalities not supported yet
            if l_t <= step and step < r_t:
                alpha = float(step - l_t) / (r_t - l_t)
                lr = linear_interpolation(l, r, alpha)
                break
        return lr
There is one last issue - tf.function doesn't like it when things create variables, so you need to move the creation of the layer and the optimizer outside:
lr = TFPiecewiseSchedule([[10, 1e-3], [20, 1e-4]])
l = layers.Dense(10)
opt = tf.keras.optimizers.Adam(lr)
@tf.function
def f(x):
    ...
I hope this helps!
My question is if there was an issue in changing def step(self,x) function since the original was faulty.
I attempted to change `def step(self, x)` to use `x.any()`. It resulted in a prediction error where all predictions were 1. I attempted to implement an OR perceptron neural network from a book by following the code given. However, I received the error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
This is the code:
from nn import Perceptron
import numpy as np
X = np.array([[0,0],[0,1],[1,0],[1,1]])
print(X[1])
y = np.array([0],[1],[1],[0])
print("[INFO] training perceptron...")
p = Perceptron(X.shape[1],alpha = 0.1)
p.fit(X,y,epochs=20)
print("[INFO] testing perceptron...")
for (x,target) in zip(X,y):
pred=p.predict(X)
print("[INFO] data={}, ground-truth={}, pred={}". format(x, target[0], pred))
The package that I imported was:
import numpy as np
class Perceptron:
    """Single-layer perceptron with a step activation, trained one sample at a time."""

    def __init__(self, N, alpha = 0.1):
        # N input weights plus one bias weight, scaled by 1/sqrt(N).
        self.W = np.random.randn(N+1)/np.sqrt(N)
        self.alpha = alpha

    def step(self,x):
        """Return 1 if ``x`` is positive, else 0.

        ``x`` must be a single value: an array with more than one element
        makes the ``if`` raise "truth value of an array ... is ambiguous".
        """
        if x>0:
            return 1
        else:
            return 0

    def fit(self, X, y, epochs = 10):
        """Train for ``epochs`` passes, updating weights on each misclassified row."""
        # Append a column of ones so the bias is learned as an ordinary weight.
        X = np.c_[X,np.ones((X.shape[0]))]
        for epoch in np.arange(0, epochs):
            for (x,target) in zip(X,y):
                p = self.step(np.dot(x, self.W))
                if p!= target:
                    error = p-target
                    self.W += -self.alpha * error * x

    def predict(self,X,addBias=True):
        """Return the step activation for ``X`` (at most one row, see ``step``)."""
        X = np.atleast_2d(X)
        if addBias:
            X=np.c_[X, np.ones((X.shape[0]))]
        return self.step(np.dot(X,self.W))
My apologies if its a silly question as I spent the whole day thinking about it to no avail.
Thanks in advance!
The error that you are facing is because step() is coded to evaluate 1 element of the array at a time but when you pass an array to it in the predict function it has to do something like this:
[0.266,1.272,-1.282,0.889] > 0
The interpreter doesn't know which value to evaluate since it's an array and hence gives the error. Using any or all would check for 'any' or 'all' value in the array and give you 0 or 1 correspondingly, which is why you get an array of 1s when you write x.any().
Another thing that bothered me about the code you imported was that the forward pass is done in a loop, which is not very efficient or pythonic. A vectorized implementation is way better. I have changed the step function and fit function in that imported code to be vectorized and it runs fine for me.
import numpy as np
class Perceptron:
    """Single-layer perceptron with a step activation, trained in full-batch mode."""

    def __init__(self, N, alpha = 0.1):
        # N input weights plus one bias weight, scaled by 1/sqrt(N).
        self.W = np.random.randn(N+1)/np.sqrt(N)
        self.alpha = alpha

    def step(self,x):
        """Elementwise step: 1.0 where ``x > 0``, else 0.0."""
        return 1. * (x > 0)

    def fit(self, X, y, epochs = 10):
        """Train for at most ``epochs`` full-batch updates.

        ``y`` may be flat ``(n,)`` or column-shaped ``(n, 1)``; it is flattened
        so that ``p - y`` stays one error per sample instead of broadcasting
        to an ``(n, n)`` matrix.
        """
        # Append a column of ones so the bias is learned as an ordinary weight.
        X = np.c_[X,np.ones((X.shape[0]))]
        y = np.asarray(y).ravel()
        for epoch in np.arange(0, epochs):
            Z = np.dot(X, self.W)
            p = self.step(Z)
            if not np.any(p != y):
                # Every sample is classified correctly; later epochs would
                # leave the weights unchanged.
                break
            error = (p-y)
            self.W += -self.alpha * np.dot(X.T,error)

    def predict(self,X,addBias=True):
        """Return the step activation for every row of ``X``."""
        X = np.atleast_2d(X)
        if addBias:
            X=np.c_[X, np.ones((X.shape[0]))]
        return self.step(np.dot(X,self.W))
Now the step function is returning a binary array where the value is 1 when the input is greater than 0 else 0. For example if you had an array say:
X= [0.266,1.272,-1.282,0.889]
would be converted to:
[1,1,0,1]
I also changed the fit function so that it does everything vectorized.
One other thing that I did to my code was this :
Instead of
y = np.array([0],[1],[1],[0])
I did
y = np.array([0,1,1,0])
to get it working. I hope this helps. Be sure to ask anything if you don't understand.
I want to create heaviside step function in TensorFlow. Since Heaviside function is not differentiable I also need to choose derivative approximation and define custom gradient so full implementation looks like this:
import tensorflow as tf
@tf.RegisterGradient("HeavisideGrad")
def _heaviside_grad(unused_op: tf.Operation, grad: tf.Tensor):
    """Surrogate gradient registered as "HeavisideGrad"."""
    x = unused_op.inputs[0]
    # During backpropagation heaviside behaves like sigmoid:
    # d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)); evaluate sigmoid once.
    s = tf.sigmoid(x)
    return s * (1 - s) * grad
def heaviside(x: tf.Tensor, g: tf.Graph = None):
    """Step function built from ``tf.sign`` with the "HeavisideGrad" gradient.

    ``g`` is the graph whose gradient map is overridden. When omitted, the
    default graph is looked up at call time (a default argument of
    ``tf.get_default_graph()`` would be evaluated only once, at definition).
    """
    if g is None:
        g = tf.get_default_graph()
    custom_grads = {
        "Sign": "HeavisideGrad"
    }
    with g.gradient_override_map(custom_grads):
        # TODO: heaviside(0) currently returns 0. We need heaviside(0) = 1
        sign = tf.sign(x)
        # tf.stop_gradient is needed to exclude tf.maximum from derivative
        step_func = sign + tf.stop_gradient(tf.maximum(0.0, sign) - sign)
    return step_func
There is one caveat in my implementation: tf.sign(0) returns zero value so heaviside(0) also returns zero and I want heaviside(0) to return 1. How can I achieve such behavior?
A very hacky way would be to use
1 - max(0.0, sign(-x))
as your step function instead of
max(0.0, sign(x))
Another option would be to use greater_equal and cast the result to your desired type, and override its gradient with the sigmoid override you already have.
Ok, I think I figured it out. Many thanks to etarion who pointed out the correct approach to solve my issue.
So the basic idea is to use tf.greater_equal instead of combination of tf.sign and maximum. The custom gradient is applied to tf.identity operation.
Here is updated implementation of heaviside function:
import tensorflow as tf
@tf.RegisterGradient("HeavisideGrad")
def _heaviside_grad(unused_op: tf.Operation, grad: tf.Tensor):
    """Surrogate gradient: ``max(0, 1 - |x|)`` times the upstream gradient."""
    return tf.maximum(0.0, 1.0 - tf.abs(unused_op.inputs[0])) * grad
def heaviside(x: tf.Tensor, g: tf.Graph = None):
    """Step function with ``heaviside(0) == 1`` and the "HeavisideGrad" gradient.

    The forward value comes from ``tf.greater_equal``; the custom gradient is
    attached to a ``tf.identity`` op. When ``g`` is omitted, the default graph
    is looked up at call time rather than once at definition time.
    """
    import uuid  # local import: only used to build unique op names below

    if g is None:
        g = tf.get_default_graph()
    custom_grads = {
        "Identity": "HeavisideGrad"
    }
    with g.gradient_override_map(custom_grads):
        i = tf.identity(x, name="identity_" + str(uuid.uuid1()))
        ge = tf.greater_equal(x, 0, name="ge_" + str(uuid.uuid1()))
        # tf.stop_gradient is needed to exclude the float cast from derivative
        step_func = i + tf.stop_gradient(tf.cast(ge, tf.float32) - i)
    return step_func
This would make the unit step function, using only TensorFlow APIs so the result is still a tensor:
#in Eager mode
def heaviside(v):
    """Unit step: 1 where ``v >= 0``, 0 where ``v < 0``, as a float32 tensor.

    Uses elementwise ops only (no ``.numpy()``), so it accepts tensors of any
    shape as well as scalars.
    """
    v = tf.cast(v, tf.float32)
    # -sign(v) is 1 only for negative v, so 1 - max(0, -sign(v)) maps 0 to 1.
    return 1.0 - tf.maximum(0.0, -tf.sign(v))
In TensorFlow 2, it is better to use the @tf.custom_gradient decorator:
@tf.custom_gradient
def heaviside(X):
    """Heaviside step with a pass-through gradient.

    Returns a float32 tensor with the shape of ``X``: 0 where ``X < 0`` and
    1 elsewhere. The comparison is vectorised, so no batch-size constant or
    per-element ``tf.cond`` is needed.
    """
    U = tf.cast(tf.greater_equal(X, 0), tf.float32)

    # Div is the upstream (intermediate) gradient value
    def grad(Div):
        # Heaviside has no gradient, use 1.
        return Div * 1

    return U, grad
The easiest fix for your code is to add a small number to the result of tf.sign() and take the sign again. This yields 1 for an input of 0:
sign = tf.sign ( tf.sign( x ) + 0.1 )
I'm trying to implement the hinge loss function in Python and have run into some confusion.
Some sources that I have read (for example, "Regression Analysis in Python" by Luca Massaron) state that hinge is sometimes called the softmax function.
But to me this is rather strange, because hinge is: $\ell(x) = \max(0,\ 1 - x)$
and softmax is just an exponential function like: $\sigma(x)_i = \dfrac{e^{x_i}}{\sum_j e^{x_j}}$
I made that function in Python (for Softmax) this way:
def softmax(x):
    """Return the softmax of ``x`` normalised along axis 0.

    The global maximum is subtracted before exponentiating so large inputs do
    not overflow; the shift cancels out in the ratio.
    """
    e_x = np.exp(x - np.max(x))
    return e_x/e_x.sum(axis=0)
Have two questions:
Can I use that softmax function like an equivalent to hinge function?
If not, how can hinge be implemented in Python?
Thanks.
Can I use that softmax function like an equivalent to hinge function?
no - they are not equivalent.
A hinge function is a loss function and does not provide well-calibrated probabilities, whereas softmax is a mapping function (one that maps a set of scores to a distribution that sums to one).
If not, how can hinge be implemented in Python?
this following snippet captures the essence of hinge loss functions:
import numpy as np
import matplotlib.pyplot as plt
xmin, xmax = -1, 2
xx = np.linspace(xmin, xmax, 100)
plt.plot(xx, np.where(xx < 1, 1 - xx, 0), label="Hinge loss")
you can also implement softmax functions in pure python :)
import numpy as np
import math as math
def sofyMax(data):
    """Column-wise softmax of a 2-D sequence, in pure Python.

    math:: $rezult(powe,sumColumn) = \\dfrac{powe(data)}{sumColumn(powe(data))}$

    Every element is exponentiated and divided by the sum of its column, so
    each column of the returned list of lists sums to one. Empty input yields
    an empty list.
    """
    def powe(data):
        # Elementwise exponential, keeping the row/column layout.
        return [[math.exp(value) for value in row] for row in data]

    def sumColumn(data):
        # zip(*rows) transposes, giving one tuple per column.
        return [sum(column) for column in zip(*data)]

    def rezult(data, sumcolumn):
        # Divide each entry by the total of the column it belongs to.
        return [[c / s for c, s in zip(row, sumcolumn)] for row in data]

    et1 = powe(data)
    et2 = sumColumn(et1)
    return rezult(et1, et2)
data = np.random.randn(10,5)
(np.exp(data)/np.sum(np.exp(data),axis=0)) == (np.array(sofyMax(data)))
To simplify the problem, say when a dimension (or a feature) is already updated n times, the next time I see the feature, I want to set the learning rate to be 1/n.
I came up with these codes:
def test_adagrad():
    """Toy update whose per-row step size is ``lr / times`` for the indexed rows."""
    embedding = theano.shared(value=np.random.randn(20,10), borrow=True)
    times = theano.shared(value=np.ones((20,1)))
    lr = T.dscalar()
    index_a = T.lvector()
    hist = times[index_a]
    cost = T.sum(theano.sparse_grad(embedding[index_a]))
    gradients = T.grad(cost, embedding)
    # NOTE(review): `hist` has one row per entry of index_a while `gradients`
    # is taken w.r.t. the whole embedding matrix - confirm that these two
    # shapes are meant to be combined elementwise.
    updates = [(embedding, embedding+lr*(1.0/hist)*gradients)]
    ### Here should be some codes to update also times which are omitted ###
    train = theano.function(inputs=[index_a, lr],outputs=cost,updates=updates)
    for i in range(10):
        # print() with a single argument behaves the same on Python 2 and 3.
        print(train([1,2,3],0.05))
Theano does not give any error, but the training result is sometimes NaN. Does anybody know how to correct this, please?
Thank you for your help
PS: I suspect that the operations in sparse space are what create the problem, so I tried replacing * with theano.sparse.mul. This gave the same results as I mentioned before.
Perhaps you can utilize the following example for implementation of adadelta, and use it to derive your own. Please update if you succeeded :-)
I was looking for the same thing and ended up implementing it myself in the style of the resource zuuz already pointed out. So maybe this helps anyone looking for help here.
def adagrad(lr, tparams, grads, inp, cost):
    """Build the two compiled Theano functions of an AdaGrad step.

    lr: symbolic learning rate, the only input of ``f_update``.
    tparams: ordered mapping from parameter name to shared variable.
    grads: gradient expressions, one per entry of ``tparams``.
    inp: inputs of the cost function.
    cost: cost expression.

    Returns ``(f_grad_shared, f_update)``: the first computes the cost while
    storing the gradients and adding their squares to the history, the second
    applies the scaled update to the parameters.
    """
    # stores the current grads
    gshared = [theano.shared(np.zeros_like(p.get_value(),
                                           dtype=theano.config.floatX),
                             name='%s_grad' % k)
               for k, p in tparams.items()]
    # list(): on Python 3 zip() is an iterator and cannot be added to a list.
    grads_updates = list(zip(gshared, grads))
    # stores the sum of all grads squared (named apart from the plain grads)
    hist_gshared = [theano.shared(np.zeros_like(p.get_value(),
                                                dtype=theano.config.floatX),
                                  name='%s_rgrad' % k)
                    for k, p in tparams.items()]
    rgrads_updates = [(rg, rg + T.sqr(g)) for rg, g in zip(hist_gshared, grads)]
    # calculate cost and store grads
    f_grad_shared = theano.function(inp, cost,
                                    updates=grads_updates + rgrads_updates,
                                    on_unused_input='ignore')
    # apply actual update with the initial learning rate lr
    n = 1e-6  # added to the denominator so it is never zero
    updates = [(p, p - (lr/(T.sqrt(rg) + n))*g)
               for p, g, rg in zip(tparams.values(), gshared, hist_gshared)]
    f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore')
    return f_grad_shared, f_update
I find this implementation from Lasagne very concise and readable. You can use it pretty much as it is:
for param, grad in zip(params, grads):
    value = param.get_value(borrow=True)
    # Per-parameter accumulator of squared gradients, starting at zero.
    accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                         broadcastable=param.broadcastable)
    accu_new = accu + grad ** 2
    updates[accu] = accu_new
    # Scale the step by the root of the accumulated squared gradients.
    updates[param] = param - (learning_rate * grad /
                              T.sqrt(accu_new + epsilon))