I am trying to use the tf.nn.weighted_cross_entropy_with_logits API, but I cannot get the right result when the weight is not 1.0 (1.0 means no weighting).
import tensorflow as tf
import numpy as np
def my_binary_crossentropy_np(labels, output, weight=10.0):
"""
Weighted binary crossentropy between an output tensor
and a target tensor.
"""
# transform back to logits
epsilon = 1e-08
np.clip(output, epsilon, 1.0 - epsilon, out=output)
output = np.log(output / (1.0 - output))
# https://www.tensorflow.org/api_docs/python/tf/nn/weighted_cross_entropy_with_logits
# l = 1 + (q - 1) * z
# (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))
l = 1.0 + (weight - 1.0) * labels
loss1 = np.multiply(1.0 - labels, output)
loss2 = np.multiply(l, np.log(1.0 + np.exp(-abs(output))))
loss3 = np.maximum(-output, 0)
loss = loss1 + loss2 + loss3
return np.mean(loss)
def my_binary_crossentropy_tf(labels, output, weight=1.0):
"""
Weighted binary crossentropy between an output tensor
and a target tensor.
"""
epsilon = 1e-08
output = tf.clip_by_value(output, epsilon, 1.0 - epsilon)
output = tf.log(output / (1.0 - output))
# compute weighted loss
#loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=output)
loss = tf.nn.weighted_cross_entropy_with_logits(targets=labels, logits=output, pos_weight=weight)
return tf.reduce_mean(loss)
# generate random test data and random label
predict = np.random.rand(10, 8)
label = np.random.rand(10, 8)
label[label >= 0.5] = 1
label[label < 0.5] = 0
loss1 = my_binary_crossentropy_np(label, predict, 1.0)
print('loss1 = ', loss1)
loss1 = my_binary_crossentropy_np(label, predict, 10.0)
print('loss1 = ', loss1)
predict_tf = tf.convert_to_tensor(predict)
loss2 = my_binary_crossentropy_tf(label, predict_tf, 1.0)
loss2 = tf.Session().run(loss2)
print('loss2 = ', loss2)
loss2 = my_binary_crossentropy_tf(label, predict_tf, 10.0)
loss2 = tf.Session().run(loss2)
print('loss2 = ', loss2)
Running result:
loss1 = 1.02193164517
loss1 = 1.96332399324
loss2 = 1.02193164517
loss2 = 4.80529539791
The implementation of my_binary_crossentropy_np is wrong: according to the documented formula, max(-x, 0) must also be multiplied by l rather than added as a separate term.
Here is the right one:
l = (weight - 1.0) * labels + 1.0
loss1 = np.multiply(1.0 - labels, output)
loss2 = np.multiply(l, np.log(1.0 + np.exp(-abs(output))) + np.maximum(-output, 0))
loss = loss1 + loss2
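For completeness, here is the full corrected NumPy function with that fix applied (my own sketch; same clipping and epsilon as above):
def my_binary_crossentropy_np_fixed(labels, output, weight=10.0):
    """Weighted binary crossentropy matching tf.nn.weighted_cross_entropy_with_logits."""
    epsilon = 1e-08
    output = np.clip(output, epsilon, 1.0 - epsilon)
    # transform probabilities back to logits
    output = np.log(output / (1.0 - output))
    # l = 1 + (q - 1) * z
    l = 1.0 + (weight - 1.0) * labels
    # (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))
    loss = np.multiply(1.0 - labels, output) + \
           np.multiply(l, np.log(1.0 + np.exp(-np.abs(output))) + np.maximum(-output, 0))
    return np.mean(loss)
With this change, the NumPy result for weight=10.0 matches the tf.nn.weighted_cross_entropy_with_logits output (the 4.805... value above).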
I am training a U-Net model to segment particular areas (3 classes + background) in CT scans.
To evaluate the model performance I use a custom accuracy function:
(1 - alpha) * dice + alpha * hausdorff, 0 < alpha < 1
During training, the hausdorff part increases too fast and reaches 98% after several epochs, compared to the dice (30%-40%).
At the end of training:
total accuracy: ~73% | dice: ~50% | hausdorff: ~98%
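(With alpha around 0.5, for example, (1 - 0.5) * 0.5 + 0.5 * 0.98 ≈ 0.74, which is consistent with the reported total of ~73%; most of the total comes from the hausdorff term sitting near 1.)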
Additional information about the u-net model:
#epochs - 1 (training time - 9 hours).
optimizer - Adam.
use data augmentation.
use dropout in the downsample & upsample path.
The evaluation function:
Dice:
import segmentation_models as sm
def dice(y_true, y_pred):
return 1. - sm.losses.dice_loss(gt=y_true, pr=y_pred)
hausdorff distance:
import numpy as np
from monai.metrics.utils import get_mask_edges, get_surface_distance
def hausdorff(y_true, y_pred):
h, w = y_true.shape[1], y_true.shape[2]
max_dist = np.sqrt(w ** 2 + h ** 2)
(edges_pred, edges_gt) = get_mask_edges(y_pred, y_true)
surface_distance = get_surface_distance(edges_pred, edges_gt,
distance_metric="euclidean")
if surface_distance.shape == (0,):
return 1.0
dist = np.max(surface_distance)
if dist > max_dist:
return 0.0
return 1. - dist / max_dist
Custom accuracy:
import keras.backend as K
def accuracy(y_true, y_pred):
y_true = tf.cast(y_true, dtype=tf.float32)
y_pred = tf.cast(y_pred, dtype=tf.float32)
y_true_2d, y_pred_2d = tf.argmax(y_true, axis=-1), tf.argmax(y_pred, axis=-1)
if np.all(y_true_2d.numpy() == y_pred_2d.numpy()):
return 0 * K.sum(y_pred) + 1.
d = dice(y_true, y_pred)
h = hausdorff(y_true, y_pred)
acc = (1. - alpha) * d + alpha * h
return acc
Is the hausdorff distance implementation correct in this case? Any additional suggestions on how to solve this?
Thanks.
I'm trying to implement linear regression using the RMSProp optimizer from scratch.
Code:
EPOCHS = 100
w3 = tf.Variable(w_vector, dtype = tf.float32)
w4 = tf.Variable(0, dtype = tf.float32)
lr = 1e-5
beta = 0.9
epilson = 1e-7
momentum = 0.0
for epoch in range(1,EPOCHS+1):
mom_w = 0
mom_b = 0
mean_square_w = 0
mean_gradient_w = 0
mean_square_b = 0
mean_gradient_b = 0
y_pred1 = tf.squeeze(tf.matmul(w3,x, transpose_a = True, transpose_b = True) + w4)
dw3, dw4 = gradients_mse(x, y, y_pred1)
# My equations for RMS prop
mean_square_w = beta * mean_square_w + (1-beta) * dw3 ** 2
mean_gradient_w = beta * mean_gradient_w + (1-beta) * dw3
mom_w = momentum * mom_w + lr * (dw3/(tf.sqrt(mean_square_w + epilson - mean_gradient_w ** 2)))
mean_square_b = beta * mean_square_b + (1-beta) * dw4 ** 2
mean_gradient_b = beta * mean_gradient_b + (1-beta) * dw4
mom_b = momentum * mom_b + lr * (dw4/(tf.sqrt(mean_square_b + epilson - mean_gradient_b ** 2)))
w3.assign_sub(mom_w)
w4.assign_sub(mom_b)
print('w3 : {}'.format(w3.numpy()))
print('w4 : {}'.format(w4.numpy()))
Output:
w3 : [[-1.2507935]]
w4 : 0.0033333366736769676
Now I create a single-layer, single-neuron neural network with no activation function, assign the same weights to its neuron, and use RMSprop as the optimizer. I get different final weights. However, this was not the case for the SGD optimizer.
Code:
# using keras to get same results
def create_model():
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(units = 1, name = 'd1', input_shape = (x.shape[1],)))
model.compile(optimizer=tf.keras.optimizers.RMSprop(
learning_rate=1e-5, rho=0.9, momentum=0.0, epsilon=1e-07, centered=False),
loss="mse")
return model
model = create_model()
d1 = model.get_layer('d1')
d1_weights = [tf.constant(w_vector, dtype = tf.float32), tf.constant(np.array([0]), dtype = tf.float32)]
d1.set_weights(d1_weights)
model.fit(x, y, epochs = 100)
d1 = model.get_layer('d1')
print('w3 = {}'.format(d1.weights[0].numpy()))
print('w4 = {}'.format(d1.weights[1].numpy()[0]))
Output:
w3 = [[-1.2530397]]
w4 = 0.0010913893347606063
My gradients are calculated correctly for the MSE loss function. I have cross-checked them with TensorFlow's built-in gradient computation, tf.GradientTape.
Code:
# Computing gradients
def gradients_mse(X, Y, Y_PREDS):
DW1 = tf.matmul(X, tf.reshape(Y-Y_PREDS, (X.shape[0],1)), transpose_a = True) * (-2/X.shape[0])
DW0 = (-2 / X.shape[0]) * tf.reduce_sum(Y - Y_PREDS)
return DW1, DW0
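The cross-check mentioned above was done roughly like this (a sketch, not my exact code, using the variables defined earlier):
with tf.GradientTape() as tape:
    y_pred_check = tf.squeeze(tf.matmul(w3, x, transpose_a=True, transpose_b=True) + w4)
    loss_check = tf.reduce_mean(tf.square(y - y_pred_check))
# gradients from autodiff, to compare against gradients_mse(x, y, y_pred_check)
dw3_tape, dw4_tape = tape.gradient(loss_check, [w3, w4])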
The only thing that I think can go wrong in this implementation is the calculation of mom_w and mom_b using incorrect equations.
x.shape = [10,1]
The default batch size is 32, so with only 10 samples it has no effect on the weight updates. The same code gives perfectly matching output when I use simple gradient descent instead of RMSProp.
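For reference, my understanding of the plain (non-centered) RMSProp update, which is what RMSprop(..., centered=False) should correspond to, looks roughly like this (a sketch using the variables defined above, not code taken from Keras):
# the running average of squared gradients persists across iterations (it is not reset each epoch)
mean_square_w = beta * mean_square_w + (1 - beta) * dw3 ** 2
# exact epsilon placement differs between implementations
step_w = lr * dw3 / (tf.sqrt(mean_square_w) + epilson)
w3.assign_sub(step_w)
The centered variant additionally subtracts the square of a running mean of the gradient inside the square root, which is what my mean_gradient_w term does, so that may be one source of the difference.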
I am new to TensorFlow and am trying to understand how the computation graph works. I am working on the very basic linear regression example from the TensorFlow website. I have the following piece of code:
import numpy as np
import tensorflow as tf
def manual_loss(_w, _b, _x, _y):
_loss = 0.0
n = len(_x)
for j in range(n):
_loss += (_w * _x[j] + _b - _y[j]) ** 2
return _loss
def manual_grads(_w, _b, _x, _y):
n = len(_x)
g_w = 0.0
g_b = 0
for j in range(n):
g_w += 2.0 * (_w * _x[j] + _b - _y[j]) * _x[j]
g_b += 2.0 * (_w * _x[j] + _b - _y[j])
return g_w, g_b
# Model parameters
W = tf.Variable([0.3], dtype=tf.float32)
b = tf.Variable([-0.3], dtype=tf.float32)
_W = 0.3
_b = -0.3
# Model input and output
x = tf.placeholder(tf.float32)
linear_model = W * x + b
y = tf.placeholder(tf.float32)
# loss
loss = tf.reduce_sum(tf.square(linear_model - y)) # sum of the squares
grads = tf.gradients(loss, [W, b])
# training data
x_train = [1, 2, 3, 4]
y_train = [0, -1, -2, -3]
# training loop
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
lr = 0.001
for i in range(1000):
results = sess.run([loss, W, b, grads], {x: x_train, y: y_train})
loss_value = results[0]
W_value = results[1]
b_value = results[2]
grad_W = results[3][0]
grad_b = results[3][1]
manual_loss_value = manual_loss(_w=_W, _b=_b, _x=x_train, _y=y_train)
manual_grad_W, manual_grad_b = manual_grads(_w=_W, _b=_b, _x=x_train, _y=y_train)
new_W_value = W_value - lr * grad_W
new_b_value = b_value - lr * grad_b
W = tf.assign(W, value=new_W_value)
b = tf.assign(b, value=new_b_value)
print("***********************")
print("loss={0}".format(loss_value))
print("manual_loss_value={0}".format(manual_loss_value))
print("W={0}".format(W_value))
print("b={0}".format(b_value))
print("manual_W={0}".format(_W))
print("manual_b={0}".format(_b))
print("grad_W={0}".format(grad_W))
print("grad_b={0}".format(grad_b))
print("manual_grad_W={0}".format(manual_grad_W))
print("manual_grad_b={0}".format(manual_grad_b))
print("***********************")
_W -= lr * manual_grad_W
_b -= lr * manual_grad_b
I am just trying to apply gradient descent to a simple (w*x + b - y)^2 model. I purposely do not use TensorFlow's own optimizer, because I want to understand the underlying graph update mechanism. To check that the system calculates correct gradients, I also implemented my own loss and gradient calculation functions for linear regression. Unfortunately, it seems that TensorFlow does not calculate the loss function and the gradients as expected. Here is what I get as output:
***********************
loss=23.65999984741211
manual_loss_value=23.659999999999997
W=[ 0.30000001]
b=[-0.30000001]
manual_W=0.3
manual_b=-0.3
grad_W=[ 52.]
grad_b=[ 15.59999943]
manual_grad_W=52.0
manual_grad_b=15.599999999999998
***********************
***********************
loss=23.65999984741211
manual_loss_value=20.81095744
W=[ 0.24800001]
b=[-0.31560001]
manual_W=0.248
manual_b=-0.3156
grad_W=[ 52.]
grad_b=[ 15.59999943]
manual_grad_W=48.568
manual_grad_b=14.4352
***********************
As you can see, TensorFlow calculates incorrect loss values and gradients for W and b in the second iteration, actually the same ones as in the first iteration. In some trials it starts to diverge from the expected values only in the third or fourth iteration, not always in the second one. Am I doing something wrong here? As soon as I get the values of W and b and their gradients, I update them with tf.assign() in the training loop. Does the problem lie there; is this a wrong way to update variables with TensorFlow? It is really discouraging to run into such problems right at the start.
I think there is a problem with your use of tf.assign. The tf.assign op creates assignment nodes in the graph, which must actually be run to take effect; merely constructing them does nothing. You should change to something like
assign_W_placeholder = tf.placeholder(tf.float32)
assign_b_placeholder = tf.placeholder(tf.float32)
assign_W_node = tf.assign(W, assign_W_placeholder)
assign_b_node = tf.assign(b, assign_b_placeholder)
and then in the for loop, add something like
sess.run(assign_W_node, feed_dict={assign_W_placeholder: new_W_value})
sess.run(assign_b_node, feed_dict={assign_b_placeholder: new_b_value})
After these changes, TensorFlow and the manual computation give the same results.
The complete code:
import numpy as np
import tensorflow as tf
def manual_loss(_w, _b, _x, _y):
_loss = 0.0
n = len(_x)
for j in range(n):
_loss += (_w * _x[j] + _b - _y[j]) ** 2
return _loss
def manual_grads(_w, _b, _x, _y):
n = len(_x)
g_w = 0.0
g_b = 0
for j in range(n):
g_w += 2.0 * (_w * _x[j] + _b - _y[j]) * _x[j]
g_b += 2.0 * (_w * _x[j] + _b - _y[j])
return g_w, g_b
# Model parameters
W = tf.Variable([0.3], dtype=tf.float32)
b = tf.Variable([-0.3], dtype=tf.float32)
_W = 0.3
_b = -0.3
# Model input and output
x = tf.placeholder(tf.float32)
linear_model = W * x + b
y = tf.placeholder(tf.float32)
assign_W_placeholder = tf.placeholder(tf.float32)
assign_b_placeholder = tf.placeholder(tf.float32)
assign_W_node = tf.assign(W, assign_W_placeholder)
assign_b_node = tf.assign(b, assign_b_placeholder)
# loss
loss = tf.reduce_sum(tf.square(linear_model - y)) # sum of the squares
grads = tf.gradients(loss, [W, b])
# training data
x_train = [1, 2, 3, 4]
y_train = [0, -1, -2, -3]
# training loop
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
lr = 0.001
for i in range(1000):
results = sess.run([loss, W, b, grads], {x: x_train, y: y_train})
loss_value = results[0]
W_value = results[1]
b_value = results[2]
grad_W = results[3][0]
grad_b = results[3][1]
manual_loss_value = manual_loss(_w=_W, _b=_b, _x=x_train, _y=y_train)
manual_grad_W, manual_grad_b = manual_grads(_w=_W, _b=_b, _x=x_train, _y=y_train)
new_W_value = W_value - lr * grad_W
new_b_value = b_value - lr * grad_b
sess.run([assign_W_node, assign_b_node],
feed_dict={assign_W_placeholder: new_W_value, assign_b_placeholder: new_b_value})
print("***********************")
print("loss={0}".format(loss_value))
print("manual_loss_value={0}".format(manual_loss_value))
print("W={0}".format(W_value))
print("b={0}".format(b_value))
print("manual_W={0}".format(_W))
print("manual_b={0}".format(_b))
print("grad_W={0}".format(grad_W))
print("grad_b={0}".format(grad_b))
print("manual_grad_W={0}".format(manual_grad_W))
print("manual_grad_b={0}".format(manual_grad_b))
print("***********************")
_W -= lr * manual_grad_W
_b -= lr * manual_grad_b
I think you have a numeric precision problem. NumPy uses double-precision floats by default (64 bits), while you are declaring your tensors as tf.float32. Try changing them to tf.float64.
Edit: I think the difference is due to the exponentiation in the loss function. Try changing it to a multiplication, as in:
_loss += (_w * _x[j] + _b - _y[j]) * (_w * _x[j] + _b - _y[j])
import numpy as np
import tensorflow as tf
def manual_loss(_w, _b, _x, _y):
_loss = 0.0
n = len(_x)
for j in range(n):
diff = (_w * _x[j] + _b - _y[j])
_loss += diff * diff
return _loss
def manual_grads(_w, _b, _x, _y):
n = len(_x)
g_w = 0.0
g_b = 0
for j in range(n):
g_w += 2.0 * (_w * _x[j] + _b - _y[j]) * _x[j]
g_b += 2.0 * (_w * _x[j] + _b - _y[j])
return g_w, g_b
# Model parameters
W = tf.Variable([0.3], dtype=tf.float64)
b = tf.Variable([-0.3], dtype=tf.float64)
_W = 0.3
_b = -0.3
# Model input and output
x = tf.placeholder(tf.float64)
linear_model = W * x + b
y = tf.placeholder(tf.float64)
# loss
loss = tf.reduce_sum(tf.square(linear_model - y)) # sum of the squares
grads = tf.gradients(loss, [W, b])
# training data
x_train = [1, 2, 3, 4]
y_train = [0, -1, -2, -3]
# training loop
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
lr = 0.001
for i in range(10):
with tf.device('cpu:0'):
results = sess.run([loss, grads], {x: x_train, y: y_train})
loss_value = results[0]
grad_W = results[1][0]
grad_b = results[1][1]
manual_loss_value = manual_loss(_w=_W, _b=_b, _x=x_train, _y=y_train)
manual_grad_W, manual_grad_b = manual_grads(_w=_W, _b=_b, _x=x_train, _y=y_train)
new_W_value = (W - lr * grad_W).eval(session = sess)
new_b_value = (b - lr * grad_b).eval(session = sess)
tf.assign(W, value=new_W_value).eval(session = sess)
tf.assign(b, value=new_b_value).eval(session = sess)
print("***********************")
print("loss={0}".format(loss_value))
print("manual_loss_value={0}".format(manual_loss_value))
print("W={0}".format(W.eval(session = sess)))
print("b={0}".format(b.eval(session = sess)))
print("manual_W={0}".format(_W))
print("manual_b={0}".format(_b))
print("grad_W={0}".format(grad_W))
print("grad_b={0}".format(grad_b))
print("manual_grad_W={0}".format(manual_grad_W))
print("manual_grad_b={0}".format(manual_grad_b))
print("***********************")
_W -= lr * manual_grad_W
_b -= lr * manual_grad_b
I've been trying to implement a simple version of normalizing flows with Keras, as explained in this paper: https://arxiv.org/pdf/1505.05770.pdf
My problem is that the loss is always -infinity, and I can't figure out what I did wrong. Can anybody help me?
Here is the procedure:
the encoder generates vectors of size latent_dim = 100. These are z_mean, z_log_var, u, b, w.
From z_mean and z_log_var, using the reparametrization trick I can sample z_0 ~ N(z_mean, z_log_var).
Then I can compute log(abs(1+u.T.dot(psi(z_0))))
Then I can compute z_1
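(For reference, the planar-flow update these steps correspond to, as I understand the paper, is z_1 = z_0 + u * tanh(w.T.dot(z_0) + b), with log q_1(z_1) = log q_0(z_0) - log|1 + u.T.dot(psi(z_0))| where psi(z_0) = (1 - tanh(w.T.dot(z_0) + b)**2) * w.)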
Here is the code for those four steps:
def sampling(args):
z_mean, z_log_var = args
# sample epsilon according to N(O,I)
epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
std=epsilon_std)
# generate z0 according to N(z_mean, z_log_var)
z0 = z_mean + K.exp(z_log_var / 2) * epsilon
print('z0', z0)
return z0
def logdet_loss(args):
z0, w, u, b = args
b2 = K.squeeze(b, 1)
beta = K.sum(tf.multiply(w, z0), 1) # <w|z0>
linear_trans = beta + b2 # <w|z0> + b
# change u2 so that the transformation z0->z1 is invertible
alpha = K.sum(tf.multiply(w, u), 1) #
diag1 = tf.diag(K.softplus(alpha) - 1 - alpha)
u2 = u + K.dot(diag1, w) / K.sum(K.square(w)+1e-7)
gamma = K.sum(tf.multiply(w,u2), 1)
logdet = K.log(K.abs(1 + (1 - K.square(K.tanh(linear_trans)))*gamma) + 1e-6)
return logdet
def transform_z0(args):
z0, w, u, b = args
b2 = K.squeeze(b, 1)
beta = K.sum(tf.multiply(w, z0), 1)
# change u2 so that the transformation z0->z1 is invertible
alpha = K.sum(tf.multiply(w, u), 1)
diag1 = tf.diag(K.softplus(alpha) - 1 - alpha)
u2 = u + K.dot(diag1, w) / K.sum(K.square(w)+1e-7)
diag2 = tf.diag(K.tanh(beta + b2))
# generate z1
z1 = z0 + K.dot(diag2,u2)
return z1
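These functions are wired into the model roughly as follows (a sketch; the encoder producing z_mean, z_log_var, u, w, b and the decoder are omitted, and the variable names are mine):
z0 = Lambda(sampling)([z_mean, z_log_var])
logdet = Lambda(logdet_loss)([z0, w, u, b])
z1 = Lambda(transform_z0)([z0, w, u, b])
x_decoded_mean = decoder(z1)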
Then here is the loss (where logdet is defined above)
def vae_loss(x, x_decoded_mean):
xent_loss = K.mean(objectives.categorical_crossentropy(x, x_decoded_mean), -1)
ln_q0z0 = K.sum(log_normal2(z0, z_mean, z_log_var, eps=1e-6), -1)
ln_pz1 = K.sum(log_stdnormal(z1), -1)
result = K.mean(logdet + ln_pz1 + xent_loss - ln_q0z0)
return result
I modified the Keras tutorial on VAEs here: https://github.com/sbaurdlp/keras-iaf-mnist, if someone is interested in looking.
Strangely, adding more layers doesn't improve performance, and I can't see what is wrong in the code.
As I couldn't make it work, I have tried to implement the normalizing flow described in this paper: Improved Variational Inference with Inverse Autoregressive Flow.
However, I still run into the same problem of the loss diverging (towards -infinity), which makes no sense. There must be a problem with my implementation.
Here are the important parts:
# the encoder
h = encoder_block(x) # a convnet taking proteins as input (matrices of size 400x22), I don't describe it since it isn't very important
z_log_var = Dense(latent_dim)(h)
z_mean = Dense(latent_dim)(h)
h_ = Dense(latent_dim)(h)
encoder = Model(x, [z_mean,z_log_var, h_])
# the latent variables (only one transformation to keep it simple)
latent_input = Input(shape=(latent_dim, 2), batch_shape=(batch_size, latent_dim, 2))
hl = Convolution1D(1, filter_length, activation="relu", border_mode="same")(latent_input)
hl = Reshape((latent_dim,))(hl)
mean_1 = Dense(latent_dim)(hl)
std_1 = Dense(latent_dim)(hl)
latent_model = Model(latent_input, [mean_1, std_1])
# the decoder
decoder_input = Input((latent_dim,), batch_shape=(batch_size, latent_dim))
decoder=decoder_block() # a convnet that I don't describe
x_decoded_mean = decoder(decoder_input)
generator = Model(decoder_input, x_decoded_mean)
# the VAE
z_mean, z_log_var, other = encoder(vae_input)
eps = Lambda(sample_eps, name='sample_eps')([z_mean, z_log_var, other])
z0 = Lambda(sample_z0, name='sample_z0')([z_mean, z_log_var, eps])
l = Lambda(sample_l, name='sample_l')([eps, z_log_var])
mean, std = latent_model(merge([Reshape((latent_dim,1))(z0), Reshape((latent_dim,1))(other)], mode="concat", concat_axis=-1))
z = Lambda(transform_z0)([z0, mean, std])
l = Lambda(transform_l)([l, std])
x_decoded_mean = generator(z)
vae = Model(vae_input, x_decoded_mean)
# and here is the loss
def vae_loss(x, x_decoded_mean):
xent_loss = K.mean(objectives.categorical_crossentropy(x, x_decoded_mean), -1)
ln_q0z0 = K.sum(log_normal2(z0, z_mean, z_log_var), -1)
ln_pz1 = K.sum(log_stdnormal(z), -1)
result = K.mean(l + ln_pz1 + xent_loss - ln_q0z0)
return result
Here are the utility functions I use above in the Lambda layers:
def sample_eps(args):
# sample epsilon according to N(O,I)
epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
std=epsilon_std)
return epsilon
def sample_z0(args):
z_mean, z_log_var, epsilon = args
# generate z0 according to N(z_mean, z_log_var)
z0 = z_mean + K.exp(z_log_var / 2) * epsilon
return z0
def sample_l(args):
epsilon, z_log_var = args
l = -0.5*K.sum(z_log_var + epsilon**2 + K.log(2*math.pi), -1)
return l
def transform_z0(args):
z0, mean, std = args
z = z0
sig_std = K.sigmoid(std)
z *= sig_std
z += (1-sig_std)*mean
return z
def transform_l(args):
l, std = args
sig_std = K.sigmoid(std)
l -= K.sum(K.log(sig_std+1e-8), -1)
return l
I've got a toy example of a linear regression model with one input variable and one output variable. The problem I'm encountering is that the fitted bias ends up far off from the one used to generate the data. If I manually set the bias, then it produces a weight and bias close enough to the originals.
I've written two pieces of code: gen_data, which generates data, and gradientDescent2, which performs the gradient descent algorithm to find the weight and bias.
def gen_data(num_points=50, slope=1, bias=10, x_max=50):
f = lambda z: slope * z + bias
x = np.zeros(shape=(num_points, 1))
y = np.zeros(shape=(num_points, 1))
for i in range(num_points):
x_temp = np.random.uniform()*x_max
x[i] = x_temp
y[i] = f(x_temp) + np.random.normal(scale=3.0)
return (x, y)
# \mathbb{R}^1 with no regularization
def gradientDescent2(x, y, learning_rate=0.0001, epochs=100):
theta = np.random.rand()
bias = np.random.rand()
for i in range(0, epochs):
loss = (theta * x + bias) - y
cost = np.mean(loss**2) / 2
# print('Iteration {} | Cost: {}'.format(i, cost))
grad_b = np.mean(loss)
grad_t = np.mean(loss*x)
# updates
bias -= learning_rate * grad_b
theta -= learning_rate * grad_t
return (theta, bias)
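A minimal driver for these two functions, using the defaults above (my own sketch), looks like:
x, y = gen_data()
theta, bias = gradientDescent2(x, y)
print('theta:', theta, 'bias:', bias)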
1. If you want to use batch updates, don't set your batch size equal to your sample size. (I also believe that batch updating is not very suitable for this case.)
2. Your gradient calculation and parameter update are incorrect; the gradient should be:
grad_b = 1
grad_t = x
For the parameter update, you should always try to minimize the loss, so it should be:
if loss>0:
bias -= learning_rate * grad_b
theta -= learning_rate * grad_t
elif loss< 0:
bias += learning_rate * grad_b
theta += learning_rate * grad_t
Finally, below is the modified code, which works well.
import numpy as np
import sys
def gen_data(num_points=500, slope=1, bias=10, x_max=50):
f = lambda z: slope * z + bias
x = np.zeros(shape=(num_points))
y = np.zeros(shape=(num_points))
for i in range(num_points):
x_temp = np.random.uniform()*x_max
x[i] = x_temp
y[i] = f(x_temp) #+ np.random.normal(scale=3.0)
#print('x:',x[i],' y:',y[i])
return (x, y)
def gradientDescent2(x, y, learning_rate=0.001, epochs=100):
theta = np.random.rand()
bias = np.random.rand()
for i in range(0, epochs):
for j in range(len(x)):
loss = (theta * x[j] + bias) - y[j]
cost = np.mean(loss**2) / 2
# print('Iteration {} | Cost: {}'.format(i, cost))
grad_b = 1
grad_t = x[j]
if loss>0:
bias -= learning_rate * grad_b
theta -= learning_rate * grad_t
elif loss< 0:
bias += learning_rate * grad_b
theta += learning_rate * grad_t
return (theta, bias)
def main():
x,y =gen_data()
ta,bias = gradientDescent2(x,y)
print('theta:',ta)
print('bias:',bias)
if __name__ == '__main__':
sys.exit(int(main() or 0))