Asymmetric loss function from pytorch to tensorflow - python

I am converting the asymmetric loss function from this paper (ASL), which is written in PyTorch here: ASL github.
The PyTorch code looks like this:
class AsymmetricLoss(nn.Module):
    def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_torch_grad_focal_loss=True):
        super(AsymmetricLoss, self).__init__()
        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos
        self.clip = clip
        self.disable_torch_grad_focal_loss = disable_torch_grad_focal_loss
        self.eps = eps

    def forward(self, x, y):
        """
        Parameters
        ----------
        x: input logits
        y: targets (multi-label binarized vector)
        """
        # Calculating Probabilities
        x_sigmoid = torch.sigmoid(x)
        xs_pos = x_sigmoid
        xs_neg = 1 - x_sigmoid

        # Asymmetric Clipping
        if self.clip is not None and self.clip > 0:
            xs_neg = (xs_neg + self.clip).clamp(max=1)

        # Basic CE calculation
        los_pos = y * torch.log(xs_pos.clamp(min=self.eps))
        los_neg = (1 - y) * torch.log(xs_neg.clamp(min=self.eps))
        loss = los_pos + los_neg

        # Asymmetric Focusing
        if self.gamma_neg > 0 or self.gamma_pos > 0:
            if self.disable_torch_grad_focal_loss:
                torch.set_grad_enabled(False)
            pt0 = xs_pos * y
            pt1 = xs_neg * (1 - y)  # pt = p if t > 0 else 1-p
            pt = pt0 + pt1
            one_sided_gamma = self.gamma_pos * y + self.gamma_neg * (1 - y)
            one_sided_w = torch.pow(1 - pt, one_sided_gamma)
            if self.disable_torch_grad_focal_loss:
                torch.set_grad_enabled(True)
            loss *= one_sided_w

        return -loss.sum()
And here is what I have converted it to in TensorFlow:
import numpy as np
import tensorflow as tf

class AsymmetricLoss(tf.keras.losses.Loss):
    def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_grad_focal_loss=False):
        super(AsymmetricLoss, self).__init__()
        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos
        self.clip = clip
        self.eps = eps
        self.disable_grad_focal_loss = disable_grad_focal_loss

    def call(self, y_true, y_pred):
        y_pred_sigmoid = tf.sigmoid(y_pred)
        y_preds_pos = y_pred_sigmoid
        y_preds_neg = 1 - y_pred_sigmoid
        y_true = tf.cast(y_true, tf.float32)

        if self.clip is not None and self.clip > 0:
            y_preds_neg = tf.clip_by_value(y_preds_neg + self.clip, clip_value_min=-np.inf, clip_value_max=1)

        print(y_preds_neg.dtype)
        print(y_preds_pos.dtype)
        print(y_true.dtype)

        los_pos = y_true * tf.math.log(tf.clip_by_value(y_preds_pos, clip_value_min=self.eps, clip_value_max=np.inf))
        los_neg = (1 - y_true) * tf.math.log(tf.clip_by_value(y_preds_neg, clip_value_min=self.eps, clip_value_max=np.inf))
        loss = los_pos + los_neg

        if self.gamma_neg > 0 or self.gamma_pos > 0:
            if self.disable_grad_focal_loss:
                # torch.set_grad_enabled(False) -- no equivalent used here
                pass
            pt0 = y_preds_pos * y_true
            pt1 = y_preds_neg * (1 - y_true)
            pt = pt0 + pt1
            one_sided_gamma = self.gamma_pos * y_true + self.gamma_neg * (1 - y_true)
            one_sided_w = tf.math.pow(1 - pt, one_sided_gamma)
            if self.disable_grad_focal_loss:
                # torch.set_grad_enabled(True) -- no equivalent used here
                pass
            loss *= one_sided_w

        y_true = tf.cast(y_true, tf.int64)
        return -tf.math.reduce_sum(loss)
There are two things in particular I am unsure about.
The first is that the PyTorch code turns the gradient on and off with torch.set_grad_enabled(bool), for which I cannot find an equivalent in TensorFlow (tf.stop_gradient doesn't seem to be the same thing). I have also read that the gradient is not computed inside the call method, so it wouldn't matter, but I am not sure about this.
The second thing is the sum in the return. Is it the case that TensorFlow sums up the loss by itself, so it is not correct to have the sum in the loss function?
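For context, here is a minimal sketch of what I had in mind for the first point (my own assumption, not code from the ASL repo): using tf.stop_gradient so that only the focusing weight is treated as a constant during backprop, which I believe is the effect of the set_grad_enabled block in the PyTorch version:

import tensorflow as tf

def asymmetric_focusing_weight(y_true, y_preds_pos, y_preds_neg, gamma_pos=1.0, gamma_neg=4.0):
    # pt = p for positive labels, shifted (1 - p) for negative labels
    pt = y_preds_pos * y_true + y_preds_neg * (1 - y_true)
    one_sided_gamma = gamma_pos * y_true + gamma_neg * (1 - y_true)
    one_sided_w = tf.math.pow(1 - pt, one_sided_gamma)
    # Treat the focusing weight as a constant during backprop, analogous to
    # wrapping its computation in torch.set_grad_enabled(False) ... (True)
    return tf.stop_gradient(one_sided_w)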

Related

How can I make the target size equal the input size in my DQN code?

Hi everyone! When I was doing DQN programming, I encountered some problems. The error says:
"UserWarning: Using a target size (torch.Size([32, 32])) that is different to the input size (torch.Size([32, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input, target, reduction=self.reduction)"
I don't know where the mistake is because I am new to RL, and some of this code is borrowed from other people's code, so I don't understand some parts.
Here is the code:
# hyperparameters
gamma = 0.9
TARGET_REPLACE_ITER = 20
memory_capability = 100
batch_size = 32
learning_rate = 0.001
n_state = 5
n_action = 32
neural network code:
class NN(nn.Module):
    def __init__(self):
        super(NN, self).__init__()
        self.fc1 = nn.Linear(n_state, 32)
        self.fc1.weight.data.normal_(0, 0.1)
        self.fc2 = nn.Linear(32, 64)
        self.out = nn.Linear(64, n_action)
        self.out.weight.data.normal_(0, 0.1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        action_value = self.out(x)
        return action_value
agent code:
class Agent(object):
    def __init__(self):
        self.learn_step_counter = 0
        self.memory = np.zeros((memory_capability, n_state * 2 + 2))
        self.memory_cntr = 0
        self.eval_net, self.target_net = NN(), NN()
        self.loss_func = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)

    def choose_action(self, state):
        state = torch.unsqueeze(torch.FloatTensor(state), 0)  # state is a 1-dim np.array, shape = (5,)
        if random.random() < epsilon:
            action = random.randint(0, len(stringlist) - 1)
        else:
            action_value = self.eval_net.forward(state)
            action = torch.max(action_value, 1)[1].numpy()[0]
        return action

    def learn(self):
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1
        sample_index = np.random.choice(memory_capability, batch_size)
        b_memory = self.memory[sample_index, :]
        b_s = torch.FloatTensor(b_memory[:, :n_state])
        b_a = torch.LongTensor(b_memory[:, n_state:n_state + 1].astype(int))
        b_r = torch.FloatTensor(b_memory[:, n_state + 1:n_state + 2])
        b_s_ = torch.FloatTensor(b_memory[:, -n_state:])
        q_eval = self.eval_net(b_s).gather(1, b_a)  # shape (batch, 1)
        q_next = self.target_net(b_s_).detach()
        q_target = b_r + gamma * q_next.max(1)[0]  # other people's code said the shape is (batch, 1) = (32, 1), but when I ran it, it was (batch, batch) = (32, 32); I don't know why
        loss = self.loss_func(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def store_transition(self, state, action, reward, state_):
        transition = np.hstack((state, action, reward, state_))
        index = self.memory_cntr % memory_capability
        self.memory[index, :] = transition
        self.memory_cntr += 1
The problem is probably in learn(), but I don't know how to modify it. I would appreciate it if someone could help me, thanks a lot.
The bug is exactly at the line you pointed out:
q_target = b_r + gamma * q_next.max(1)[0]
Here q_next has shape [batch_size, n_action], so q_next.max(1)[0] has shape [batch_size], while b_r has shape [batch_size, 1]. Adding these two does not throw an error because PyTorch broadcasts them, producing a [batch_size, batch_size] result. The fix is to make the shapes match: either reshape b_r from [batch_size, 1] to [batch_size] with b_r.squeeze(1), or keep everything at [batch_size, 1] by unsqueezing q_next.max(1)[0], so that q_target ends up with the same shape as q_eval.
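A minimal sketch of the unsqueeze variant inside Agent.learn() (assuming the rest of the method stays as posted; these lines replace the ones flagged above):

# Hypothetical rewrite of the relevant lines in learn()
q_eval = self.eval_net(b_s).gather(1, b_a)      # shape [batch_size, 1]
q_next = self.target_net(b_s_).detach()         # shape [batch_size, n_action]
max_q_next = q_next.max(1)[0].unsqueeze(1)      # shape [batch_size, 1]
q_target = b_r + gamma * max_q_next             # shape [batch_size, 1]; no [32, 32] broadcast
loss = self.loss_func(q_eval, q_target)         # input and target sizes now match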

My Standard Neural Network Cost is Going Up

I have been trying to create a simple standard neural network from scratch but I can't seem to get it to work normally. Sometimes the cost skyrockets, other times the cost doesn't even change. I'm not sure what the problem is but it would be really helpful if someone could help me.
I have all of the information on Github. If any more information is needed kindly reply and I will provide it.
https://github.com/enriquedellxps/Neural-Network
Function for generating batches:
def batcher(data, batch_size):
    # get the number of batches
    num_batches_norem = data.shape[1] // batch_size
    if data.shape[1] % batch_size == 0:
        remainder_quantity = 0
    else:
        remainder_size = data.shape[1] % batch_size
        remainder_quantity = 1
    num_batches = num_batches_norem + remainder_quantity
    changer = 0
    for mb in range(num_batches_norem):
        current_batch = data[:, changer:changer + batch_size]
        changer += batch_size
        yield current_batch
    for last_mb in range(remainder_quantity):
        last_batch = data[:, changer:changer + remainder_size]
        yield last_batch
Function for g(z):
def activationer(a, z):
    # ACTIVATION FUNCTIONS
    # Sigmoid Activation Function
    def sigmoid(z):
        g = scipy.special.expit(z)
        return g

    # Tanh (Hyperbolic Tangent) Activation Function
    def tanh(z):
        g = (np.exp(z) - np.exp(-1 * z)) / (np.exp(z) + np.exp(-1 * z))
        return g

    # ReLU (Rectified Linear Unit) Activation Function
    def ReLU(z):
        g = np.maximum(0, z)
        return g

    # Leaky ReLU (Leaky Rectified Linear Unit) Activation Function
    def Leaky_ReLU(z):
        g = np.maximum(0.01 * z, z)
        return g

    # Softmax Activation Function
    def softmax(z):
        z_exp = np.exp(z)
        g = z_exp / np.sum(z_exp, axis=0, keepdims=True)
        return g

    if a == "sigmoid":
        res = sigmoid(z)
    elif a == "tanh":
        res = tanh(z)
    elif a == "relu":
        res = ReLU(z)
    elif a == "leaky relu":
        res = Leaky_ReLU(z)
    elif a == "softmax":
        res = softmax(z)
    return res
NN Class:
class DeepNeuralNetwork:
def __init__(self, n_x, n_h, n_y, nl, activations, alpha):
assert nl == len(activations), f"L: {nl}, Number of Activations: {len(activations)}"
# Assign inputs to the self object
self.n_x = n_x
self.n_h = n_h
self.n_y = n_y
self.nl = nl
self.activations = activations
self.alpha = alpha
# Initialize Parameters
def initialize_parameters(self):
n_x = self.n_x
n_h = self.n_h
n_y = self.n_y
activations = self.activations
parameters = []
for l in range(self.nl):
np.random.seed(8)
if l == 0:
if activations[l] == "relu" or "leaky relu":
parameters.append([np.random.randn(n_h, n_x) * np.sqrt(2 / n_x), np.zeros((n_h, 1))]) # aka W1, b1 | Xavier
else:
parameters.append([np.random.randn(n_h, n_x) * np.sqrt(1 / n_x), np.zeros((n_h, 1))]) # aka W1, b1 | He
elif l == self.nl - 1:
if activations[l] == "relu" or "leaky relu":
parameters.append([np.random.randn(n_y, n_h) * np.sqrt(2 / n_h), np.zeros((n_y, 1))]) # aka WL, bL | Xavier
else:
parameters.append([np.random.randn(n_y, n_h) * np.sqrt(1 / n_h), np.zeros((n_y, 1))]) # aka WL, bL | He
else:
if activations[l] == "relu" or "leaky relu":
parameters.append([np.random.randn(n_h, n_h) * np.sqrt(2 / n_h), np.zeros((n_h, 1))]) # hidden params | Xavier
else:
parameters.append([np.random.randn(n_h, n_h) * np.sqrt(1 / n_h), np.zeros((n_h, 1))]) # hidden params | He
return parameters
    # Forward Propagation
    def forward_propagation(self, parameters, input_data):
        batch_size = input_data.shape[1]  # Get the number of examples in the batch
        caches = []
        self.caches = caches
        current_activation = input_data  # Set the first activation - A0 - to the input
        caches.append(current_activation)
        for l in range(self.nl):
            W, b = parameters[l][0], parameters[l][1]  # Get weights and biases for the current layer
            Z = W @ current_activation + b  # Compute the linear activation
            current_activation = activationer(self.activations[l], Z)  # Compute the full activation
            caches.append(current_activation)
        return current_activation

    # Compute Cost
    def compute_cost(self, yhat, y):
        batch_size = y.shape[1]  # Get the number of examples in the batch
        cost = (1 / batch_size) * np.sum(-1 * (y * np.log(yhat) + ((1 - y) * (np.log(1 - yhat)))))  # Compute the cross-entropy cost
        cost = np.squeeze(cost)  # Turn [[17]] into 17
        return cost

    # Backward Propagation
    def backward_propagation(self, parameters, y):
        caches = self.caches
        batch_size = y.shape[1]
        grads = []
        for l in reversed(range(1, self.nl + 1)):
            if l == self.nl:
                dZ = caches[l] - y
                dW = (1 / batch_size) * dZ @ caches[l - 1].T
                db = (1 / batch_size) * np.sum(dZ, axis=1, keepdims=True)
                grads.append([dW, db])
            else:
                dA = parameters[l][0].T @ dZ
                dZ = dA * np.multiply(caches[l], (1 - caches[l]))
                dW = (1 / batch_size) * dZ @ caches[l - 1].T
                db = (1 / batch_size) * np.sum(dZ, axis=1, keepdims=True)
                grads.append([dW, db])
        return grads

    # Update Parameters
    def update_parameters(self, parameters, gradients):
        for l in range(self.nl):
            parameters[l][0] = parameters[l][0] - self.alpha * gradients[self.nl - l - 1][0]
            parameters[l][1] = parameters[l][1] - self.alpha * gradients[self.nl - l - 1][1]
        return parameters
Running it:
dnn = DeepNeuralNetwork(12288, 20, 1, 4, ["relu", "relu", "relu", "sigmoid"], 0.001)
params = dnn.initialize_parameters()
epochs = 100
for e in range(epochs):
    for i, j in zip(train_x_batched, train_y_batched):
        yhat = dnn.forward_propagation(params, i)
        cost = dnn.compute_cost(yhat, j)
        grads = dnn.backward_propagation(params, j)
        params = dnn.update_parameters(params, grads)
    print(cost)  # This usually starts going down, then skyrockets, even if I lower the learning rate to 0.00001
Thanks :)

Neural Network from scratch does not converge

I'm trying to implement a neural network from scratch in order to gain better insight into it, and I have run into a weird problem. When I use the ReLU function as the activation for the hidden layers, the model does not converge, whereas it does converge once the sigmoid function is used. Here is my vanilla code. When you change the first two layers' activation function from relu to sigmoid, you can see that it converges, though it may still have a problem sometimes. Where could the problem be? It's been three days and I still couldn't find it, though I found some little bugs. Thanks in advance.
Here is the toy dataset I've been using(just paste it to where this code is located).
Dataset
import numpy as np
import pandas as pd

class NeuralNetwork():
    def __init__(self, epoch=10000, alpha=0.075, algorithm="gradient_descent"):
        # hyperparameters
        self.epoch = epoch
        self.alpha = alpha
        self.algorithm = algorithm
        # parameters
        self.params = {}
        self.layer_no = 1
        # logs
        self.cost_vals = []

    def createLayer(self, size, activation_func, randomness=True):
        if randomness == True:
            self.params["W" + str(self.layer_no)] = np.random.randn(size[0], size[1]) * 0.01
        else:
            self.params["W" + str(self.layer_no)] = np.zeros(size)
        self.params["b" + str(self.layer_no)] = np.zeros((size[0], 1))
        self.params["func" + str(self.layer_no)] = activation_func
        self.layer_no += 1

    def sigmoid(self, X):
        return 1 / (1 + np.exp(-X))

    def relu(self, X):
        return np.maximum(X, 0) * 0.01

    def tanh(self, X):
        return (np.exp(X) - np.exp(-X)) / (np.exp(X) + np.exp(-X))

    def derivative_sigmoid(self, X):
        der_x = self.sigmoid(X)
        return der_x * (1 - der_x)

    def derivative_relu(self, X):
        X[X <= 0] = 0
        X[X > 0] = 1
        return X

    def derivative_tanh(self, X):
        tanhx = self.tanh(X)
        return 1 - np.power(tanhx, 2)

    def activation_function(self, Zl, act_func_name):
        if act_func_name == "sigmoid":
            return self.sigmoid(Zl)
        elif act_func_name == "relu":
            return self.relu(Zl)
        elif act_func_name == "tanh":
            return self.tanh(Zl)

    def derivative_activation_function(self, Zl, act_func_name):
        if act_func_name == "sigmoid":
            return self.derivative_sigmoid(Zl)
        elif act_func_name == "relu":
            return self.derivative_relu(Zl)
        elif act_func_name == "tanh":
            return self.derivative_tanh(Zl)

    def train(self, X, Y):
        m = Y.shape[0]  # number of training examples
        self.params["A0"] = X
        self.params["Z0"] = None
        for i in range(self.epoch):
            # forward prop
            for l in range(1, self.layer_no):  # 1, 2, 3
                Zl = np.dot(self.params["W" + str(l)], self.params["A" + str(l - 1)]) + self.params["b" + str(l)]  # linear function of a layer, vectorized
                Al = self.activation_function(Zl, self.params["func" + str(l)])  # activated form of Zl
                self.params["Z" + str(l)] = Zl
                self.params["A" + str(l)] = Al
            # cost function
            cost_val = -1 / m * np.sum(np.multiply(Y, np.log(Al)) + np.multiply((1 - Y), np.log(1 - Al)))
            cost_val = np.squeeze(cost_val)
            if i % 500 == 0:
                print(cost_val)
                self.cost_vals.append(cost_val)
            # backward prop
            dAl = -(np.divide(Y, Al) - np.divide(1 - Y, 1 - Al))  # gradient of the last layer's A
            for l in reversed(range(1, self.layer_no)):  # 3, 2, 1
                dZl = np.multiply(dAl, self.derivative_activation_function(self.params["Z" + str(l)], self.params["func" + str(l)]))  # gradient of layer l's Z
                dAl1 = np.dot(self.params["W" + str(l)].T, dZl)  # gradient of the previous layer's A
                dWl = 1 / m * np.dot(dZl, self.params["A" + str(l - 1)].T)  # gradient of parameters W in layer l
                dbl = 1 / m * np.sum(dZl, axis=1, keepdims=True)  # gradient of parameters b in layer l
                # update parameters
                self.params["W" + str(l)] -= self.alpha * dWl
                self.params["b" + str(l)] -= self.alpha * dbl
                dAl = dAl1  # pass the gradient of the previous layer's A to the next step of back-propagation

def iris_data():
    from sklearn.model_selection import train_test_split
    datas = pd.read_csv('iris_nn.data').to_numpy()
    X = datas[:, 0:4].astype(float)
    Y = datas[:, 4:5]
    Y = np.asarray([1 if (y == 'Iris-setosa') else 0 for y in Y]).reshape((Y.shape[0], 1))
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
    return X_train.T, Y_train.T

X, Y = iris_data()
model = NeuralNetwork()
model.createLayer((5, 4), "relu")
model.createLayer((7, 5), "relu")
model.createLayer((1, 7), "sigmoid")
model.train(X, Y)

XOR converges to 0.5 in Neural Network

Currently I am trying to implement a three-layer network (2 hidden layers and 1 output layer) and have this network classify XOR. I am having an issue where my output layer always converges to around 0.5. I cannot figure out why this is happening and would like some guidance as to why.
import numpy as np
from matplotlib import pyplot as plt

# XOR
x = np.array(([0, 0], [0, 1], [1, 0], [1, 1]), dtype=float)
y = np.array(([0], [1], [1], [0]), dtype=float)

class NN:
    def __init__(self, x, y):
        self.weights1 = np.random.uniform(-0.5, 0.5, (2, 3))
        self.weights2 = np.random.uniform(-0.5, 0.5, (3, 4))
        self.weights3 = np.random.uniform(-0.5, 0.5, (4, 1))
        self.output = np.zeros(1)

    def forward_prop(self, training_data):
        self.layer1 = logistic_function(np.dot(training_data, self.weights1))
        self.layer2 = logistic_function(np.dot(self.layer1, self.weights2))
        self.output = logistic_function(np.dot(self.layer2, self.weights3))
        return self.output

    def back_prop(self, training_data, test_data):
        self.delta = loss_function(test_data, self.output) * logistic_deriv(self.output)
        self.e_weights3 = self.delta.dot(self.weights3.T)
        self.d_weights3 = self.e_weights3 * logistic_deriv(self.layer2)
        self.e_weights2 = self.d_weights3.dot(self.weights2.T)
        self.d_weights2 = self.e_weights2 * logistic_deriv(self.layer1)
        self.e_weights1 = self.d_weights2.dot(self.weights1.T)
        self.d_weights1 = self.e_weights1 * logistic_deriv(training_data)
        self.weights1 -= 0.01 * training_data.T.dot(self.d_weights1)
        self.weights2 -= 0.01 * self.layer1.T.dot(self.d_weights2)
        self.weights3 -= 0.01 * self.layer2.T.dot(self.d_weights3)

# Activation function
def logistic_function(z):
    return 1.0 / (1.0 + np.exp(-z))

# Derivative function
def logistic_deriv(z):
    return logistic_function(z) * (1.0 - logistic_function(z))

# Squared loss function
def loss_function(target_y, output_y):
    loss = target_y - output_y
    return 0.5 * np.power(loss, 2)

network = NN(x, y)
for i in range(1000):
    for j in range(0, len(x)):
        network.forward_prop(x[j])
        network.back_prop(x[j], y[j])

print(network.forward_prop(x[0]))
print(network.forward_prop(x[1]))
print(network.forward_prop(x[2]))
print(network.forward_prop(x[3]))

I have written a neural network model, but its accuracy only increases when the weights take certain values

So basically, if the weights, which are initially assigned randomly, are within a range of about -0.4 to 0.4 for w0, for example, then they will change and the accuracy will improve. However, if the weights are assigned a random number outside this range, then they won't change at all. I can't figure it out. Any suggestions would be great :)
import numpy as np
from matplotlib import pyplot as plt

class NN_model:
    def __init__(self, data):
        self.data = data
        self.w0 = np.random.randn()
        self.w1 = np.random.randn()
        self.bias = np.random.randn()
        self.trained = False

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_p(self, x):
        return self.sigmoid(x) * (1 - self.sigmoid(x))

    def calculate_loss(self, pred, target):
        loss = np.square(target - pred)
        return loss

    def training_loop(self, data):
        costs = []
        learning_rate = 0.005
        print(self.w0)
        if self.trained == True:
            print('Model already trained')
            pass
        else:
            for i in range(50):
                for i in range(100000):
                    ri = np.random.randint(len(data))
                    point = data[ri]
                    sig_out = (self.w0 * point[0]) + (self.w1 * point[1]) + self.bias
                    pred = self.sigmoid(sig_out)
                    cost = self.calculate_loss(pred, point[2])
                    costs.append(cost)
                    dcost_dpred = 2 * (pred - point[2])
                    dpred_dsigout = self.sigmoid_p(sig_out)
                    dsigout_dw0 = point[0]
                    dsigout_dw1 = point[1]
                    dsigout_dbias = 1
                    dcost_dw0 = dcost_dpred * dpred_dsigout * dsigout_dw0
                    dcost_dw1 = dcost_dpred * dpred_dsigout * dsigout_dw1
                    dcost_dbias = dcost_dpred * dpred_dsigout * dsigout_dbias
                    self.w0 += -learning_rate * dcost_dw0
                    self.w1 += -learning_rate * dcost_dw0
                    self.bias += -learning_rate * dcost_dbias
            print(self.w0, self.w1, self.bias)
            # -0.0752623452445784 0.2447376547554179 4.032995041915469
            # -0.3042823068224879 0.015717693177505765 18.643149928253827
            self.trained = True
            plt.plot(costs)
            plt.show()

    def predict(self, test_data):
        if self.trained == True:
            pred = self.sigmoid((test_data[0] * self.w0) + (test_data[1] * self.w1) + self.bias)
            print(pred)
            if pred > 0.5:
                print('Woman')
            else:
                print('Man')
        else:
            print('Error: Model has not been trained yet')
You have to have an intuitive understanding of how neural networks work behind the scenes; there is no rule of thumb for choosing the best initial values for your weights.
It might be that some initial weights pull you toward good accuracy quickly, but in your example, with weights drawn from a wide range and a learning rate of 0.005, it can take a long time before the weights move appreciably. The learning rate therefore plays a crucial role as well; try tweaking it.
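As a purely illustrative tweak (hypothetical values, not tuned for this data), you could draw the initial weights from a narrower range and sweep a few larger learning rates, for example:

import numpy as np

class NNModelScaledInit:
    """Hypothetical variant of NN_model.__init__ with a narrower initial weight range."""
    def __init__(self, data):
        self.data = data
        # Smaller initial weights keep the sigmoid input near 0, where its gradient
        # is largest, so training can move the weights regardless of the random draw.
        self.w0 = np.random.randn() * 0.1
        self.w1 = np.random.randn() * 0.1
        self.bias = np.random.randn() * 0.1
        self.trained = False

# A larger learning rate is also worth sweeping, e.g. 0.01 or 0.05 instead of 0.005.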
