This is a simple MLP I am writing for binary image classification, with backpropagation:
class MLP:
def __init__(self, size, epochs = 1000, learning_rate = 1):
self.l1weights = numpy.random.random((size + 1, 3))
self.l2weights = numpy.random.random(3)
self.epochs = epochs
self.learning_rate = learning_rate
def predict(self, _input_):
#Append bias at the beginning of input
l1output = self.sigmoid(numpy.dot(numpy.append([1], _input_), self.l1weights))
l2output = self.sigmoid(numpy.dot(l1output, self.l2weights))
return l1output, l2output
def train(self, training_set, training_goal):
for epoch in range(self.epochs):
l1squared_error = 0
l2squarederror = 0
for set_index in range(training_goal.shape[0]):
set = training_set[set_index]
l1output, l2output = self.predict(set)
l2error = training_goal[set_index] - l2output
l1error = l2error * self.dsigmoid(l2output) * self.l2weights
self.l1weights[0] = self.l1weights[0] + self.learning_rate * l1error
for index in range(len(self.l1weights) - 1):
self.l1weights[index + 1] += self.learning_rate * l1error * self.dsigmoid(l1output)
for index in range(len(self.l2weights)):
self.l2weights[index] += self.learning_rate * l2error * self.dsigmoid(l2output)
l1squared_error += sum(l1error ** 2)
l2squarederror += l2error ** 2
print("Squared error at epoch " + str(epoch) + " : " + str(l1squared_error) + ", " + str(l2squarederror))
def sigmoid(self, _input_):
#Sigmoid sigmoid function
return 1 / (1 + numpy.exp(-_input_))
def dsigmoid(self, _input_):
return _input_ * (1 - _input_)
When run sometimes all output converges into 1 but for some reason the predictions for 0 converge into 0.5 while predictions for 1 stay near 0.75, with error from layer 2 staying the same after ~1000 epochs, if it does relatively more successfully. This is from testing with 2x2 image classification with the code below:
def image_class(input):
return 1 if input >= 2 else 0
training_set = ((numpy.arange(2**4)[:,None] & (1 << numpy.arange(4))) != 0)
training_goals = numpy.array([image_class(sum(i)) for i in training_set])
mlp = MLP(size=4)
mlp.train(training_set, training_goals)
I could solve this by adding a layer right after the output layer with step activation instead of sigmoid and training it separately from the initial network, at least with 2x2 recognition.
Related
I am converting the asymmetric loss function from this paper ASL which is written in pytorch here: ASL github.
The pytorch code looks like this:
class AsymmetricLoss(nn.Module):
def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_torch_grad_focal_loss=True):
super(AsymmetricLoss, self).__init__()
self.gamma_neg = gamma_neg
self.gamma_pos = gamma_pos
self.clip = clip
self.disable_torch_grad_focal_loss = disable_torch_grad_focal_loss
self.eps = eps
def forward(self, x, y):
""""
Parameters
----------
x: input logits
y: targets (multi-label binarized vector)
"""
# Calculating Probabilities
x_sigmoid = torch.sigmoid(x)
xs_pos = x_sigmoid
xs_neg = 1 - x_sigmoid
# Asymmetric Clipping
if self.clip is not None and self.clip > 0:
xs_neg = (xs_neg + self.clip).clamp(max=1)
# Basic CE calculation
los_pos = y * torch.log(xs_pos.clamp(min=self.eps))
los_neg = (1 - y) * torch.log(xs_neg.clamp(min=self.eps))
loss = los_pos + los_neg
# Asymmetric Focusing
if self.gamma_neg > 0 or self.gamma_pos > 0:
if self.disable_torch_grad_focal_loss:
torch.set_grad_enabled(False)
pt0 = xs_pos * y
pt1 = xs_neg * (1 - y) # pt = p if t > 0 else 1-p
pt = pt0 + pt1
one_sided_gamma = self.gamma_pos * y + self.gamma_neg * (1 - y)
one_sided_w = torch.pow(1 - pt, one_sided_gamma)
if self.disable_torch_grad_focal_loss:
torch.set_grad_enabled(True)
loss *= one_sided_w
return -loss.sum()
And what I have changed it to in tensorflow:
class AsymmetricLoss(tf.keras.losses.Loss):
def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_grad_focal_loss=False):
super(AsymmetricLoss, self).__init__()
self.gamma_neg = gamma_neg
self.gamma_pos = gamma_pos
self.clip = clip
self.eps = eps
self.disable_grad_focal_loss = disable_grad_focal_loss
def call(self, y_true, y_pred):
y_pred_sigmoid = tf.sigmoid(y_pred)
y_preds_pos = y_pred_sigmoid
y_preds_neg = 1 - y_pred_sigmoid
y_true = tf.cast(y_true, tf.float32)
if self.clip is not None and self.clip > 0:
y_preds_neg = tf.clip_by_value(y_preds_neg + self.clip,clip_value_max=1, clip_value_min=-np.inf)
print(y_preds_neg.dtype)
print(y_preds_pos.dtype)
print(y_true.dtype)
los_pos = y_true * tf.math.log(tf.clip_by_value(y_preds_pos, clip_value_min=self.eps, clip_value_max=np.inf))
los_neg = (1-y_true) * tf.math.log(tf.clip_by_value(y_preds_neg, clip_value_min=self.eps, clip_value_max=np.inf))
loss = los_pos + los_neg
if self.gamma_neg > 0 or self.gamma_pos > 0:
if self.disable_grad_focal_loss:
#torch.set_set_grad_enable(False)
pass
pt0 = y_preds_pos * y_true
pt1 = y_preds_neg * (1-y_true)
pt = pt0 + pt1
one_sided_gamma = self.gamma_pos * y_true + self.gamma_neg *(1-y_true)
one_sided_w = tf.math.pow(1-pt, one_sided_gamma)
if self.disable_grad_focal_loss:
#torch.set_set_grad_enable(True)
pass
loss *= one_sided_w
y_true = tf.cast(y_true, tf.int64)
return -tf.math.reduce_sum(loss)
There is especially two things I am unsure about.
In the pytorch code the turns on and off the grad with torch.set_grad_enable(bool) which I can not find an equliant in tensorflow (tf.stop_gradient, doesnt seem to be the same). I have also read that the grad is not calculated in the call method so it doesnt matter, but this I am not sure about.
The second thing is the sum in the return. Is so that tensorflow sums up the loss by it self so its not correct to have it in the loss function?
everyone!When I was doing dqn programming, I encountered some problems. This error says
“ Userwarning: Using a target size (torch.Size([32,32])) that is different to the input size (torch.Size([32,1])).This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input,target,reduction=self.reduction)"
And I don't know where the mistake is because I am new to RL . And some of these codes are borrowed from other people's codes, so I don't understand some places.
here are codes:
# hyperparameters
gamma = 0.9
TARGET_REPLACE_ITER = 20
memory_capability = 100
batch_size = 32
learning_rate = 0.001
n_state = 5
n_action = 32
neural network code:
class NN(nn.Module):
def __init__(self, ):
super(NN,self).__init__()
self.fc1 = nn.Linear(n_state, 32)
self.fc1.weight.data.normal_(0, 0.1)
self.fc2 = nn.Linear(32,64)
self.out = nn.Linear(64, n_action)
self.out.weight.data.normal_(0, 0.1)
def forward(self, x):
x = self.fc1(x)
x = F.relu(x)
x = self.fc2(x)
x = F.relu(x)
action_value = self.out(x)
return action_value
agent code:
class Agent(object):
def __init__(self,):
self.learn_step_counter = 0
self.memory = np.zeros((memory_capability, n_state * 2 + 2))
self.memory_cntr = 0
self.eval_net, self.target_net = NN(), NN()
self.loss_func = nn.MSELoss()
self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)
def choose_action(self, state):
state = torch.unsqueeze(torch.FloatTensor(state),0) # state is 1-Dim np.array,shape = (5,)
if random.random() < epsilon:
action = random.randint(0,len(stringlist) - 1)
else:
action_value = self.eval_net.forward(state)
action = torch.max(action_value, 1)[1].numpy()[0]
return action
def learn(self):
if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
self.target_net.load_state_dict(self.eval_net.state_dict())
self.learn_step_counter += 1
sample_index = np.random.choice(memory_capability, batch_size)
b_memory = self.memory[sample_index, :]
b_s = torch.FloatTensor(b_memory[:, :n_state])
b_a = torch.LongTensor(b_memory[:, n_state:n_state + 1].astype(int))
b_r = torch.FloatTensor(b_memory[:, n_state + 1:n_state + 2])
b_s_ = torch.FloatTensor(b_memory[:, -n_state:])
q_eval = self.eval_net(b_s).gather(1, b_a) # shape (batch, 1)
q_next = self.target_net(b_s_).detach()
q_target = b_r + gamma * q_next.max(1)[0] # other people's code said the shape is (batch, 1)=(32,1),but when i ran ,it was (batch,batch)=(32,32),i don't know why
loss = self.loss_func(q_eval, q_target)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
def store_transition(self,state,action,reward,state_):
transition = np.hstack((state,action,reward,state_))
index = self.memory_cntr % memory_capability
self.memory[index,:] = transition
self.memory_cntr += 1
the problem is probably in learn(),but i don't know how to modify.I will appreciate it if someone can help me,thanks a lot
The bug is exactly at the line you pointed out:
q_target = b_r + gamma * q_next.max(1)[0]
Here q_next is of shape [batch_size, n_action], so q_next.max(1)[0] is of shape [batch_size]. We also have b_r with a shape of [batch_size,1]. Now adding those two entities will not throw an error as PyTorch is doing some automatic shape broadcasting. So the fix for this is to reshape b_r to [batch_size] from [batch_size,1] by using b_r.squeeze(1)
I'm trying to implement a neural network from scratch in order to gain better insight about it and I run into a weird problem. When I use Relu function for hidden layers as an activation function, the model did not converge whereas it did converge once sigmoid function is used. Here is my vanilla code: When you change first 2 layers' activation function from relu to sigmoid, you can see that it converges, though it may have a problem sometimes. Where could be the problem? It's been three days and I still couldnt find it, though I found some little bugs. Thanks in advance.
Here is the toy dataset I've been using(just paste it to where this code is located).
Dataset
import numpy as np
import pandas as pd
class NeuralNetwork():
def __init__(self, epoch=10000, alpha=0.075, algorithm="gradient_descent"):
# hyperparameters
self.epoch = epoch
self.alpha = alpha
self.algorithm = algorithm
# parameters
self.params = {}
self.layer_no = 1
# logs
self.cost_vals = []
def createLayer(self, size, activation_func, randomness=True):
if randomness == True:
self.params["W" + str(self.layer_no)] = np.random.randn(size[0], size[1]) * 0.01
else:
self.params["W" + str(self.layer_no)] = np.zeros(size)
self.params["b" + str(self.layer_no)] = np.zeros((size[0], 1))
self.params["func" + str(self.layer_no)] = activation_func
self.layer_no += 1
def sigmoid(self, X):
return 1 / (1 + np.exp(-X))
def relu(self, X):
return np.maximum(X, 0) * 0.01
def tanh(self, X):
return (np.exp(X) - np.exp(-X)) / (np.exp(X) + np.exp(-X))
def derivative_sigmoid(self, X):
der_x = self.sigmoid(X)
return der_x * (1 - der_x)
def derivative_relu(self, X):
X[X<=0] = 0
X[X>0] = 1
return X
def derivative_tanh(self, X):
tanhx = self.tanh(X)
return 1 - np.power(tanhx, 2)
def activation_function(self, Zl, act_func_name):
if act_func_name == "sigmoid":
return self.sigmoid(Zl)
elif act_func_name == "relu":
return self.relu(Zl)
elif act_func_name == "tanh":
return self.tanh(Zl)
def derivative_activation_function(self, Zl, act_func_name):
if act_func_name == "sigmoid":
return self.derivative_sigmoid(Zl)
elif act_func_name == "relu":
return self.derivative_relu(Zl)
elif act_func_name == "tanh":
return self.derivative_tanh(Zl)
def train(self, X, Y):
m = Y.shape[0] # number of training examples
self.params["A0"] = X
self.params["Z0"] = None
for i in range(self.epoch):
# forward prop
for l in range(1, self.layer_no): # 1,2,3
Zl = np.dot(self.params["W" + str(l)], self.params["A" + str(l - 1)]) + self.params["b" + str(l)] # linear function of a layer with vectorization
Al = self.activation_function(Zl, self.params["func" + str(l)]) # activated form of Zl
self.params["Z" + str(l)] = Zl
self.params["A" + str(l)] = Al
# cost function
cost_val = - 1 / m * np.sum(np.multiply(Y, np.log(Al)) + np.multiply((1 - Y), np.log(1 - Al)))
cost_val = np.squeeze(cost_val)
if i % 500 == 0:
print(cost_val)
self.cost_vals.append(cost_val)
# backward prop
dAl = - (np.divide(Y, Al) - np.divide(1 - Y, 1 - Al)) # gradiant of last layer of A
for l in reversed(range(1, self.layer_no)): # 3,2,1
# backward prop
dZl = np.multiply(dAl,
self.derivative_activation_function(self.params["Z" + str(l)], self.params["func" + str(l)])) # gradient of layer l of Z
dAl1 = np.dot(self.params["W" + str(l)].T, dZl) # gradient of previous layer of A
dWl = 1 / m * np.dot(dZl, self.params["A" + str(l - 1)].T) # gradient of parameters W in layer l
dbl = 1 / m * np.sum(dZl, axis=1, keepdims=True) # gradient of parameters b in layer l
# update parameters
self.params["W" + str(l)] -= self.alpha * dWl
self.params["b" + str(l)] -= self.alpha * dbl
dAl = dAl1 # assign gradient of previous layer of A to the current one so as to use it while back-propagation
def iris_data():
from sklearn.model_selection import train_test_split
datas = pd.read_csv('iris_nn.data').to_numpy()
X = datas[:, 0:4].astype(float)
Y = datas[:, 4:5]
Y = np.asarray([1 if (y == 'Iris-setosa') else 0 for y in Y]).reshape((Y.shape[0], 1))
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
return X_train.T, Y_train.T
X, Y = iris_data()
model = NeuralNetwork()
model.createLayer((5,4), "relu")
model.createLayer((7,5), "relu")
model.createLayer((1,7), "sigmoid")
model.train(X,Y)
#
So basically, if the weights, which are initially assigned randomly, are within a range, of about 0.4 to -0.4 for w0 for example, then they will change and the accuracy will improve. However, if the weights are assigned a random number that is outside this range, then they won't be changed at all. I can't figure it out. Any suggestions would be great :)
import numpy as np
from matplotlib import pyplot as plt
class NN_model:
def __init__(self, data):
self.data = data
self.w0 = np.random.randn()
self.w1 = np.random.randn()
self.bias = np.random.randn()
self.trained = False
def sigmoid(self, x):
return 1 / (1 + np.exp(-x))
def sigmoid_p(self, x):
return self.sigmoid(x) * (1 - self.sigmoid(x))
def calculate_loss(self, pred, target):
loss = np.square(target - pred)
return loss
def training_loop(self, data):
costs = []
learning_rate = 0.005
print(self.w0)
if self.trained == True:
print('Model already trained')
pass
else:
for i in range(50):
for i in range(100000):
ri = np.random.randint(len(data))
point = data[ri]
sig_out = ((self.w0 * point[0]) + (self.w1 * point[1]) + self.bias)
pred = self.sigmoid(sig_out)
cost = self.calculate_loss(pred, point[2])
costs.append(cost)
dcost_dpred = 2 * (pred - point[2])
dpred_dsigout = self.sigmoid_p(sig_out)
dsigout_dw0 = point[0]
dsigout_dw1 = point[1]
dsigout_dbias = 1
dcost_dw0 = dcost_dpred * dpred_dsigout * dsigout_dw0
dcost_dw1 = dcost_dpred * dpred_dsigout * dsigout_dw1
dcost_dbias = dcost_dpred * dpred_dsigout * dsigout_dbias
self.w0 += - learning_rate * dcost_dw0
self.w1 += - learning_rate * dcost_dw0
self.bias += - learning_rate * dcost_dbias
print(self.w0, self.w1, self.bias)
#-0.0752623452445784 0.2447376547554179 4.032995041915469
#-0.3042823068224879 0.015717693177505765 18.643149928253827
self.trained = True
plt.plot(costs)
plt.show()
def predict(self, test_data):
if self.trained == True:
pred = self.sigmoid( (test_data[0] * self.w0) + (test_data[1] * self.w1) + self.bias )
print(pred)
if pred > 0.5:
print('Woman')
else:
print('Man')
else:
print('Error: Model has not been trained yet')
You have to have intuitive understanding of how the neural networks work behind the scene. There is no rule of thumb to decide the best values for your weights.
It might be possible that your initial weights pulls your near to accuracy really fast but in your example it might seem that while taking weights in a wide range with learning rate = 0.005, it will take some time until your weights are reduced. Therefore learning rates also play crucial role. Try tweaking that as well.
Here below is a piece of code for realizing a 2-layer neuron network for fitting problem in numpy. The activatin function is ReLU. The training algorithm is Adam. The loss function is half of the mean squared error. However, when the batch size is large(e.g. 10000), the loss will become nan after some iterations. The problem won't happen for small batch size. Could anyone help me explain why this may happen?(data are from matlab workspace:6_final_mapping_pos.mat)
#
import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
data = sio.loadmat('6_final_mapping_pos.mat')
class NeuralNetwork():
def __init__(self):
self.batch_size = 256
self.input_size = 5 # input dimension is 5
self.hidden_layer1_size = 50
self.output_size = 1 # output dimension is 5
self.train_data = data['training_data_pos']
self.df_traindata = pd.DataFrame(data=self.train_data)
self.validation_data_num = 17142
self.valid_data = data['validation_data_pos']
self.df_validdata = pd.DataFrame(data=self.valid_data)
# weight initialization for ReLu
self.W1 = np.random.randn(self.input_size, self.hidden_layer1_size)/ np.sqrt(self.input_size/2)
self.W2 = np.random.randn(self.hidden_layer1_size, self.output_size)/ np.sqrt(self.hidden_layer1_size/2)
#bias initialization
self.b1 = np.zeros((1,self.hidden_layer1_size))
self.b2 = np.zeros((1,self.output_size))
self.lr = 5e-3 # learning rate
self.reg = 1e-3 # regularization strength
self.p = 0.5 # dropout probability = 1-p
self.first_moment_W3=0
self.second_moment_W3=0
self.first_moment_W2=0
self.second_moment_W2=0
self.first_moment_W1=0
self.second_moment_W1=0
self.first_moment_b3=0
self.second_moment_b3=0
self.first_moment_b2=0
self.second_moment_b2=0
self.first_moment_b1=0
self.second_moment_b1=0
def feedforward(self):
### randomly selected mini-batch as inputs
self.df_sample_t = self.df_traindata.sample(n = self.batch_size)
self.train_input = self.df_sample_t.as_matrix(columns=[0,1,2,3,4])
self.train_output = self.df_sample_t.as_matrix(columns=[5])
#hidden layer with dropput technique
self.hidden_layer1 = np.maximum(0, (np.dot(self.train_input, self.W1) + self.b1))
U1= np.random.rand(*self.hidden_layer1.shape) < self.p # drop mask
self.hidden_layer1 *= U1 # drop!
self.output_layer = np.dot(self.hidden_layer1, self.W2) + self.b2
self.data_loss = np.sum(0.5*(self.output_layer-self.train_output)**2) / self.batch_size
self.reg_loss = 0.5*self.reg*np.sum(self.W1*self.W1) + 0.5*self.reg*np.sum(self.W2*self.W2)
self.total_loss = self.data_loss + self.reg_loss
def backpropagation(self):
self.d_output = (self.output_layer-self.train_output)/ self.batch_size
#data part
self.dW2 = np.dot(self.hidden_layer1.T, self.d_output)
self.db2 = np.sum(self.d_output, axis=0, keepdims=True)
self.dhidden1 = np.dot(self.d_output, self.W2.T)
self.dhidden1[self.hidden_layer1<= 0] = 0
self.dW1 = np.dot(self.train_input.T, self.dhidden1)
self.db1 = np.sum(self.dhidden1, axis=0, keepdims=True)
#regularization part
self.dW2 = self.dW2 + self.reg * self.W2
self.dW1 = self.dW1 + self.reg * self.W1
def Adam(self, epoch, dW2, dW1, db2, db1):
beta1 = 0.9
beta2 = 0.99
self.first_moment_W2 = beta1*self.first_moment_W2 + (1-beta1)*dW2
self.second_moment_W2 = beta2*self.second_moment_W2 + (1-beta2)*dW2*dW2
first_unbias_W2 = self.first_moment_W2 /(1-beta1 ** epoch)
second_unbias_W2 = self.second_moment_W2 /(1-beta2 ** epoch)
self.W2 -= self.lr * first_unbias_W2 / (np.sqrt(second_unbias_W2) +1e-7)
self.first_moment_W1 = beta1*self.first_moment_W1 + (1-beta1)*dW1
self.second_moment_W1 = beta2*self.second_moment_W1 + (1-beta2)*dW1*dW1
first_unbias_W1 = self.first_moment_W1 /(1-beta1 ** epoch)
second_unbias_W1 = self.second_moment_W1 /(1-beta2 ** epoch)
self.W1 -= self.lr * first_unbias_W1 / (np.sqrt(second_unbias_W1) +1e-7)
self.first_moment_b2 = beta1*self.first_moment_b2 + (1-beta1)*db2
self.second_moment_b2 = beta2*self.second_moment_b2 + (1-beta2)*db2*db2
first_unbias_b2 = self.first_moment_b2 /(1-beta1 ** epoch)
second_unbias_b2 = self.second_moment_b2 /(1-beta2 ** epoch)
self.b2 -= self.lr * first_unbias_b2 / (np.sqrt(second_unbias_b2) +1e-7)
self.first_moment_b1 = beta1*self.first_moment_b1 + (1-beta1)*db1
self.second_moment_b1 = beta2*self.second_moment_b1 + (1-beta2)*db1*db1
first_unbias_b1 = self.first_moment_b1 /(1-beta1 ** epoch)
second_unbias_b1 = self.second_moment_b1 /(1-beta2 ** epoch)
self.b1 -= self.lr * first_unbias_b1 / (np.sqrt(second_unbias_b1) +1e-7)
def validation(self):
self.df_sample_v = self.df_validdata.sample(n = self.validation_data_num)
self.valid_input = self.df_sample_v.as_matrix(columns=[0,1,2,3,4])
self.valid_output = self.df_sample_v.as_matrix(columns=[5])
self.hidden_layer1 = np.maximum(0, np.dot(self.valid_input, self.W1) + self.b1) *self.p
self.output_layer = np.dot(self.hidden_layer1, self.W2) + self.b2
self.data_loss = np.sum(0.5*(self.output_layer-self.valid_output)**2) / self.validation_data_num
self.reg_loss = 0.5*self.reg*np.sum(self.W1*self.W1) + 0.5*self.reg*np.sum(self.W2*self.W2)
self.total_loss = self.data_loss + self.reg_loss
NN = NeuralNetwork()
num_iterations = 120
training_loss = np.array([])
validation_loss = np.array([])
validation_dataloss = np.array([])
t=1
T=np.array([range(1,num_iterations)]).T
# Training and validation
while(t < num_iterations):
NN.feedforward()
NN.backpropagation()
NN.Adam(t, NN.dW2, NN.dW1, NN.db2, NN.db1)
training_loss = np.append(training_loss, NN.total_loss)
if t % 10 == 0:
print ("training:" + "total loss = %f, data loss = %f, regularization loss = %f" % (NN.total_loss,NN.data_loss,NN.reg_loss))
NN.validation()
validation_loss = np.append(validation_loss, NN.total_loss)
validation_dataloss = np.append(validation_dataloss, NN.data_loss)
if t % 10 == 0:
print ("validation:" + "total loss = %f, data loss = %f, regularization loss = %f" % (NN.total_loss,NN.data_loss,NN.reg_loss))
t+=1